In [1]:
import findspark
findspark.init()

In [52]:
import pyspark
import pandas as pd 
import numpy as np
import os 
import datetime
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import matplotlib.ticker as mticker
import plotly.express as px

from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from math import radians, cos, sin, asin, sqrt
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number, count, when, isnan, abs, cos, atan2, coalesce, avg, lit

from cmath import pi
from math import atan2, radians

from IPython.core.display import HTML
# Stop <pre> blocks in cell output from wrapping, so wide text tables stay one
# line per row.
_no_wrap_css = "<style>pre { white-space: pre !important; }</style>"
display(HTML(_no_wrap_css))

# pandas session options: show every column, and silence the chained-assignment
# (SettingWithCopy) warning.
pd.options.display.max_columns = None
pd.set_option('mode.chained_assignment', None)

In [3]:
# Local-mode Spark session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "48g")
    .config("spark.driver.memory", "48g")
    .config("spark.driver.maxResultSize", "0")  # 0 removes the cap on results collected to the driver
    .config("spark.executor.heartbeatInterval", "100000")
    .getOrCreate()
)

In [4]:
# Root folder of the shared project data.
# Set the W210_DATA_DIR environment variable to point the notebook at a different
# copy of the data; when it is not set, the original Google Drive location is used,
# so existing runs behave exactly as before.
gdrive_path = os.environ.get(
    'W210_DATA_DIR',
    'C:\\Users\\matts\\Google Drive\\berkeley\\.shortcut-targets-by-id\\11wLy1WKwOTcthBs1rpfEzkqax2BZG-6E\\W210_Capstone\\Data\\',
)

In [72]:
# Load the joined schools dataset, treating the first CSV line as the header.
# No schema is supplied, so columns are read as strings and cast later as needed.
joined_csv_path = os.path.join(gdrive_path, 'joined_data/joined_open_schools_only_10-10-22.csv')
df_left = spark.read.csv(joined_csv_path, header=True)

In [73]:
df_left.printSchema()

root
 |-- index: string (nullable = true)
 |-- year: string (nullable = true)
 |-- cdscode: string (nullable = true)
 |-- school_county: string (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_open_date: string (nullable = true)
 |-- school_closed_date: string (nullable = true)
 |-- school_lat: string (nullable = true)
 |-- school_lon: string (nullable = true)
 |-- school_wspd: string (nullable = true)
 |-- school_elevation_m: string (nullable = true)
 |-- year_month: string (nullable = true)
 |-- total_population: string (nullable = true)
 |-- total_population_male: string (nullable = true)
 |-- total_population_female: string (nullable = true)
 |-- population_0_4: string (nullable = true)
 |-- population_0_4_male: string (nullable = true)
 |-- population_0_4_female: string (nullable = true)
 |-- population_5_9: string (nullable = true)
 |-- population_5_9_male: string (nullable = true)
 |-- population_5_9_female: string (nullable = true)
 |-- population_10_14: 

In [74]:
# Cast the coordinate columns (later used as join keys) from string to double.
for _coord_col in ('school_lat', 'school_lon', 'point_source_lat', 'point_source_lon'):
    df_left = df_left.withColumn(_coord_col, col(_coord_col).cast('double'))

# Derive integer year/month join keys from `year_month`.
# NOTE: this replaces the `year` column that was read from the CSV.
# Spark's substr() is 1-based: characters 1-4 are taken as the year and 6-7 as the
# month (assumes `year_month` looks like 'YYYY-MM' — TODO confirm). The previous
# start position of 0 only worked because Spark treats 0 the same as 1.
df_left = df_left.withColumn('year', col('year_month').substr(1, 4).cast('int'))
df_left = df_left.withColumn('month', col('year_month').substr(6, 2).cast('int'))

In [75]:
df_left.printSchema()

root
 |-- index: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- cdscode: string (nullable = true)
 |-- school_county: string (nullable = true)
 |-- school_zip: string (nullable = true)
 |-- school_open_date: string (nullable = true)
 |-- school_closed_date: string (nullable = true)
 |-- school_lat: double (nullable = true)
 |-- school_lon: double (nullable = true)
 |-- school_wspd: string (nullable = true)
 |-- school_elevation_m: string (nullable = true)
 |-- year_month: string (nullable = true)
 |-- total_population: string (nullable = true)
 |-- total_population_male: string (nullable = true)
 |-- total_population_female: string (nullable = true)
 |-- population_0_4: string (nullable = true)
 |-- population_0_4_male: string (nullable = true)
 |-- population_0_4_female: string (nullable = true)
 |-- population_5_9: string (nullable = true)
 |-- population_5_9_male: string (nullable = true)
 |-- population_5_9_female: string (nullable = true)
 |-- population_10_14:

In [57]:
df_left.limit(10).show()

+-----+----+--------------+-------------+----------+----------------+------------------+----------+----------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+--------+----------------+----------------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+----------------+--

In [8]:
# Temperature lookup keyed by school coordinates.
school_temps_path = os.path.join(gdrive_path, 'temperature/schools_temp_lookup.parquet')
df_school_temps = spark.read.parquet(school_temps_path)

In [9]:
df_school_temps.printSchema()

root
 |-- school_lat: double (nullable = true)
 |-- school_lon: double (nullable = true)
 |-- elevation_meters: double (nullable = true)
 |-- time: string (nullable = true)
 |-- avg_temp: double (nullable = true)
 |-- __index_level_0__: long (nullable = true)



In [10]:
# Columns that are not needed for the temperature lookup.
cols_to_drop = ['elevation_meters','__index_level_0__']

# Keep every column except the ones listed above.
df_school_temps = df_school_temps.select(
    [name for name in df_school_temps.columns if name not in cols_to_drop]
)

In [11]:
df_school_temps.show()

+----------+----------+----------+--------+
|school_lat|school_lon|      time|avg_temp|
+----------+----------+----------+--------+
|  37.11521|-120.26603|2006-02-01|     9.9|
|  37.11521|-120.26603|2006-03-01|     9.5|
|  37.11521|-120.26603|2006-04-01|    null|
|  37.11521|-120.26603|2006-05-01|    20.6|
|  37.11521|-120.26603|2006-06-01|    25.3|
|  37.11521|-120.26603|2006-07-01|    28.5|
|  37.11521|-120.26603|2006-08-01|    23.9|
|  37.11521|-120.26603|2006-09-01|    22.1|
|  37.11521|-120.26603|2006-10-01|    15.6|
|  37.11521|-120.26603|2006-11-01|    null|
|  37.11521|-120.26603|2006-12-01|    null|
|  37.11521|-120.26603|2007-01-01|     4.8|
|  37.11521|-120.26603|2007-02-01|    10.1|
|  37.11521|-120.26603|2007-03-01|    null|
|  37.11521|-120.26603|2007-04-01|    null|
|  37.11521|-120.26603|2007-05-01|    20.0|
|  37.11521|-120.26603|2007-06-01|    null|
|  37.11521|-120.26603|2007-07-01|    null|
|  37.11521|-120.26603|2007-08-01|    25.4|
|  37.11521|-120.26603|2007-09-0

In [12]:
# spot check for nulls

df_school_temps.select([count(when( col(c).isNull(), c)).alias(c) for c in df_school_temps.columns]).show()

+----------+----------+----+--------+
|school_lat|school_lon|time|avg_temp|
+----------+----------+----+--------+
|         0|         0|   0|  423883|
+----------+----------+----+--------+



In [13]:
# Keep only rows that have both a timestamp and a temperature reading.
has_time = col('time').isNotNull()
has_temp = col('avg_temp').isNotNull()
df_school_temps = df_school_temps.where(has_time & has_temp)

In [14]:
# spot check for nulls

df_school_temps.select([count(when( col(c).isNull(), c)).alias(c) for c in df_school_temps.columns]).show()

+----------+----------+----+--------+
|school_lat|school_lon|time|avg_temp|
+----------+----------+----+--------+
|         0|         0|   0|       0|
+----------+----------+----+--------+



In [15]:
df_school_temps.count()

2060211

In [16]:
# Give the coordinate columns source-neutral names so this frame can be stacked
# with the point-source temperatures.
df_school_temps = (
    df_school_temps
    .withColumnRenamed("school_lat", "temp_lat")
    .withColumnRenamed("school_lon", "temp_lon")
)

In [17]:
# Remove rows that are identical in every column.
df_school_temps = df_school_temps.distinct()

In [18]:
df_school_temps.count()

1834753

In [19]:
# Temperature lookup keyed by point-source coordinates.
ps_temps_path = os.path.join(gdrive_path, 'temperature/point_poll_sources_temp_lookup.parquet')
df_ps_temps = spark.read.parquet(ps_temps_path)

In [20]:
df_ps_temps.printSchema()

root
 |-- point_source_lat: double (nullable = true)
 |-- point_source_lon: double (nullable = true)
 |-- elevation_meters: double (nullable = true)
 |-- time: string (nullable = true)
 |-- avg_temp: double (nullable = true)
 |-- __index_level_0__: long (nullable = true)



In [21]:
# Columns that are not needed for the temperature lookup.
cols_to_drop = ['elevation_meters','__index_level_0__']

# Keep every column except the ones listed above.
df_ps_temps = df_ps_temps.select(
    [name for name in df_ps_temps.columns if name not in cols_to_drop]
)

In [22]:
df_ps_temps.show()

+----------------+----------------+----------+--------+
|point_source_lat|point_source_lon|      time|avg_temp|
+----------------+----------------+----------+--------+
|         37.1056|       -120.2487|2006-02-01|     9.9|
|         37.1056|       -120.2487|2006-03-01|     9.5|
|         37.1056|       -120.2487|2006-04-01|    null|
|         37.1056|       -120.2487|2006-05-01|    20.6|
|         37.1056|       -120.2487|2006-06-01|    25.3|
|         37.1056|       -120.2487|2006-07-01|    28.4|
|         37.1056|       -120.2487|2006-08-01|    23.8|
|         37.1056|       -120.2487|2006-09-01|    22.1|
|         37.1056|       -120.2487|2006-10-01|    15.5|
|         37.1056|       -120.2487|2006-11-01|    null|
|         37.1056|       -120.2487|2006-12-01|    null|
|         37.1056|       -120.2487|2007-01-01|     4.7|
|         37.1056|       -120.2487|2007-02-01|    10.1|
|         37.1056|       -120.2487|2007-03-01|    null|
|         37.1056|       -120.2487|2007-04-01|  

In [23]:
# spot check for nulls

df_ps_temps.select([count(when( col(c).isNull(), c)).alias(c) for c in df_ps_temps.columns]).show()

+----------------+----------------+----+--------+
|point_source_lat|point_source_lon|time|avg_temp|
+----------------+----------------+----+--------+
|               0|               0|   0|  116377|
+----------------+----------------+----+--------+



In [24]:
# Keep only rows that have both a timestamp and a temperature reading.
has_time = col('time').isNotNull()
has_temp = col('avg_temp').isNotNull()
df_ps_temps = df_ps_temps.where(has_time & has_temp)

In [25]:
# spot check for nulls

df_ps_temps.select([count(when( col(c).isNull(), c)).alias(c) for c in df_ps_temps.columns]).show()

+----------------+----------------+----+--------+
|point_source_lat|point_source_lon|time|avg_temp|
+----------------+----------------+----+--------+
|               0|               0|   0|       0|
+----------------+----------------+----+--------+



In [26]:
df_ps_temps.count()

483895

In [27]:
# Rename the coordinate columns to the shared temp_lat/temp_lon names.
# The previous call targeted "ps_lat"/"ps_lon", which do not exist in this frame
# (its columns are point_source_lat/point_source_lon). withColumnRenamed silently
# does nothing for a missing column, so no rename actually happened.
df_ps_temps = (
    df_ps_temps
    .withColumnRenamed("point_source_lat", "temp_lat")
    .withColumnRenamed("point_source_lon", "temp_lon")
)

In [28]:
# Stack the school and point-source temperature rows into one lookup table.
# DataFrame.union matches columns by position, not by name, and the result takes
# its column names from df_school_temps; both frames must therefore keep the same
# column order (lat, lon, time, avg_temp).
df_temps_merged = df_school_temps.union(df_ps_temps)

In [29]:
df_temps_merged.count()

2318648

In [30]:
# Drop rows that are identical in every column (e.g. the same reading present in
# both the school and the point-source tables).
df_temps_merged = df_temps_merged.distinct()

In [31]:
df_temps_merged.count()

2318648

In [32]:
df_temps_merged.show()

+---------+-----------+----------+--------+
| temp_lat|   temp_lon|      time|avg_temp|
+---------+-----------+----------+--------+
|37.122449| -120.26487|2010-04-01|    12.8|
|37.131113|-120.242025|2018-02-01|     9.3|
|37.116435| -120.26264|2016-07-01|    26.9|
|33.026732|-116.878278|2008-09-01|    21.7|
|33.026732|-116.878278|2016-10-01|    17.5|
|33.026732|-116.878278|2017-10-01|    20.8|
|33.028424| -116.86976|2013-11-01|    13.7|
|33.015334| -116.89097|2014-04-01|    15.0|
|33.032084|-116.957018|2016-02-01|    15.6|
|34.861329|-120.442788|2000-10-01|    14.9|
|34.861329|-120.442788|2002-09-01|    16.8|
|34.861329|-120.442788|2015-07-01|    19.1|
| 34.88976|-120.426345|2006-08-01|    18.2|
| 34.88976|-120.426345|2009-08-01|    17.5|
| 34.88976|-120.426345|2016-02-01|    14.4|
|  34.8621| -120.44016|2003-03-01|    12.5|
|34.867755|-120.430803|2004-12-01|    10.5|
|34.866008| -120.43321|2012-12-01|    10.6|
|34.866008| -120.43321|2015-12-01|    10.1|
|39.164975| -122.90268|2005-07-0

In [33]:
# Split the 'YYYY-MM-DD' `time` string into integer year and month join keys.
# Spark's substr() is 1-based: characters 1-4 are the year and 6-7 the month.
# The previous start position of 0 only worked because Spark treats 0 the same as 1.
df_temps_merged = df_temps_merged.withColumn('year', col('time').cast('string').substr(1, 4).cast('int'))
df_temps_merged = df_temps_merged.withColumn('month', col('time').cast('string').substr(6, 2).cast('int'))

In [57]:
# `time` has been split into `year` and `month`, so the raw string is dropped.
df_temps_merged = df_temps_merged.drop('time')

In [60]:
df_temps_merged.show()

+---------+-----------+--------+----+-----+
| temp_lat|   temp_lon|avg_temp|year|month|
+---------+-----------+--------+----+-----+
|37.122449| -120.26487|    12.8|2010|    4|
|37.131113|-120.242025|     9.3|2018|    2|
|37.116435| -120.26264|    26.9|2016|    7|
|33.026732|-116.878278|    21.7|2008|    9|
|33.026732|-116.878278|    17.5|2016|   10|
|33.026732|-116.878278|    20.8|2017|   10|
|33.028424| -116.86976|    13.7|2013|   11|
|33.015334| -116.89097|    15.0|2014|    4|
|33.032084|-116.957018|    15.6|2016|    2|
|34.861329|-120.442788|    14.9|2000|   10|
|34.861329|-120.442788|    16.8|2002|    9|
|34.861329|-120.442788|    19.1|2015|    7|
| 34.88976|-120.426345|    18.2|2006|    8|
| 34.88976|-120.426345|    17.5|2009|    8|
| 34.88976|-120.426345|    14.4|2016|    2|
|  34.8621| -120.44016|    12.5|2003|    3|
|34.867755|-120.430803|    10.5|2004|   12|
|34.866008| -120.43321|    10.6|2012|   12|
|34.866008| -120.43321|    10.1|2015|   12|
|39.164975| -122.90268|    22.9|

In [69]:
df_temps_merged.dtypes

[('temp_lat', 'double'),
 ('temp_lon', 'double'),
 ('avg_temp', 'double'),
 ('year', 'int'),
 ('month', 'int')]

In [38]:
# Mean temperature per location and calendar month, averaged over all years.
df_temps_lat_lon_month_lookup = (
    df_temps_merged
    .groupBy('temp_lat', 'temp_lon', 'month')
    .agg(F.avg("avg_temp").alias("avg_temp"))
)

In [39]:
df_temps_lat_lon_month_lookup.show()

+---------+-----------+-----+------------------+
| temp_lat|   temp_lon|month|          avg_temp|
+---------+-----------+-----+------------------+
| 34.15203|-118.221966|    1|13.573684210526315|
|34.135447|-118.248937|    9|23.968421052631573|
|33.878056|-118.403684|    6|19.099999999999998|
|33.922703| -118.35969|    6|19.099999999999998|
|33.824775|-118.326426|   11| 16.92105263157895|
| 33.83538|-118.078142|    2|14.084210526315792|
| 33.77985| -118.10496|    4| 17.04736842105263|
|34.241048|-118.481689|    7|              24.4|
|34.022506|-118.365295|   11|16.810526315789474|
|33.158964|-117.327021|    7|20.950000000000003|
|32.745829|  -116.9713|   12| 13.08421052631579|
|32.790456| -117.00419|    6|18.499999999999996|
|32.910433|-117.141714|   12| 13.38421052631579|
|32.754147| -117.07949|   11|16.336842105263155|
|32.849116| -115.57493|    9| 30.99166666666667|
|34.141531| -117.25834|    6|            21.875|
|35.594325| -119.34936|    2|12.342105263157896|
|33.936138| -117.187

In [40]:
df_temps_lat_lon_month_lookup.count()

154922

In [41]:
# Mean temperature per location over every available reading.
df_temps_lat_lon_lookup = (
    df_temps_merged
    .groupBy('temp_lat', 'temp_lon')
    .agg(F.avg("avg_temp").alias("avg_temp"))
)

In [44]:
df_temps_lat_lon_lookup.show()

+---------+-----------+------------------+
| temp_lat|   temp_lon|          avg_temp|
+---------+-----------+------------------+
|34.085101|-118.181626| 18.25482456140351|
|34.091841|-118.299875| 18.55482456140351|
|33.919825|-118.032435|17.900877192982456|
|33.788281|-118.313877| 17.99254385964912|
|33.878233|-118.270776| 17.96798245614035|
|34.212953|-118.506579|18.304385964912278|
|34.018209| -118.34422| 17.49298245614035|
|33.993337|-118.339355| 17.29298245614035|
|32.620116|-116.990621| 16.93464912280702|
|34.098017|-117.421523|18.771311475409835|
|34.169325|-117.303747|18.553278688524586|
|35.279403|-119.008348|19.443781094527363|
|38.000382|-121.740373|16.471304347826088|
|37.556835|-122.000112|15.319819819819822|
|37.640593|-122.109252|14.925877192982457|
|37.369374| -121.83409| 15.50541871921182|
|37.289052|-121.823926|15.505418719211821|
|38.133595| -121.26082|16.629556650246307|
| 38.55841|-121.709187|16.317982456140353|
|38.556253|-121.466108| 16.40964912280702|
+---------+

In [68]:
df_temps_lat_lon_lookup.dtypes

[('temp_lat', 'double'), ('temp_lon', 'double'), ('avg_temp', 'double')]

In [45]:
df_temps_lat_lon_lookup.count()

13586

In [46]:
# Single-row frame holding the mean of every temperature reading.
overall_mean = F.avg("avg_temp").alias("avg_temp")
df_temps_avg_temp = df_temps_merged.agg(overall_mean)

In [92]:
df_temps_avg_temp.collect()[0][0]

16.993707583039793

In [61]:
# Create copies of each lookup whose coordinate columns carry the name of the
# side they will be joined on (school_* or point_source_*), so the joins can use
# plain column-name lists.

def _rename_coords(df, prefix):
    """Return `df` with temp_lat/temp_lon renamed to <prefix>_lat/<prefix>_lon."""
    return (
        df.withColumnRenamed('temp_lat', prefix + '_lat')
          .withColumnRenamed('temp_lon', prefix + '_lon')
    )

df_temps_merged_schools = _rename_coords(df_temps_merged, 'school')
df_temps_merged_ps = _rename_coords(df_temps_merged, 'point_source')

df_temps_lat_lon_month_lookup_schools = _rename_coords(df_temps_lat_lon_month_lookup, 'school')
df_temps_lat_lon_month_lookup_ps = _rename_coords(df_temps_lat_lon_month_lookup, 'point_source')

df_temps_lat_lon_lookup_schools = _rename_coords(df_temps_lat_lon_lookup, 'school')
df_temps_lat_lon_lookup_ps = _rename_coords(df_temps_lat_lon_lookup, 'point_source')

In [70]:
df_temps_merged_schools.show()

+----------+-----------+--------+----+-----+
|school_lat| school_lon|avg_temp|year|month|
+----------+-----------+--------+----+-----+
| 37.122449| -120.26487|    12.8|2010|    4|
| 37.131113|-120.242025|     9.3|2018|    2|
| 37.116435| -120.26264|    26.9|2016|    7|
| 33.026732|-116.878278|    21.7|2008|    9|
| 33.026732|-116.878278|    17.5|2016|   10|
| 33.026732|-116.878278|    20.8|2017|   10|
| 33.028424| -116.86976|    13.7|2013|   11|
| 33.015334| -116.89097|    15.0|2014|    4|
| 33.032084|-116.957018|    15.6|2016|    2|
| 34.861329|-120.442788|    14.9|2000|   10|
| 34.861329|-120.442788|    16.8|2002|    9|
| 34.861329|-120.442788|    19.1|2015|    7|
|  34.88976|-120.426345|    18.2|2006|    8|
|  34.88976|-120.426345|    17.5|2009|    8|
|  34.88976|-120.426345|    14.4|2016|    2|
|   34.8621| -120.44016|    12.5|2003|    3|
| 34.867755|-120.430803|    10.5|2004|   12|
| 34.866008| -120.43321|    10.6|2012|   12|
| 34.866008| -120.43321|    10.1|2015|   12|
| 39.16497

In [76]:
# Step 1 (most specific): match on school coordinates + year + month.
#
# The lookup is first collapsed to one row per join key. df_temps_merged is only
# de-duplicated on the full row, so two readings for the same place and month
# with different temperatures would otherwise make the left join emit extra
# copies of the matching df_left rows. Keys that are already unique are
# unaffected (the mean of a single value is that value).
#
# NOTE(review): the latitude/longitude keys are doubles compared for exact
# equality; rows whose coordinates differ by rounding will not match.
step1_keys = ['school_lat','school_lon','year','month']
step1_lookup = (
    df_temps_merged_schools
    .groupBy(*step1_keys)
    .agg(avg('avg_temp').alias('temp1'))
)
df_left_step1 = df_left.join(step1_lookup, step1_keys, how='left')

In [77]:
df_left_step1.show()

+----------+-----------+----+-----+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+----------------+----------------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

In [80]:
df_left_step1.filter(col('temp1').isNull()).count()

871668

In [85]:
# Step 2: per-location, per-calendar-month mean, attached as `temp2`.
month_lookup_schools = df_temps_lat_lon_month_lookup_schools.withColumnRenamed('avg_temp', 'temp2')
df_left_step2 = df_left_step1.join(
    month_lookup_schools,
    on=['school_lat','school_lon','month'],
    how='left',
)

In [87]:
df_left_step2.filter(col('temp2').isNull()).count()

344524

In [88]:
# Step 3: per-location mean over all readings, attached as `temp3`.
location_lookup_schools = df_temps_lat_lon_lookup_schools.withColumnRenamed('avg_temp', 'temp3')
df_left_step3 = df_left_step2.join(
    location_lookup_schools,
    on=['school_lat','school_lon'],
    how='left',
)

In [90]:
df_left_step3.filter(col('temp3').isNull()).count()

238334

In [93]:
# Step 4: the mean of every temperature reading, added as a constant column
# `temp4` on every row.
overall_mean_temp = df_temps_avg_temp.first()[0]
df_left_step4 = df_left_step3.withColumn("temp4", lit(overall_mean_temp))

In [94]:
df_left_step4.show()

+----------+-----------+-----+----+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+----------------+----------------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

## Coalesce the temp1–temp4 fallback columns into a single school temperature, then repeat the same steps for the point-source temperatures

In [95]:
# Fallback columns, most specific first.
cols = ['temp1','temp2','temp3','temp4']

# For each row, take the first non-null value in that order.
fallback_chain = [col(name) for name in cols]
df_left_schooltemps = df_left_step4.withColumn("school_temperature", F.coalesce(*fallback_chain))

In [96]:
df_left_schooltemps.show()

+----------+-----------+-----+----+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+----------------+----------------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

In [97]:
df_left_schooltemps.filter(col('school_temperature').isNull()).count()

0

In [98]:
# The intermediate fallback columns are no longer needed once
# `school_temperature` exists.
cols_to_drop = ['temp1','temp2','temp3','temp4']

# NOTE: this rebinds `df_left` to the frame that now carries `school_temperature`.
df_left = df_left_schooltemps
for _temp_col in cols_to_drop:
    df_left = df_left.drop(_temp_col)

In [99]:
# Step 1 (most specific): match on point-source coordinates + year + month.
#
# The lookup is first collapsed to one row per join key. df_temps_merged is only
# de-duplicated on the full row, so two readings for the same place and month
# with different temperatures would otherwise make the left join emit extra
# copies of the matching df_left rows. Keys that are already unique are
# unaffected (the mean of a single value is that value).
#
# NOTE(review): the latitude/longitude keys are doubles compared for exact
# equality; rows whose coordinates differ by rounding will not match.
step1_keys = ['point_source_lat','point_source_lon','year','month']
step1_lookup = (
    df_temps_merged_ps
    .groupBy(*step1_keys)
    .agg(avg('avg_temp').alias('temp1'))
)
df_left_step1 = df_left.join(step1_lookup, step1_keys, how='left')

In [101]:
df_left_step1.show()

+----------------+----------------+----+-----+----------+-----------+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

In [100]:
df_left_step1.filter(col('temp1').isNull()).count()

861646

In [102]:
# Step 2: per-location, per-calendar-month mean, attached as `temp2`.
month_lookup_ps = df_temps_lat_lon_month_lookup_ps.withColumnRenamed('avg_temp', 'temp2')
df_left_step2 = df_left_step1.join(
    month_lookup_ps,
    on=['point_source_lat','point_source_lon','month'],
    how='left',
)

In [103]:
df_left_step2.show()

+----------------+----------------+-----+----+----------+-----------+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

In [104]:
df_left_step2.filter(col('temp2').isNull()).count()

332437

In [105]:
# Step 3: per-location mean over all readings, attached as `temp3`.
location_lookup_ps = df_temps_lat_lon_lookup_ps.withColumnRenamed('avg_temp', 'temp3')
df_left_step3 = df_left_step2.join(
    location_lookup_ps,
    on=['point_source_lat','point_source_lon'],
    how='left',
)

In [106]:
df_left_step3.show()

+----------------+----------------+-----+----+----------+-----------+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

In [107]:
df_left_step3.filter(col('temp3').isNull()).count()

228141

In [108]:
# Step 4: the mean of every temperature reading, added as a constant column
# `temp4` on every row.
overall_mean_temp = df_temps_avg_temp.first()[0]
df_left_step4 = df_left_step3.withColumn("temp4", lit(overall_mean_temp))

In [109]:
df_left_step4.show()

+----------------+----------------+-----+----+----------+-----------+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

In [110]:
# Fallback columns, most specific first.
cols = ['temp1','temp2','temp3','temp4']

# For each row, take the first non-null value in that order.
fallback_chain = [col(name) for name in cols]
df_left_alltemps = df_left_step4.withColumn("ps_temperature", F.coalesce(*fallback_chain))

In [112]:
df_left_alltemps.show()

+----------------+----------------+-----+----+----------+-----------+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

In [113]:
df_left_alltemps.filter(col('school_temperature') != col('ps_temperature')).count()

1578330

## Surprisingly, about half of the rows have a school temperature that differs from the point-source temperature

In [114]:
# The intermediate fallback columns are no longer needed once `ps_temperature`
# exists.
cols_to_drop = ['temp1','temp2','temp3','temp4']

# NOTE: this rebinds `df_left` to the frame that carries both temperature columns.
df_left = df_left_alltemps
for _temp_col in cols_to_drop:
    df_left = df_left.drop(_temp_col)

In [115]:
df_left.show()

+----------------+----------------+-----+----+----------+-----------+-----+--------------+---------------+----------+----------------+------------------+------------------+------------------+----------+----------------+---------------------+-----------------------+--------------+-------------------+---------------------+--------------+-------------------+---------------------+----------------+---------------------+-----------------------+----------------+---------------------+-----------------------+----------------+------------------+-----------------+---------+--------------+---------------------+-------------------+-------------------+-----------+-----------+------------------+------------------+------------------+---------------------+------------------+------------------+------------------+-------------------------+----------+----------------------+------------+--------+------------------+-------------------+----------------+--------------------+-------------------------+---------

In [116]:
df_left.filter(col('ps_temperature').isNull()).count()

0

In [117]:
# Persist the enriched dataset as parquet.
# mode('overwrite') lets the notebook be re-run end to end: with the default save
# mode the write raises if the target already exists, which would fail the final
# cell after all the joins have been computed.
# WARNING: any previous output at this path is replaced.
output_path = 'C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\non-push files\\joined_data_with_temperatures_10-16-22.parquet'
df_left.write.mode('overwrite').parquet(output_path)