In [1]:
# findspark.init() must run before any pyspark import further down the notebook.
import findspark
findspark.init()

In [18]:
import pyspark
import pandas as pd 
import numpy as np
import os 
import datetime
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.window import Window
from math import radians, cos, sin, asin, sqrt
from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number

In [3]:
# Local Spark session using every available core ("local[*]").
# Executor and driver memory are both set to 48g; maxResultSize "0" lifts the
# limit on results collected to the driver (see the Spark configuration docs).
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "48g")
    .config("spark.driver.memory", "48g")
    .config("spark.driver.maxResultSize", "0")
    .getOrCreate()
)

In [4]:
# Smoke test: a one-row query to confirm the Spark session is usable.
spark_df = spark.sql('''select 'spark' as hello ''')

In [5]:
# Print the smoke-test result (single column "hello").
spark_df.show()

+-----+
|hello|
+-----+
|spark|
+-----+



In [4]:
# Root folder holding the project data files read and written by this notebook.
# The default is the original author's Google Drive location; set the
# W210_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
gdrive_path = os.environ.get(
    'W210_DATA_DIR',
    'C:\\Users\\matts\\Google Drive\\berkeley\\.shortcut-targets-by-id\\11wLy1WKwOTcthBs1rpfEzkqax2BZG-6E\\W210_Capstone\\Data',
)

In [5]:
# Load the pollution point-source table; the first CSV row is the header.
df_point_sources = spark.read.csv(
    os.path.join(gdrive_path, 'Point source/pollution_point_sources.csv'),
    header=True,
)

In [6]:
# Preview the first five point-source rows.
df_point_sources.limit(5).show()

+-----------+-----------+--------+-----------+------------------+
|checked_lat|checked_lon|zip_code|report_year|PM25_emissions_TPY|
+-----------+-----------+--------+-----------+------------------+
|  33.811466| -117.91555| 92803.0|       2002|       1.787853962|
|  34.088242|-117.470116| 92335.0|       2002|            1.7892|
|  33.911602|-118.281799| 93420.0|       2002|       1.791299781|
|  37.944618|-121.325859| 95203.0|       2002|       1.797499934|
|  39.221817|-121.054955| 95945.0|       2002|1.8015400000000001|
+-----------+-----------+--------+-----------+------------------+



In [7]:
# Load the filtered/joined schools table; the first CSV row is the header.
school_locs = spark.read.csv(
    os.path.join(gdrive_path, 'schools/filtered_joined_schools_data.csv'),
    header=True,
)

In [8]:
# Keep only the school identifier and its coordinates; the other columns are
# not used by the distance computation below.
school_locs = school_locs.select("CDSCode","Latitude","Longitude")

In [9]:
# Preview the first five school rows.
school_locs.limit(5).show()

+-------------+---------+----------+
|      CDSCode| Latitude| Longitude|
+-------------+---------+----------+
|1100170000000|37.658212|-122.09713|
|1100170109835|37.521436|-121.99391|
|1100170112607|37.804520|-122.26815|
|1100170118489|37.868991|-122.27844|
|1100170123968|37.784648|-122.23863|
+-------------+---------+----------+



In [15]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or None
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or None
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float or None
        Distance scaled by a sphere radius of 3959.87433 (the result is stored
        as ``distance_miles`` by the caller), or ``None`` when any coordinate
        is ``None``.
    """
    # When this runs as a Spark UDF, a null column value arrives as None.
    # Return None (null) for that row instead of raising TypeError and
    # failing the whole job.
    if lat1 is None or lon1 is None or lat2 is None or lon2 is None:
        return None

    R = 3959.87433

    dLat = radians(lat2 - lat1)
    dLon = radians(lon2 - lon1)
    lat1 = radians(lat1)
    lat2 = radians(lat2)

    a = sin(dLat/2)**2 + cos(lat1)*cos(lat2)*sin(dLon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for
    # near-antipodal points, which would make asin() raise a domain error.
    # A plain comparison (rather than min/max) leaves NaN untouched.
    if a > 1.0:
        a = 1.0
    c = 2*asin(sqrt(a))

    return R * c

In [16]:
# Wrap haversine as a Spark UDF. The return type is declared as DoubleType so
# the distance comes back as a double column; F.udf defaults to a string
# return type when none is given.
udf_haversine = F.udf(haversine, DoubleType())

In [11]:
# Pair every school with every point-source row (Cartesian product) so a
# distance can be computed for each school/source combination.
schools_pointsources = school_locs.crossJoin(df_point_sources)

# Optional sanity checks on row counts (left disabled):
#school_locs.count()
#df_point_sources.count()
#schools_pointsources.count()

In [12]:
# Preview the first five school/point-source pairs.
schools_pointsources.limit(5).show()

+-------------+---------+----------+-----------+-----------+--------+-----------+------------------+
|      CDSCode| Latitude| Longitude|checked_lat|checked_lon|zip_code|report_year|PM25_emissions_TPY|
+-------------+---------+----------+-----------+-----------+--------+-----------+------------------+
|1100170000000|37.658212|-122.09713|  33.811466| -117.91555| 92803.0|       2002|       1.787853962|
|1100170000000|37.658212|-122.09713|  34.088242|-117.470116| 92335.0|       2002|            1.7892|
|1100170000000|37.658212|-122.09713|  33.911602|-118.281799| 93420.0|       2002|       1.791299781|
|1100170000000|37.658212|-122.09713|  37.944618|-121.325859| 95203.0|       2002|       1.797499934|
|1100170000000|37.658212|-122.09713|  39.221817|-121.054955| 95945.0|       2002|1.8015400000000001|
+-------------+---------+----------+-----------+-----------+--------+-----------+------------------+



In [13]:
# Cast the four coordinate columns to double so they can be used in the
# distance arithmetic. (The CSV readers above pass no schema, so these are
# presumably string columns at this point - confirm.)
for coord_name in ("Latitude", "Longitude", "checked_lat", "checked_lon"):
    schools_pointsources = schools_pointsources.withColumn(
        coord_name, col(coord_name).cast('double')
    )

In [17]:
# Add the school-to-point-source distance for every pair, computed by the
# haversine UDF and stored as a double column.
schools_pointsources = schools_pointsources.withColumn(
    "distance_miles",
    udf_haversine(
        F.col("Latitude"),
        F.col("Longitude"),
        F.col("checked_lat"),
        F.col("checked_lon"),
    ).cast("double"),
)

In [None]:
# Mark the pair table for caching so the UDF distances are not recomputed by
# each of the downstream actions (window, parquet write, filter/count).
schools_pointsources.cache()

In [21]:
# Rank the point sources for each school from nearest to farthest, then keep
# only the top-ranked (nearest) row per school. The helper "row" column is
# dropped again once it has served as the filter.
school_window = (
    Window
    .partitionBy("CDSCode")
    .orderBy(F.asc("distance_miles"))
)

each_school_min = (
    schools_pointsources
    .withColumn("row", F.row_number().over(school_window))
    .where(F.col("row") == 1)
    .drop("row")
)

In [19]:
# Persist the full school/point-source distance table as parquet.
# mode("overwrite") lets the notebook be re-run end to end; the default save
# mode raises an error when the target directory already exists.
schools_pointsources.write.mode("overwrite").parquet('C:\\Users\\matts\\Documents\\Berkeley MIDS\\DataSci 210 Capstone\\all_schools_to_point_source_distances')

In [20]:
# Minimum distance per school only (CDSCode plus min(distance_miles)).
# This is bound to its own name so that, when the notebook runs top to bottom,
# it does not overwrite `each_school_min`, the window-based table that keeps
# the full nearest-source row and is the one exported below.
each_school_min_distance = schools_pointsources.groupBy("CDSCode").min("distance_miles")

In [22]:
# Collect the per-school result to the driver as a pandas DataFrame.
eachschoolmin_pd = each_school_min.toPandas()

In [23]:
# Show the collected per-school table.
display(eachschoolmin_pd)

Unnamed: 0,CDSCode,Latitude,Longitude,checked_lat,checked_lon,zip_code,report_year,PM25_emissions_TPY,distance_miles
0,10101080119628,36.730149,-119.80866,36.728039,-119.792295,93706.0,2005,7.672591507,0.918145
1,10621176116313,36.816708,-119.67297,36.830643,-119.684105,93612.0,2002,2.3047826099999997,1.143254
2,10621660140038,36.808119,-119.80753,36.841000,-119.790300,93721.0,2008,1.5648188,2.464320
3,10767780122473,36.650368,-119.79540,36.677660,-119.752690,93725.0,2008,1.5408536,3.027262
4,11101161130103,39.749587,-122.19749,39.750229,-122.190392,95951.0,2002,16.382483357,0.379767
...,...,...,...,...,...,...,...,...,...
13292,56725386055289,34.182719,-119.17214,34.189600,-119.166020,93030.0,2008,1.5376,0.590410
13293,56725460120634,34.208023,-119.07623,34.213760,-119.094330,0.0,2014,4.2757449838,1.107879
13294,57727106096671,38.658609,-121.79355,38.670731,-121.812576,95695.0,2002,17.408783137,1.325161
13295,7617476003925,37.826924,-122.13063,37.797170,-122.229880,94602.0,2011,1.2367289499999998,5.796192


In [24]:
# Export the per-school nearest point-source lookup into the shared data
# folder. The path is built from gdrive_path (as the CSV readers above do)
# instead of repeating the absolute folder literal, so the data root is
# defined in one place.
eachschoolmin_pd.to_csv(os.path.join(gdrive_path, 'schools', 'school_to_point_lookup.csv'))

In [25]:
# Distance threshold compared against the distance_miles column below.
school_distance_to_point_source_max = 5

In [26]:
# Keep only the school/point-source pairs whose distance is strictly below
# the configured threshold.
point_sources_within_5miles_schools = schools_pointsources.where(
    F.col("distance_miles") < school_distance_to_point_source_max
)

In [27]:
# Count the remaining pair rows per school.
# NOTE(review): this counts rows, not distinct sources; if the same location
# appears under several report_year values it is counted once per row -
# confirm that is intended.
count_point_sources_within_5miles_schools = point_sources_within_5miles_schools.groupBy("CDSCode").count()

In [28]:
# Collect the per-school counts to the driver as a pandas DataFrame.
school_5miles_pointsource = count_point_sources_within_5miles_schools.toPandas()

In [29]:
# Export the per-school counts into the shared data folder. The path is built
# from gdrive_path (as the CSV readers above do) instead of repeating the
# absolute folder literal, so the data root is defined in one place.
school_5miles_pointsource.to_csv(os.path.join(gdrive_path, 'schools', 'pointsources_within_5miles_by_school.csv'))