In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyspark as ps
import scipy.stats as stats
import sys
sys.path.append("..")
from src.support_functions import get_covid_data, fixing_datetime, get_zip_income, get_turnstile_data, clean_up_turnstile_data

import folium 
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap

plt.style.use('ggplot')


In [2]:
data_by_zip, tests_by_day, tests_by_boro = get_covid_data()

In [3]:
data_by_zip = data_by_zip.rename(columns={"MODIFIED_ZCTA": "Zip",
                            "NEIGHBORHOOD_NAME": "Neighborhood",
                            "BOROUGH_GROUP": "Borough",
                            "COVID_CASE_COUNT": "Covid_Case_Count",
                            "COVID_CASE_RATE": "Covid_Case_Rate",
                            "POP_DENOMINATOR": "Pop_Denominator",
                            "COVID_DEATH_COUNT": "Covid_Death_Count",
                            "COVID_DEATH_RATE":"Covid_Death_Rate",
                            "PERCENT_POSITIVE":"Percent_Positive",
                            "TOTAL_COVID_TESTS": "Total_Covid_Tests"})

In [4]:
tests_by_day = tests_by_day.rename(columns={"DATE": "Date",
                             "TOTAL_TESTS": "Total_Tests",
                             "POSITIVE_TESTS": "Positive_Tests",
                             "PERCENT_POSITIVE": "Percent_Positive",
                             "TOTAL_TESTS_7DAYS_AVG": "Total_Tests_7Days_AVG",
                             "POSITIVE_TESTS_7DAYS_AVG": "Positive_Tests_7Days_AVG",
                             "PERCENT_POSITIVE_7DAYS_AVG": "Percent_Positive_7Days_AVG",
                             "INCOMPLETE": "Incomplete"})

In [5]:
tests_by_boro = tests_by_boro.rename(columns={"BOROUGH_GROUP": "Borough",
                             "CASE_RATE": "Case_Rate",
                             "HOSPITALIZED_RATE": "Hospitalized_Rate",
                             "DEATH_RATE": "Death_Rate",
                             "CASE_COUNT": "Case_Count",
                             "HOSPITALIZED_COUNT": "Hospitalized_Count",
                             "DEATH_COUNT": "Death_Count"})

In [6]:
median_income = get_zip_income()
med_income = median_income[['median_household_income', 'lat', 'lng', 'population', 'population_density']]
data = data_by_zip[['Zip', 'Neighborhood','Borough', 'Covid_Case_Count', 'Total_Covid_Tests', 'Covid_Death_Count']]
hm_covid = data.join(med_income)
hm_covid

Unnamed: 0,Zip,Neighborhood,Borough,Covid_Case_Count,Total_Covid_Tests,Covid_Death_Count,median_household_income,lat,lng,population,population_density
0,10001,Chelsea/NoMad/West Chelsea,Manhattan,479,10193,28,81671,40.750,-73.990,21102,33959.0
1,10002,Chinatown/Lower East Side,Manhattan,1452,24932,160,33218,40.720,-73.990,81410,92573.0
2,10003,East Village/Gramercy/Greenwich Village,Manhattan,701,25829,35,92540,40.730,-73.990,56024,97188.0
3,10004,Financial District,Manhattan,57,1350,1,129313,40.700,-74.020,3089,5519.0
4,10005,Financial District,Manhattan,120,3228,2,124670,40.705,-74.005,7135,97048.0
...,...,...,...,...,...,...,...,...,...,...,...
172,11691,Edgemere/Far Rockaway,Queens,3136,22996,379,39409,40.600,-73.760,60035,21185.0
173,11692,Arverne/Edgemere,Queens,742,6635,95,43354,40.590,-73.800,18540,18566.0
174,11693,Arverne/Broad Channel,Queens,372,3408,29,50570,40.610,-73.820,11916,11950.0
175,11694,Belle Harbor-Neponsit/Rockaway Park,Queens,833,6892,90,76944,40.580,-73.850,20408,14944.0


In [7]:
tests_by_day = fixing_datetime(tests_by_day)

In [8]:
turnstile2019_df, turnstile2020_df = get_turnstile_data()

In [9]:
turnstile2020_df.head()

Unnamed: 0,C/A,Unit,SCP,Station,Line Name,Division,Date,Time,Description,Entries,Exits
0,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/23/2020,00:00:00,REGULAR,7476781,2544029
1,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/23/2020,04:00:00,REGULAR,7476785,2544030
2,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/23/2020,08:00:00,REGULAR,7476802,2544088
3,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/23/2020,12:00:00,REGULAR,7476847,2544188
4,A002,R051,02-00-00,59 ST,NQR456W,BMT,10/23/2020,16:00:00,REGULAR,7477040,2544233


In [10]:
ts_df_2019 = fixing_datetime(turnstile2019_df)
ts_df_2020 = fixing_datetime(turnstile2020_df)
ts_df_2019 = clean_up_turnstile_data(ts_df_2019)
ts_df_2020 = clean_up_turnstile_data(ts_df_2020)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.sort_values(["C/A", "Unit", "SCP", "Station", "Date", "Time"], inplace=True, ascending=False)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(subset=["C/A", "Unit", "SCP", "Station", "Date", "Time"], inplace=True)


In [11]:
gb2019 = ts_df_2019[['Station', 'SCP', 'Date', 'Entries']].groupby(['Date', 'Station', 'SCP']).agg([min, max])
gb2019.reset_index()

gb2019['Total'] = gb2019[('Entries', 'max')] - gb2019[('Entries', 'min')]
gb2019.drop(gb2019[gb2019['Total'] < 10].index, inplace=True)
gb2019.drop(gb2019[gb2019['Total'] > 10000].index, inplace=True)

sum_per_station_2019 = gb2019.groupby(level=[0, 1]).sum()

sum_per_date_2019 = sum_per_station_2019.groupby(level=[0]).sum()

sum_per_date_2019['week_avg_2019'] = sum_per_date_2019.iloc[:,2].rolling(window=7).mean()

In [12]:
gb2020 = ts_df_2020[['Station', 'SCP', 'Date', 'Entries']].groupby(['Date', 'Station', 'SCP']).agg([min, max])
gb2020.reset_index()

gb2020['Total'] = gb2020[('Entries', 'max')] - gb2020[('Entries', 'min')]
gb2020.drop(gb2020[gb2020['Total'] < 10].index, inplace=True)
gb2020.drop(gb2020[gb2020['Total'] > 10000].index, inplace=True)

sum_per_station_2020 = gb2020.groupby(level=[0, 1]).sum()

sum_per_date_2020 = sum_per_station_2020.groupby(level=[0]).sum()

sum_per_date_2020['week_avg_2020'] = sum_per_date_2020.iloc[:,2].rolling(window=7).mean()

In [13]:
# man_map = folium.Map(location=[40.7831, -73.9712],zoom_start=13.5)
# mc = MarkerCluster()
# for ind,row in man20.iterrows():
#     mc.add_child(folium.CircleMarker(location=[row['latitude'],row['longitude']],
#     radius=1,color='#500cc'))
#     man_map.add_child(mc)
#     man_map

In [14]:
hm_covid

Unnamed: 0,Zip,Neighborhood,Borough,Covid_Case_Count,Total_Covid_Tests,Covid_Death_Count,median_household_income,lat,lng,population,population_density
0,10001,Chelsea/NoMad/West Chelsea,Manhattan,479,10193,28,81671,40.750,-73.990,21102,33959.0
1,10002,Chinatown/Lower East Side,Manhattan,1452,24932,160,33218,40.720,-73.990,81410,92573.0
2,10003,East Village/Gramercy/Greenwich Village,Manhattan,701,25829,35,92540,40.730,-73.990,56024,97188.0
3,10004,Financial District,Manhattan,57,1350,1,129313,40.700,-74.020,3089,5519.0
4,10005,Financial District,Manhattan,120,3228,2,124670,40.705,-74.005,7135,97048.0
...,...,...,...,...,...,...,...,...,...,...,...
172,11691,Edgemere/Far Rockaway,Queens,3136,22996,379,39409,40.600,-73.760,60035,21185.0
173,11692,Arverne/Edgemere,Queens,742,6635,95,43354,40.590,-73.800,18540,18566.0
174,11693,Arverne/Broad Channel,Queens,372,3408,29,50570,40.610,-73.820,11916,11950.0
175,11694,Belle Harbor-Neponsit/Rockaway Park,Queens,833,6892,90,76944,40.580,-73.850,20408,14944.0


In [15]:
riders_per_station_2020 = sum_per_station_2020.groupby(level=[1]).sum()
riders_per_station_2020.reset_index()

Unnamed: 0_level_0,Station,Entries,Entries,Total
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,Unnamed: 4_level_1
0,1 AV,49230503444,49231148450,645006
1,103 ST,77949568,77950994,1426
2,103 ST-CORONA,12711300301,12712528665,1228364
3,104 ST,306083042242,306083107485,65243
4,110 ST,7869804829,7870294809,489980
...,...,...,...,...
371,WOODLAWN,6512111663,6512456697,345034
372,WORLD TRADE CTR,203718548973,203718922343,373370
373,WTC-CORTLANDT,147896523767,147896773421,249654
374,YORK ST,8187567471,8187922847,355376


In [16]:
riders_per_station_2020.drop([('Entries', 'max'), ('Entries', 'min')], axis=1, inplace=True)
riders_per_station_2020.sort_values(by='Total', ascending = False).reset_index()

Unnamed: 0,Station,Total
,,
0,14 ST-UNION SQ,2881147
1,FULTON ST,2396929
2,JKSN HT-ROOSVLT,2395646
3,FLUSHING-MAIN,2330032
4,59 ST COLUMBUS,1978491
...,...,...
371,HOYT ST,18107
372,BROAD CHANNEL,13506
373,PARK PLACE,1668


In [17]:
riders_per_station_2019 = sum_per_station_2019.groupby(level=[1]).sum()
riders_per_station_2019.reset_index()
riders_per_station_2019.drop([('Entries', 'max'), ('Entries', 'min')], axis=1, inplace=True)
riders_per_station_2019.sort_values(by='Total', ascending = False).reset_index()

Unnamed: 0,Station,Total
,,
0,14 ST-UNION SQ,15183707
1,FULTON ST,12399722
2,59 ST COLUMBUS,10628234
3,34 ST-PENN STA,10081790
4,TIMES SQ-42 ST,9871705
...,...,...
373,BROAD CHANNEL,52037
374,ORCHARD BEACH,7649
375,25 ST,5555


In [18]:
hm_covid['Zip'] = hm_covid['Zip'].astype(str)

In [19]:
nyc_map = folium.Map(location=[40.7, -73.9],zoom_start=10)
mc = MarkerCluster()
for ind,row in hm_covid.iterrows():
    mc.add_child(folium.CircleMarker([row['lat'], row['lng']],
                        radius=15,
                        fill_color='#500cc'
                       ))
    nyc_map.add_child(mc)
    
heat_data = [[row['lat'],row['lng']] for index, row in hm_covid.iterrows()]
HeatMap(heat_data).add_to(nyc_map)

<folium.plugins.heat_map.HeatMap at 0x7fbd36f79f50>

In [20]:
nyc_map

In [21]:
ny_geo = '../data/nyc_zip_code_tabulation_areas_polygons.geojson'

In [22]:
!pwd

/Users/thomasjoy/Desktop/galvanize/NYC-MTA-Usage-During-COVID-19/notebooks


In [23]:
nyc_covid = folium.Map(location=[40.7831, -73.90], zoom_start=10, tiles = None)
folium.TileLayer('CartoDB positron', name='Light Map', control=False).add_to(nyc_covid)
nyc_covid.choropleth(
        geo_data = ny_geo,
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
#         fill_opacity = 1.0,
#         line_opacity = 0.8,
        data = hm_covid,
        columns = ['Zip', 'Covid_Case_Count'],
        key_on = 'feature.properties.postalcode',
        legend_name='COVID-19 Case Count'
    )
nyc_covid.save('../img/covid_hotspots.html')



In [24]:
nyc_covid

In [25]:
# nyc_mta_usage = folium.Map(location=[40.7831, -73.90], zoom_start=10, tiles = None)
# folium.TileLayer('CartoDB positron', name='Light Map', control=False).add_to(nyc_mta_usage)
# nyc_mta_usage.choropleth(
#         geo_data = ny_geo,
#         fill_opacity = 1.0,
#         line_opacity = 0.8,
#         data = hm_covid,
#         columns = ['Zip', 'Covid_Case_Count'],
#         key_on = 'feature.properties.postalcode',
# #         threshold_scale=myscale
#     )
# #riders_per_station_2020

In [56]:
hm_covid

Unnamed: 0,Zip,Neighborhood,Borough,Covid_Case_Count,Total_Covid_Tests,Covid_Death_Count,median_household_income,lat,lng,population,population_density,postalcode
0,10001,Chelsea/NoMad/West Chelsea,Manhattan,479,10193,28,81671,40.750,-73.990,21102,33959.0,10001
1,10002,Chinatown/Lower East Side,Manhattan,1452,24932,160,33218,40.720,-73.990,81410,92573.0,10002
2,10003,East Village/Gramercy/Greenwich Village,Manhattan,701,25829,35,92540,40.730,-73.990,56024,97188.0,10003
3,10004,Financial District,Manhattan,57,1350,1,129313,40.700,-74.020,3089,5519.0,10004
4,10005,Financial District,Manhattan,120,3228,2,124670,40.705,-74.005,7135,97048.0,10005
...,...,...,...,...,...,...,...,...,...,...,...,...
172,11691,Edgemere/Far Rockaway,Queens,3136,22996,379,39409,40.600,-73.760,60035,21185.0,11691
173,11692,Arverne/Edgemere,Queens,742,6635,95,43354,40.590,-73.800,18540,18566.0,11692
174,11693,Arverne/Broad Channel,Queens,372,3408,29,50570,40.610,-73.820,11916,11950.0,11693
175,11694,Belle Harbor-Neponsit/Rockaway Park,Queens,833,6892,90,76944,40.580,-73.850,20408,14944.0,11694


In [58]:
hm_covid[hm_covid['Covid_Case_Count'] == hm_covid['Covid_Case_Count'].max()]

Unnamed: 0,Zip,Neighborhood,Borough,Covid_Case_Count,Total_Covid_Tests,Covid_Death_Count,median_household_income,lat,lng,population,population_density,postalcode
139,11368,Corona/North Corona,Queens,5313,36742,448,45964,40.75,-73.85,109931,41768.0,11368


In [59]:
hm_covid[hm_covid['Covid_Case_Count'] == hm_covid['Covid_Case_Count'].min()]

Unnamed: 0,Zip,Neighborhood,Borough,Covid_Case_Count,Total_Covid_Tests,Covid_Death_Count,median_household_income,lat,lng,population,population_density,postalcode
5,10006,Financial District,Manhattan,52,1256,0,119274,40.708,-74.013,3011,32796.0,10006


In [26]:
import json
import pandas as pd
import geopandas as gpd
from bokeh.io import output_notebook, show, output_file
from bokeh.plotting import figure, save
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar
from bokeh.palettes import brewer

In [27]:
#Change the data type and column name to match the geojson file 
hm_covid['ZIPCODE'] = hm_covid['Zip'].astype(int)
hm_covid['ZIPCODE'] = hm_covid['ZIPCODE'].astype(str)
hm_covid.rename(index=str, columns={'ZIPCODE': 'postalcode'}, inplace=True)
#Turn the GeoJson file into a data frame and merge it with df 
#data = 'zipcode.geojson'
gdf = gpd.read_file(ny_geo)
merge = gdf.merge(hm_covid, how='left', on='postalcode')
#Turn the merged data frame back into a json file  
merged_json = json.loads(merge.to_json())
json_data = json.dumps(merged_json)

In [122]:
geosource = GeoJSONDataSource(geojson = json_data)
#set the color palette 
palette = brewer['YlOrRd'][8]
palette = palette[::-1]
color_mapper = LinearColorMapper(palette = palette, low = 52, high = 5313,  nan_color = '#d9d9d9')
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=8,width = 500, height = 20,
border_line_color='black',location = (0,0), orientation ='horizontal')
#Set the size and title of the graph
p = figure(title = 'NYC Coronavirus Case Counts', plot_height = 700 , plot_width = 700, toolbar_location = None, 
          tooltips=[
         ("Neighborhood","@Neighborhood"),
         ("Zip Code","@postalcode"),
         ("Covid-19 Count", "@Covid_Case_Count")])
#Makes it so there are no gird lines
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.patches('xs','ys', source = geosource,fill_color = {'field'     :'Covid_Case_Count', 'transform' : color_mapper},
         line_color = 'black', line_width = 0.25, fill_alpha = 1)
p.add_layout(color_bar, 'below')

output_notebook()



In [123]:
show(p)


In [30]:
# Save the plot by passing the plot -object and output path
save(obj=p, filename='../img/geopandas_covid.html')

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/thomasjoy/Desktop/galvanize/NYC-MTA-Usage-During-COVID-19/img/geopandas_covid.html'

In [120]:
geosource = GeoJSONDataSource(geojson = json_data)
#set the color palette 
palette = brewer['BuGn'][8]
palette = palette[::-1]
color_mapper = LinearColorMapper(palette = palette, low = 10000, high = 150000,  nan_color = '#d9d9d9')
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=8,width = 500, height = 20,
border_line_color='black',location = (0,0), orientation ='horizontal')
#Set the size and title of the graph
p = figure(title = 'NYC Median Income Levels', plot_height = 700 , plot_width = 700, toolbar_location = None, 
          tooltips=[
         ("Neighborhood","@Neighborhood"),
         ("Zip Code","@postalcode"),
         ("Median Income", "@median_household_income")])
#Makes it so there are no gird lines
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.patches('xs','ys', source = geosource,fill_color = {'field'     :'median_household_income', 'transform' : color_mapper},
         line_color = 'black', line_width = 0.25, fill_alpha = 1)
p.add_layout(color_bar, 'below')

output_notebook()



In [121]:
show(p)

In [104]:
save(obj=p, filename='../img/geopandas_income.html')

  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")


'/Users/thomasjoy/Desktop/galvanize/NYC-MTA-Usage-During-COVID-19/img/geopandas_income.html'

In [126]:
geosource = GeoJSONDataSource(geojson = json_data)
#set the color palette 
palette = brewer['RdYlGn'][8]
palette = palette[::-1]
color_mapper = LinearColorMapper(palette = palette, low = 0, high = 40000,  nan_color = '#d9d9d9')
color_bar = ColorBar(color_mapper=color_mapper, label_standoff=8,width = 500, height = 20,
border_line_color='black',location = (0,0), orientation ='horizontal')
#Set the size and title of the graph
p = figure(title = 'NYC Coronavirus Tests Given', plot_height = 700 , plot_width = 700, toolbar_location = None, 
          tooltips=[
         ("Neighborhood","@Neighborhood"),
         ("Zip Code","@postalcode"),
         ("Covid-19 Tests Given", "@Total_Covid_Tests")])
#Makes it so there are no gird lines
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.patches('xs','ys', source = geosource,fill_color = {'field'     :'Total_Covid_Tests', 'transform' : color_mapper},
         line_color = 'black', line_width = 0.25, fill_alpha = 1)
p.add_layout(color_bar, 'below')

output_notebook()



In [127]:
show(p)

In [31]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [32]:
def checker(wrong_options,correct_options):
    names_array=[]
    ratio_array=[]    
    for wrong_option in wrong_options:
        if wrong_option in correct_options:
           names_array.append(wrong_option)
           ratio_array.append('100')
        else:   
            x=process.extractOne(wrong_option,correct_options,scorer=fuzz.token_set_ratio)
            names_array.append(x[0])
            ratio_array.append(x[1])
    return names_array,ratio_array

In [86]:
gdata = gpd.read_file('../data/SubwayStations.geojson')

In [87]:
gdata

Unnamed: 0,name,url,line,objectid,notes,geometry
0,Astor Pl,http://web.mta.info/nyct/service/,4-6-6 Express,1,"4 nights, 6-all times, 6 Express-weekdays AM s...",POINT (-73.99107 40.73005)
1,Canal St,http://web.mta.info/nyct/service/,4-6-6 Express,2,"4 nights, 6-all times, 6 Express-weekdays AM s...",POINT (-74.00019 40.71880)
2,50th St,http://web.mta.info/nyct/service/,1-2,3,"1-all times, 2-nights",POINT (-73.98385 40.76173)
3,Bergen St,http://web.mta.info/nyct/service/,2-3-4,4,"4-nights, 3-all other times, 2-all times",POINT (-73.97500 40.68086)
4,Pennsylvania Ave,http://web.mta.info/nyct/service/,3-4,5,"4-nights, 3-all other times",POINT (-73.89489 40.66471)
...,...,...,...,...,...,...
468,Coney Island - Stillwell Av,http://web.mta.info/nyct/service/,D-F-N-Q,469,"D,F,N,Q-all times",POINT (-73.98124 40.57728)
469,34th St - Hudson Yards,http://web.mta.info/nyct/service/,7-7 Express,470,"7-all times, 7 Express-rush hours AM westbound...",POINT (-74.00220 40.75545)
470,72nd St,http://web.mta.info/nyct/service/,Q,641,Q-all times,POINT (-73.95836 40.76880)
471,86th St,http://web.mta.info/nyct/service/,Q,642,Q-all times,POINT (-73.95177 40.77786)


In [88]:
gdata = gdata.drop(["url", "line", "objectid", "notes"], axis=1, errors="ignore")

In [89]:
gdata

Unnamed: 0,name,geometry
0,Astor Pl,POINT (-73.99107 40.73005)
1,Canal St,POINT (-74.00019 40.71880)
2,50th St,POINT (-73.98385 40.76173)
3,Bergen St,POINT (-73.97500 40.68086)
4,Pennsylvania Ave,POINT (-73.89489 40.66471)
...,...,...
468,Coney Island - Stillwell Av,POINT (-73.98124 40.57728)
469,34th St - Hudson Yards,POINT (-74.00220 40.75545)
470,72nd St,POINT (-73.95836 40.76880)
471,86th St,POINT (-73.95177 40.77786)


In [37]:
riders_per_station_2020 = riders_per_station_2020.reset_index()

In [90]:
str2Match = gdata['name'].fillna('######').tolist()
strOptions = riders_per_station_2020['Station'].fillna('######').tolist()

In [91]:
str2Match[:5]

['Astor Pl', 'Canal St', '50th St', 'Bergen St', 'Pennsylvania Ave']

In [92]:
strOptions[:5]

['1 AV', '103 ST', '103 ST-CORONA', '104 ST', '110 ST']

In [93]:
name_match,ratio_match=checker(str2Match,strOptions)
stations_df = pd.DataFrame()
stations_df['old_names']=pd.Series(str2Match)
stations_df['correct_names']=pd.Series(name_match)
stations_df['correct_ratio']=pd.Series(ratio_match)
#stations_df['rider_count'] = riders_per_station_2020
dfs = riders_per_station_2020.merge(stations_df, how='left', left_on='Station', right_on='correct_names')



In [95]:
dfs = dfs.merge(gdf, how = 'left', left_on = 'old_names', right_on = 'name')

KeyError: 'name'

In [94]:
dfs

Unnamed: 0,"(Station, )","(Total, )",old_names,correct_names,correct_ratio
0,1 AV,645006,,,
1,103 ST,1426,103rd St,103 ST,86.0
2,103 ST,1426,103rd St,103 ST,86.0
3,103 ST,1426,103rd St,103 ST,86.0
4,103 ST-CORONA,1228364,103rd St - Corona Plaza,103 ST-CORONA,82.0
...,...,...,...,...,...
547,WOODLAWN,345034,Woodlawn,WOODLAWN,100.0
548,WORLD TRADE CTR,373370,World Trade Center,WORLD TRADE CTR,91.0
549,WTC-CORTLANDT,249654,,,
550,YORK ST,355376,York St,YORK ST,100.0


In [67]:
dfs.columns = ['Station', 'Total_riders', 'old_names', 'correct_names', 'correct_ratio', 'name', 'geometry']

In [68]:
dfs.columns

Index(['Station', 'Total_riders', 'old_names', 'correct_names',
       'correct_ratio', 'name', 'geometry'],
      dtype='object')

In [69]:
dfs

Unnamed: 0,Station,Total_riders,old_names,correct_names,correct_ratio,name,geometry
0,1 AV,645006,,,,,
1,103 ST,1426,103rd St,103 ST,86.0,103rd St,POINT (-73.96838 40.79945)
2,103 ST,1426,103rd St,103 ST,86.0,103rd St,POINT (-73.96137 40.79606)
3,103 ST,1426,103rd St,103 ST,86.0,103rd St,POINT (-73.94748 40.79060)
4,103 ST,1426,103rd St,103 ST,86.0,103rd St,POINT (-73.96838 40.79945)
...,...,...,...,...,...,...,...
891,WOODLAWN,345034,Woodlawn,WOODLAWN,100.0,Woodlawn,POINT (-73.87875 40.88604)
892,WORLD TRADE CTR,373370,World Trade Center,WORLD TRADE CTR,91.0,World Trade Center,POINT (-74.00974 40.71256)
893,WTC-CORTLANDT,249654,,,,,
894,YORK ST,355376,York St,YORK ST,100.0,York St,POINT (-73.98688 40.69974)


In [70]:
dfs = dfs.drop(["old_names", "correct_names", "correct_ratio"], axis=1, errors="ignore")

In [78]:
dfs = dfs[dfs['geometry'] != None]

In [81]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 817 entries, 1 to 895
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Station       817 non-null    object  
 1   Total_riders  817 non-null    int64   
 2   name          817 non-null    object  
 3   geometry      817 non-null    geometry
dtypes: geometry(1), int64(1), object(2)
memory usage: 71.9+ KB


In [49]:
# #Change the data type and column name to match the geojson file 
# hm_covid['ZIPCODE'] = hm_covid['Zip'].astype(int)
# hm_covid['ZIPCODE'] = hm_covid['ZIPCODE'].astype(str)
# hm_covid.rename(index=str, columns={'ZIPCODE': 'postalcode'}, inplace=True)
# #Turn the GeoJson file into a data frame and merge it with df 
# #data = 'zipcode.geojson'
# gdf = gpd.read_file(ny_geo)
# merge = gdf.merge(hm_covid, how='left', on='postalcode')
# #Turn the merged data frame back into a json file  
# merged_json = json.loads(merge.to_json())
# json_data = json.dumps(merged_json)

In [80]:
m_json = json.loads(dfs.to_json())
j_data = json.dumps(m_json)

OverflowError: Maximum recursion level reached

In [None]:
# geosource = GeoJSONDataSource(geojson = json_data)
# #set the color palette 
# palette = brewer['YlOrRd'][8]
# palette = palette[::-1]
# color_mapper = LinearColorMapper(palette = palette, low = 52, high = 5313,  nan_color = '#d9d9d9')
# color_bar = ColorBar(color_mapper=color_mapper, label_standoff=8,width = 500, height = 20,
# border_line_color='black',location = (0,0), orientation ='horizontal')
# #Set the size and title of the graph
# p = figure(title = 'NYC Coronavirus Case Counts', plot_height = 700 , plot_width = 700, toolbar_location = None, 
#           tooltips=[
#          ("Zip Code","@postalcode"),
#          ("Covid-19 Count", "@Covid_Case_Count")])
# #Makes it so there are no gird lines
# p.xgrid.grid_line_color = None
# p.ygrid.grid_line_color = None
# p.patches('xs','ys', source = geosource,fill_color = {'field'     :'Covid_Case_Count', 'transform' : color_mapper},
#          line_color = 'black', line_width = 0.25, fill_alpha = 1)
# p.add_layout(color_bar, 'below')

# output_notebook()



In [82]:
hm_covid

Unnamed: 0,Zip,Neighborhood,Borough,Covid_Case_Count,Total_Covid_Tests,Covid_Death_Count,median_household_income,lat,lng,population,population_density,postalcode
0,10001,Chelsea/NoMad/West Chelsea,Manhattan,479,10193,28,81671,40.750,-73.990,21102,33959.0,10001
1,10002,Chinatown/Lower East Side,Manhattan,1452,24932,160,33218,40.720,-73.990,81410,92573.0,10002
2,10003,East Village/Gramercy/Greenwich Village,Manhattan,701,25829,35,92540,40.730,-73.990,56024,97188.0,10003
3,10004,Financial District,Manhattan,57,1350,1,129313,40.700,-74.020,3089,5519.0,10004
4,10005,Financial District,Manhattan,120,3228,2,124670,40.705,-74.005,7135,97048.0,10005
...,...,...,...,...,...,...,...,...,...,...,...,...
172,11691,Edgemere/Far Rockaway,Queens,3136,22996,379,39409,40.600,-73.760,60035,21185.0,11691
173,11692,Arverne/Edgemere,Queens,742,6635,95,43354,40.590,-73.800,18540,18566.0,11692
174,11693,Arverne/Broad Channel,Queens,372,3408,29,50570,40.610,-73.820,11916,11950.0,11693
175,11694,Belle Harbor-Neponsit/Rockaway Park,Queens,833,6892,90,76944,40.580,-73.850,20408,14944.0,11694


In [83]:
nyc_income = folium.Map(location=[40.7831, -73.90], zoom_start=10, tiles = None)
folium.TileLayer('CartoDB positron', name='Light Map', control=False).add_to(nyc_covid)
nyc_income.choropleth(
        geo_data = ny_geo,
        fill_color='YlOrRd',
        fill_opacity=0.7,
        line_opacity=0.2,
        data = hm_covid,
        columns = ['Zip', 'median_household_income'],
        key_on = 'feature.properties.postalcode',
        legend_name='Median Household Income'
    )
#nyc_income.save('../img/covid_hotspots.html')



In [84]:
nyc_income

ValueError: The column label 'postalcode' is not unique.