# Notebook Plan

This notebook creates all necessary files to run the d3 map visualization seen on our website. 
https://sarahjune1.github.io/CUSP_innovation_capstone/map.html

1. Read in geoJSON of all city shapes
2. Create summary file and city stats file
3. Write a GeoJSON with city locations only (i.e., drop all stats)

In [2]:
import json

import geopandas as gpd
import pandas as pd
from fiona.crs import from_epsg

## 1. Read in city geoJSON file 

In [3]:
# Load the city boundary shapes. Each row carries a `city_state` key and a
# `geometry`; the key is what the stats table is joined on further down.
patent_cities = gpd.read_file('patent_cities.geojson')

In [4]:
# Preview the first rows to check the columns that were read (bbox, names, city_state, geometry).
patent_cities.head()

Unnamed: 0,bbox_east,bbox_north,bbox_south,bbox_west,place_name,comma_count,county_flag,city,state,name,postal,fips,city_state,geometry
0,-73.700181,40.916179,40.477399,-74.25909,"New York City, New York, USA",2,no_flag,newyork,NewYork,NewYork,ny,36,newyork_ny,"(POLYGON ((-74.25909 40.497207, -74.259089 40...."
1,-73.694626,41.165357,41.100336,-73.733743,"Armonk, Town of North Castle, Westchester Coun...",5,no_flag,armonk,NewYork,NewYork,ny,36,armonk_ny,"POLYGON ((-73.733743 41.118404, -73.733743 41...."
2,-73.823639,41.024019,41.000791,-73.853932,"Ardsley, Town of Greenburgh, Westchester Count...",5,no_flag,ardsley,NewYork,NewYork,ny,36,ardsley_ny,"POLYGON ((-73.853932 41.005698, -73.8539070000..."
3,-73.893042,42.843668,42.763577,-73.983475,"Schenectady, Schenectady County, New York, USA",3,no_flag,schenectady,NewYork,NewYork,ny,36,schenectady_ny,"POLYGON ((-73.9834749 42.8015698, -73.98047 42..."
4,-77.029932,42.164366,42.131597,-77.075799,"Corning, Steuben County, New York, USA",3,no_flag,corning,NewYork,NewYork,ny,36,corning_ny,"POLYGON ((-77.075799 42.158076, -77.075709 42...."


In [5]:
# (rows, columns) of the boundary table.
patent_cities.shape

(2496, 14)

## 2. Create summary csv file and city stats JSON

In [6]:
# Load the per-city, per-year stats table, discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv), and keep a single row for
# each (city_state, Year) pair.
df_stats = (
    pd.read_csv('pre_scaling_data.csv')
    .drop(['Unnamed: 0'], axis=1)
    .drop_duplicates(subset=['city_state', 'Year'])
)

In [7]:
# Keep only the cities that appear in more than 11 rows of df_stats. Because
# df_stats holds one row per (city_state, Year) pair, that means at least 12
# distinct years of data.
#
# The counts Series is filtered directly instead of going through
# reset_index(): the column names reset_index() produces for a value_counts()
# result differ between pandas versions, so selecting them by name
# ('index' / 'city_state') is not portable.
city_counts = df_stats.city_state.value_counts()
cities = list(city_counts[city_counts > 11].index)

In [8]:
# Stats rows for the retained cities only.
keep_mask = df_stats['city_state'].isin(cities)
cities_to_keep = df_stats[keep_mask]
cities_to_keep.head()

Unnamed: 0,Award Mean,Award Sum,Company Count,Year,city_state,creative_employees,creative_establishments,empowerment_zone,median_household_income,performance_amount,...,percent_creative_class,scaled_perc_creative_class,creative_establishment_ratio,company_count_perc,percent_bachelors,percent_graduate,percent_foreign_born,scaled_population,recipient_mean,performance_mean
0,128725.714286,901080.0,7.0,2001,santaclara_ca,2222.0,188.0,0,69466.0,8192436.0,...,0.013904,0.016659,0.02806,0.001045,0.241177,0.146017,0.348707,11.536261,199534.3,4096218.0
1,166188.029412,5650393.0,34.0,2001,houston_tx,45344.0,11568.0,1,36616.0,401235700.0,...,0.015419,0.022965,0.050016,0.000147,0.15704,0.082908,0.264177,14.4852,629998.1,2994297.0
2,238464.416667,5723146.0,24.0,2001,sanjose_ca,10299.0,1751.0,0,70243.0,207137.3,...,0.023521,0.030553,0.04838,0.000663,0.191269,0.094006,0.368467,13.704515,397788.5,20713.73
3,253001.5,7590045.0,30.0,2001,sunnyvale_ca,1416.0,176.0,0,74409.0,60854710.0,...,0.015296,0.017492,0.029033,0.004949,0.278218,0.203854,0.394581,11.788737,1432841.0,6761635.0
4,175075.826087,4026744.0,23.0,2001,newyork_ny,309161.0,25930.0,1,38293.0,2300854.0,...,0.039986,0.063421,0.035639,3.2e-05,0.153664,0.102363,0.358508,15.895986,208637.6,18706.13


In [9]:
# Sanity check: rows per retained city (each should have more than 11).
cities_to_keep.city_state.value_counts()

maplegrove_mn           12
evansville_in           12
kenosha_wi              12
watsonville_ca          12
columbia_mo             12
sancarlos_ca            12
rockville_md            12
irvine_ca               12
malibu_ca               12
alpharetta_ga           12
seattle_wa              12
monrovia_ca             12
fortcollins_co          12
milpitas_ca             12
aurora_il               12
newhaven_ct             12
henderson_nv            12
rockford_il             12
raleigh_nc              12
albuquerque_nm          12
warsaw_in               12
campbell_ca             12
emeryville_ca           12
melbourne_fl            12
stpaul_mn               12
fallschurch_va          12
tampa_fl                12
franklin_tn             12
menlopark_ca            12
norfolk_va              12
                        ..
chulavista_ca           12
livonia_mi              12
dallas_tx               12
santaclara_ca           12
loveland_co             12
pawtucket_ri            12
s

In [10]:
# Load the patent table, keep the years strictly between 2000 and 2013, and
# discard the leftover 'Unnamed' columns.
all_data = pd.read_csv('patent_data_all_years.csv')
in_window = (all_data.Year > 2000) & (all_data.Year < 2013)
all_data = all_data[in_window].drop(
    ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], axis=1
)
all_data.head()

Unnamed: 0,City,Inv_to_Assignee_ratio,Patents,State,Year,assignee_IPC_A,assignee_IPC_B,assignee_IPC_C,assignee_IPC_D,assignee_IPC_E,...,pt_statutory invention registration,pt_utility,Rank,assignee_patent_annual_sum,inventor_patent_annual_sum,inventor_patents_perc,assignee_patents_perc,inventor_rank,assignee_pats_citations_normalized,inventor_pats_citations_normalized
25000,Santa Clara,0.276342,4603,CA,2001,0.02866,0.033437,0.171443,0.004984,0.000727,...,0.000217,0.987617,0,98990,121057,0.010507,0.0465,11,0.056873,0.008476
25001,Armonk,0.00472,4237,NY,2001,0.004449,0.029543,0.010856,0.001602,0.000712,...,0.0,0.99764,1,98990,121057,0.000165,0.042802,723,0.019063,0.000357
25002,Houston,0.342191,3989,TX,2001,0.01882,0.108391,0.087732,0.009622,0.260648,...,0.000251,0.990474,2,98990,121057,0.011276,0.040297,10,0.050546,0.020757
25003,San Jose,1.772214,2656,CA,2001,0.008116,0.023188,0.014493,0.005797,0.0,...,0.0,0.990211,3,98990,121057,0.038883,0.026831,0,0.022082,0.036183
25004,Boise,0.848334,2011,ID,2001,0.003608,0.102391,0.032927,0.000902,0.000451,...,0.0,0.996519,4,98990,121057,0.014093,0.020315,7,0.009612,0.005238


In [11]:
# Cities with more than 11 rows in the year-filtered patent table.
#
# The counts Series is filtered directly instead of going through
# reset_index(): the column names reset_index() produces for a value_counts()
# result differ between pandas versions, so selecting them by name
# ('index' / 'city_state') is not portable.
patent_row_counts = all_data.city_state.value_counts()
city_vals = list(patent_row_counts[patent_row_counts > 11].index)
len(city_vals)

550

In [12]:
# Restrict the patent table to the cities listed in city_vals, then show the
# remaining row count per city.
in_city_vals = all_data.city_state.isin(city_vals)
all_data = all_data[in_city_vals]
all_data.city_state.value_counts()

('Newark', 'DE')              12
('Palo Alto', 'CA')           12
('Farmington Hills', 'MI')    12
('New York', 'NY')            12
('Vienna', 'VA')              12
('Northbrook', 'IL')          12
('San Marcos', 'CA')          12
('South Gate', 'CA')          12
('Zeeland', 'MI')             12
('Tokyo', None)               12
('West Chester', 'OH')        12
('Staten Island', 'NY')       12
('Knoxville', 'TN')           12
('San Clemente', 'CA')        12
('Logan', 'UT')               12
('Urbana', 'IL')              12
('Latham', 'NY')              12
('Waukesha', 'WI')            12
('Tustin', 'CA')              12
('Akron', 'OH')               12
('Latrobe', 'PA')             12
('Rancho Dominguez', 'CA')    12
('Glen Allen', 'VA')          12
('Naples', 'FL')              12
('Minneapolis', 'MN')         12
('Tallahassee', 'FL')         12
('Elkhart', 'IN')             12
('Glenview', 'IL')            12
('Islandia', 'NY')            12
('Aurora', 'IL')              12
          

In [13]:
# Keep only the columns needed for the summary and discard rows that lack a
# City or a State.
summary_columns = ['City', 'State', 'Year', 'Patents', 'assignee_pats_cited',
                   'inventor_patents', 'inventor_pats_cited']
all_data = all_data[summary_columns].dropna(subset=['City', 'State'])

In [14]:
# Display label first, built from City/State as they appear in the data.
all_data['city_for_viewing'] = all_data['City'] + ', ' + all_data['State']

# Join key: lower-cased city with spaces removed, '_' and lower-cased state.
city_key = all_data['City'].map(lambda name: name.lower().replace(' ', ''))
state_key = all_data['State'].map(lambda abbrev: abbrev.lower())
all_data['city_state'] = city_key + '_' + state_key

# The raw City/State columns are no longer needed once the key exists.
all_data = all_data.drop(['City', 'State'], axis=1)

In [15]:
# The two citation-count columns are not carried into the summary.
all_data = all_data.drop(['assignee_pats_cited', 'inventor_pats_cited'], axis=1)

In [16]:
# Attach the patent columns to the stats rows; only (Year, city_state) pairs
# present in both tables survive the inner join.
cities_to_keep = pd.merge(
    cities_to_keep,
    all_data,
    how='inner',
    on=['Year', 'city_state'],
)

In [17]:
# Preview the merged table; the patent columns and city_for_viewing now sit at the end.
cities_to_keep.head()

Unnamed: 0,Award Mean,Award Sum,Company Count,Year,city_state,creative_employees,creative_establishments,empowerment_zone,median_household_income,performance_amount,...,company_count_perc,percent_bachelors,percent_graduate,percent_foreign_born,scaled_population,recipient_mean,performance_mean,Patents,inventor_patents,city_for_viewing
0,128725.714286,901080.0,7.0,2001,santaclara_ca,2222.0,188.0,0,69466.0,8192436.0,...,0.001045,0.241177,0.146017,0.348707,11.536261,199534.3,4096218.0,4603,1272,"Santa Clara, CA"
1,166188.029412,5650393.0,34.0,2001,houston_tx,45344.0,11568.0,1,36616.0,401235700.0,...,0.000147,0.15704,0.082908,0.264177,14.4852,629998.1,2994297.0,3989,1365,"Houston, TX"
2,238464.416667,5723146.0,24.0,2001,sanjose_ca,10299.0,1751.0,0,70243.0,207137.3,...,0.000663,0.191269,0.094006,0.368467,13.704515,397788.5,20713.73,2656,4707,"San Jose, CA"
3,253001.5,7590045.0,30.0,2001,sunnyvale_ca,1416.0,176.0,0,74409.0,60854710.0,...,0.004949,0.278218,0.203854,0.394581,11.788737,1432841.0,6761635.0,2000,2581,"Sunnyvale, CA"
4,175075.826087,4026744.0,23.0,2001,newyork_ny,309161.0,25930.0,1,38293.0,2300854.0,...,3.2e-05,0.153664,0.102363,0.358508,15.895986,208637.6,18706.13,1806,1110,"New York, NY"


In [18]:
# Full column list of the merged summary table.
cities_to_keep.columns

Index([u'Award Mean', u'Award Sum', u'Company Count', u'Year', u'city_state',
       u'creative_employees', u'creative_establishments', u'empowerment_zone',
       u'median_household_income', u'performance_amount', u'performance_count',
       u'recipient_amount', u'recipient_count', u'regular_employees',
       u'regular_establishments', u'total_earned_bachelor',
       u'total_earned_graduate_degree', u'total_foreign',
       u'total_less_than_bachelor', u'total_native', u'total_population',
       u'Score_invented', u'Score_assigned', u'percent_creative_class',
       u'scaled_perc_creative_class', u'creative_establishment_ratio',
       u'company_count_perc', u'percent_bachelors', u'percent_graduate',
       u'percent_foreign_born', u'scaled_population', u'recipient_mean',
       u'performance_mean', u'Patents', u'inventor_patents',
       u'city_for_viewing'],
      dtype='object')

In [30]:
# Write the merged per-city, per-year summary table to CSV.
# NOTE(review): to_csv writes the DataFrame index by default, so the file gets
# an extra unnamed leading column; confirm the consumer expects that.
cities_to_keep.to_csv('csv_for_summary.csv')

#### Create city_stats.json

In [19]:
# Build a nested lookup {year: {city_state: {column: value}}} from the merged
# summary table.
df_stats_dict = {}
# sort=False keeps the years in order of first appearance, matching the order
# Year.unique() would give.
for year, year_rows in cities_to_keep.groupby('Year', sort=False):
    # Cast the key to a built-in int: pandas hands back numpy integers, and
    # json.dump on Python 3 refuses numpy integers as dictionary keys.
    # set_index (without inplace) returns a new frame, so the per-year slice
    # of cities_to_keep is never mutated.
    df_stats_dict[int(year)] = (
        year_rows.set_index('city_state').to_dict(orient='index')
    )

In [33]:
# Serialise the nested year -> city_state -> stats dictionary to city_stats.json.
# (`json` is imported in the notebook's import cell at the top.)
# NOTE(review): json.dump writes missing values as bare NaN, which strict JSON
# parsers reject; confirm the stats contain no NaN or that the reader copes.
with open('city_stats.json', 'w') as fp:
    json.dump(df_stats_dict, fp)

## 3. Write GeoJSON with city locations (i.e., drop all stats)

In [20]:
# One row per retained city is enough to pick out which geometries to keep.
#
# A dedicated name is used here instead of rebinding `cities_to_keep`: that
# name holds the merged per-year summary that the csv_for_summary.csv and
# city_stats.json cells read, and overwriting it made those cells produce
# different output when re-run after this section.
city_locations = (
    df_stats.loc[df_stats.city_state.isin(cities)]
    .drop_duplicates(subset='city_state', keep='first')
)
cities_to_keep_gdf = gpd.GeoDataFrame(city_locations)
# NOTE(review): the stats table has no geometry column, so this CRS is attached
# to a frame without geometries; confirm the geopandas version in use still
# accepts that assignment.
cities_to_keep_gdf.crs = from_epsg(4326)

In [21]:
# Reduce the stats side to the join key (plus Year), attach it to the boundary
# shapes, and keep one row per city.
df_only_cities = cities_to_keep_gdf[['city_state', 'Year']]
only_geometries = (
    patent_cities
    .merge(df_only_cities, how='inner', on='city_state')
    .drop_duplicates(subset='city_state', keep='first')
)
only_geometries.head()

Unnamed: 0,bbox_east,bbox_north,bbox_south,bbox_west,place_name,comma_count,county_flag,city,state,name,postal,fips,city_state,geometry,Year
0,-73.700181,40.916179,40.477399,-74.25909,"New York City, New York, USA",2,no_flag,newyork,NewYork,NewYork,ny,36,newyork_ny,"(POLYGON ((-74.25909 40.497207, -74.259089 40....",2001
2,-73.893042,42.843668,42.763577,-73.983475,"Schenectady, Schenectady County, New York, USA",3,no_flag,schenectady,NewYork,NewYork,ny,36,schenectady_ny,"POLYGON ((-73.9834749 42.8015698, -73.98047 42...",2001
3,-73.719524,41.069796,40.982445,-73.789848,"White Plains, Westchester County, New York, USA",3,no_flag,whiteplains,NewYork,NewYork,ny,36,whiteplains_ny,"POLYGON ((-73.78984800000001 41.029136, -73.78...",2001
4,-78.795168,42.966469,42.826039,-78.919453,"Buffalo, Erie County, New York, USA",3,no_flag,buffalo,NewYork,NewYork,ny,36,buffalo_ny,"POLYGON ((-78.9194528 42.9471686, -78.91906040...",2001
5,-76.074084,43.086102,42.984371,-76.204603,"Syracuse, Onondaga County, New York, USA",3,no_flag,syracuse,NewYork,NewYork,ny,36,syracuse_ny,"POLYGON ((-76.2046029 43.0452075, -76.20459219...",2001


In [22]:
# Add the centroid coordinates of each city shape as plain lat/lon columns.
# The centroids are taken straight from the geometry column, so no temporary
# 'centroid' column (which shares its name with the GeoDataFrame `centroid`
# property) has to be created and dropped again.
# NOTE(review): y/x are read as lat/lon, which assumes the geometries are in
# longitude/latitude degrees, as the bbox columns suggest; confirm.
centroids = only_geometries.geometry.centroid
only_geometries['lat'] = centroids.y
only_geometries['lon'] = centroids.x

In [23]:
# Confirm that lat/lon were added and the temporary centroid column is gone.
only_geometries.columns

Index([  u'bbox_east',  u'bbox_north',  u'bbox_south',   u'bbox_west',
        u'place_name', u'comma_count', u'county_flag',        u'city',
             u'state',        u'name',      u'postal',        u'fips',
        u'city_state',    u'geometry',        u'Year',         u'lat',
               u'lon'],
      dtype='object')

In [24]:
# (rows, columns): one row per retained city that matched a boundary shape.
only_geometries.shape

(297, 17)

In [1]:
# Export the retained city shapes, including their lat/lon columns, as GeoJSON.
only_geometries.to_file('cities_geo.geojson', driver = 'GeoJSON')