#### Notebook Description


## Importing libraries


In [1]:
import pandas as pd
import random
try:
    import folium
except:
    print('Folium not installed.  Install with !pip install folium to see map visualization.')

## Importing and cleaning datasets 

In [2]:
df_incomes = pd.read_csv('./Data/zip_income_3col')
df_incomes = df_incomes.drop('Unnamed: 0', axis=1)
df_incomes.rename(columns = {'income':'tot_income'}, inplace = True)
list(df_incomes.columns)

['zipcode', 'tot_income', 'state']

In [3]:
df_pops = pd.read_csv('./Data/pop-by-zip-code.csv')
df_pops = df_pops[['zip_code', 'y-2016']]
df_pops.rename(columns={'zip_code':'zipcode','y-2016':'population'},inplace = True)
list(df_pops.columns)

['zipcode', 'population']

In [4]:
df_longlat = pd.read_csv('./Data/zipcode_database.csv')
df_longlat = df_longlat[['Zipcode', 'City', 'Lat', 'Long']]
df_longlat = df_longlat.dropna()
df_longlat.rename(index=str, columns={'Zipcode': 'zipcode', 'Long': 'long', 'Lat': 'lat','City':'city'}, inplace=True)
list(df_longlat.columns)

['zipcode', 'city', 'lat', 'long']

In [5]:
df_counties = pd.read_csv('./Data/zip_to_counties.csv')
df_counties.drop(df_counties.columns[2:],axis=1,inplace = True)
df_counties.columns = ['zipcode','county']
df_counties['county'] = df_counties['county'].str.replace(' County','')
list(df_counties.columns)

['zipcode', 'county']

In [6]:
df_state_professions_description = pd.read_excel('./Data/field_descriptions.xlsx')
df_state_professions_description.drop([i for i in range(8)],inplace = True)
df_state_professions_description.drop([41,42,43,44,45,46],inplace =True)
df_state_professions_description.rename(columns = {'May 2017 OES Estimates':'acronym','Unnamed: 1':'description'},inplace = True)
df_state_professions_description.set_index('acronym',inplace = True)

In [47]:
df_state_professions_description['description']

acronym
Field                                           Field Description
prim_state      Primary state for the MSA (only on MSA and non...
area            MSA, metropolitan division, or state FIPS code...
st                    State abbreviation (only on the state file)
state                         State name (only on the state file)
area_name       Area name (only on metropolitan and nonmetropo...
naics           North American Industry Classification System ...
naics_title     North American Industry Classification System ...
ownership       Ownership type (only on the industry and owner...
occ_code        The 6-digit Standard Occupational Classificati...
occ_title       Standard Occupational Classification title or ...
occ_group       Shows the SOC occupation level: "total"=total ...
tot_emp         Estimated total employment rounded to the near...
emp_prse        Percent relative standard error (RSE) for the ...
pct_total       Percent of industry employment in the given oc...
pc

In [33]:
df_state_professions = pd.read_excel('./Data/state_M2017_dl.xlsx')
df_state_professions.drop(['ANNUAL','HOURLY','STATE'],axis =1,inplace = True)
df_state_professions_ram = df_state_professions[(df_state_professions['OCC_GROUP'] == 'major')|(df_state_professions['OCC_GROUP'] == 'total')].copy()
df_state_professions_ram = df_state_professions_ram[['OCC_TITLE','ST','TOT_EMP','JOBS_1000','H_MEAN','A_MEAN','A_PCT10','A_PCT25','A_MEDIAN','A_PCT75','A_PCT90']]
df_state_professions_ram['OCC_TITLE'] =df_state_professions_ram['OCC_TITLE'].str.replace(' Occupations', '')
df_state_professions_ram.columns = ['occupation','state','tot_employement','perc','h_mean','annual_mean','10','25','med','75','90']

## Comparing datasets common zipcodes

Taking a look on how many zip codes work on the whole dataframe, we have 9751 common ones with zip_to_counties. This could indicate that the zipcodes they have in common are important. 

In [12]:
set_inc = set(df_incomes['zipcode'])
set_pop = set(df_pops['zipcode'])
set_cou = set(df_counties['zipcode'])
set_lon = set(df_longlat['zipcode'])

In [13]:
in_pops_and_incomes = [i for i in set_inc if i in set_pop]
in_pops_incomes_and_counties = [i for i in set_cou if i in in_pops_and_incomes]
in_all = [i for i in set_lon if i in in_pops_incomes_and_counties]

In [14]:
len(in_all)

29770

## Merging dataframes with common zipcode

In [15]:
first = pd.merge(df_incomes[df_incomes['zipcode'].isin(in_all)],
                 df_pops[df_pops['zipcode'].isin(in_all)],
                 on ='zipcode')
second = pd.merge(first,
                  df_longlat[df_longlat['zipcode'].isin(in_all)],
                  on = 'zipcode')
complete = pd.merge(second,
                    df_counties[df_counties['zipcode'].isin(in_all)],
                    on = 'zipcode')

complete['av_income'] = complete['tot_income']*1000/complete['population']
complete['av_income'] = complete['av_income'].apply(lambda x: round(x,0))

In [16]:
complete.to_csv('./Data/complete.csv')

## Testing with random zipcodes 

In [17]:
def pick_random_working_zips(num):
    return [in_all[random.randint(1,len(in_all))] for i in range(num)]

In [18]:
output_df = complete[complete['zipcode'].isin(pick_random_working_zips(20))].copy()
output_df.head()

Unnamed: 0,zipcode,tot_income,state,population,city,lat,long,county,av_income
5415,33132,571687,FL,11433,MIAMI,25.77,-80.2,Miami-Dade,50003.0
9582,46254,767281,IN,40335,INDIANAPOLIS,39.77,-86.14,Marion,19023.0
12569,67352,6044,KS,531,LONGTON,37.37,-96.08,Elk,11382.0
12888,40068,87054,KY,1891,SMITHFIELD,38.38,-85.25,Henry,46036.0
12889,40068,87054,KY,1891,SMITHFIELD,38.38,-85.25,Oldham,46036.0


In [45]:
summer_stats = df_state_professions_ram[df_state_professions_ram['state'].isin(['NH'])].sort_values(by='perc',ascending =False)

In [46]:
summer_stats

Unnamed: 0,occupation,state,tot_employement,perc,h_mean,annual_mean,10,25,med,75,90
20428,All,NH,649950,1000.0,24.54,51040,20020,26440,38900,61070,93580
20820,Office and Administrative Support,NH,110000,169.25,18.26,37990,21560,27730,35900,46080,58560
20800,Sales and Related,NH,79150,121.783,20.29,42210,17590,20690,27770,49550,83300
20750,Food Preparation and Serving Related,NH,58620,90.189,11.96,24870,16720,18310,21950,28970,38110
20962,Production,NH,44980,69.208,18.98,39480,23890,28760,36160,47220,60800
20585,"Education, Training, and Library",NH,43880,67.518,25.45,52930,23060,31360,47720,66970,82840
20663,Healthcare Practitioners and Technical,NH,39180,60.281,43.74,90980,36480,51640,69770,96190,180960
20429,Management,NH,36840,56.687,56.55,117620,52910,71820,101490,146170,201600
21049,Transportation and Material Moving,NH,34140,52.525,17.4,36200,19790,23680,32010,43030,54630
20461,Business and Financial Operations,NH,28550,43.929,34.77,72330,38690,49520,65010,84100,110900


In [25]:
def summary(output):
    print('')
    print('Average total income : ' + str(int(output['tot_income'].mean())))
    print('Median total income : ' + str(output['tot_income'].median()))
    print('States concerned : ' + str(output['state'].unique()))
    print('Cities concerned : ' + str(output['city'].unique()))
    print('Average population : ' + str(int(output['population'].mean())))    
    print('Median population : ' + str(output['population'].median()))

In [26]:
summary(output_df)


Average total income : 355302
Median total income : 212147.0
States concerned : ['FL' 'IN' 'KS' 'KY' 'ME' 'MN' 'MO' 'NJ' 'OK' 'PA' 'RI' 'SC' 'TN' 'TX'
 'VA' 'WV']
Cities concerned : ['MIAMI' 'INDIANAPOLIS' 'LONGTON' 'SMITHFIELD' 'FALMOUTH' 'WHITING'
 'SAINT PAUL' 'DOE RUN' 'SALEM' 'ROFF' 'SOUTH PARK' 'JOHNSTON'
 'JONESVILLE' 'REEVESVILLE' 'MORRISTOWN' 'WINNSBORO' 'TYLER' 'MERCEDES'
 'ROCHELLE' 'ENTERPRISE']
Average population : 12612
Median population : 9759.0


### Visualization

Graphic to show which zip codes were selected.

In [121]:
map_zip = folium.Map(location=[40, -100], zoom_start=4)
output_df.apply(lambda row:folium.CircleMarker(location=[row["lat"], row["long"]], 
                                              radius=10).add_to(map_zip), axis=1)
map_zip
# map_zip.save('zipcode_map.html')

## Geojson Implentation

In [155]:
alaska = pd.read_json('./State-zip-code-GeoJSON/ak_alaska_zip_codes_geo.min.json')

In [163]:
alaska['features'][0]['geometry']['coordinates']

[[[-170.287655, 57.126766],
  [-170.287641, 57.126927],
  [-170.287631, 57.12706],
  [-170.287437, 57.127353],
  [-170.287413, 57.127391],
  [-170.287377, 57.127502],
  [-170.287358, 57.127558],
  [-170.287325, 57.127662],
  [-170.287271, 57.127704],
  [-170.287225, 57.127742],
  [-170.287061, 57.127874],
  [-170.287, 57.127924],
  [-170.28669, 57.128163],
  [-170.286634, 57.128161],
  [-170.286528, 57.128163],
  [-170.286318, 57.128169],
  [-170.286521, 57.128388],
  [-170.288203, 57.130195],
  [-170.293861, 57.136273],
  [-170.295748, 57.1383],
  [-170.296171, 57.138824],
  [-170.297443, 57.140397],
  [-170.297867, 57.140922],
  [-170.297935, 57.141006],
  [-170.298139, 57.141259],
  [-170.298208, 57.141344],
  [-170.298251, 57.141397],
  [-170.290561, 57.144244],
  [-170.290793, 57.145052],
  [-170.293065, 57.147739],
  [-170.29329, 57.147909],
  [-170.294356, 57.14872],
  [-170.295783, 57.149806],
  [-170.303963, 57.15491],
  [-170.312815, 57.156351],
  [-170.32484, 57.156769],
  [