#### Notebook Description


## Importing libraries


In [1]:
import pandas as pd
import random
# folium is optional: it is only needed for the map in the Visualization section.
try:
    import folium
except ImportError:
    # Catch only a missing package; a bare `except:` would also hide
    # KeyboardInterrupt and unrelated errors raised while importing.
    print('Folium not installed.  Install with !pip install folium to see map visualization.')

## Importing and cleaning datasets 

In [2]:
# Load the income table, discard the 'Unnamed: 0' column and rename
# 'income' to 'tot_income' in a single chained expression.
df_incomes = (
    pd.read_csv('./Data/zip_income_3col')
    .drop('Unnamed: 0', axis=1)
    .rename(columns={'income': 'tot_income'})
)
list(df_incomes.columns)

['zipcode', 'tot_income', 'state']

In [3]:
# Keep only the zip code and the 'y-2016' population column, renamed to the
# column names shared by the other dataframes.
df_pops = (
    pd.read_csv('./Data/pop-by-zip-code.csv')[['zip_code', 'y-2016']]
    .rename(columns={'zip_code': 'zipcode', 'y-2016': 'population'})
)
list(df_pops.columns)

['zipcode', 'population']

In [4]:
# Zip code coordinates: keep four columns, drop incomplete rows, lower-case the
# column names. `index=str` also converts the row labels to strings.
df_longlat = (
    pd.read_csv('./Data/zipcode_database.csv')[['Zipcode', 'City', 'Lat', 'Long']]
    .dropna()
    .rename(
        index=str,
        columns={'Zipcode': 'zipcode', 'Long': 'long', 'Lat': 'lat', 'City': 'city'},
    )
)
list(df_longlat.columns)

['zipcode', 'city', 'lat', 'long']

In [5]:
# Zip code -> county lookup: only the first two columns of the file are kept.
zip_county_raw = pd.read_csv('./Data/zip_to_counties.csv')
df_counties = zip_county_raw.drop(zip_county_raw.columns[2:], axis=1)
df_counties.columns = ['zipcode', 'county']
# Strip the ' County' suffix so only the bare county name remains.
df_counties['county'] = df_counties['county'].str.replace(' County', '')
list(df_counties.columns)

['zipcode', 'county']

In [6]:
# Field descriptions: drop the rows labelled 0-7 and 41-46, name the two
# columns, and index the table by field acronym.
df_state_professions_description = (
    pd.read_excel('./Data/field_descriptions.xlsx')
    .drop(list(range(8)))
    .drop([41, 42, 43, 44, 45, 46])
    .rename(columns={'May 2017 OES Estimates': 'acronym', 'Unnamed: 1': 'description'})
    .set_index('acronym')
)

In [7]:
# Full occupational table, without the ANNUAL / HOURLY / STATE columns.
df_state_professions = pd.read_excel('./Data/state_M2017_dl.xlsx').drop(
    ['ANNUAL', 'HOURLY', 'STATE'], axis=1
)

# Reduced table: only the 'major' and 'total' occupation groups, and only the
# columns used later in the notebook.
kept_columns = ['OCC_TITLE', 'ST', 'TOT_EMP', 'JOBS_1000', 'H_MEAN', 'A_MEAN',
                'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75', 'A_PCT90']
is_summary_group = df_state_professions['OCC_GROUP'].isin(['major', 'total'])
df_state_professions_ram = df_state_professions.loc[is_summary_group, kept_columns].copy()

# Shorten the titles and switch to the short column names.
df_state_professions_ram['OCC_TITLE'] = (
    df_state_professions_ram['OCC_TITLE'].str.replace(' Occupations', '')
)
df_state_professions_ram.columns = ['occupation', 'state', 'tot_employement', 'perc',
                                    'h_mean', 'annual_mean', '10', '25', 'med', '75', '90']

## Comparing datasets common zipcodes

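Let's look at how many zip codes are usable across all the dataframes. We have 9751 zip codes in common with zip_to_counties (note: the cell below reports 29770 zip codes common to all four datasets, so this figure may be stale). This could indicate that the zip codes the datasets have in common are the important ones.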

In [8]:
# One set of distinct zip codes per source dataframe.
set_inc, set_pop, set_cou, set_lon = (
    set(frame['zipcode'])
    for frame in (df_incomes, df_pops, df_counties, df_longlat)
)

In [9]:
# Zip codes shared by the datasets, narrowed one dataset at a time.
# Membership is tested against sets rather than the intermediate lists:
# `x in some_list` inside a comprehension scans the whole list each time, which
# is quadratic for tens of thousands of zip codes. The resulting lists (and
# their order) are the same as before.
in_pops_and_incomes = [z for z in set_inc if z in set_pop]

pops_incomes_lookup = set(in_pops_and_incomes)
in_pops_incomes_and_counties = [z for z in set_cou if z in pops_incomes_lookup]

pops_incomes_counties_lookup = set(in_pops_incomes_and_counties)
# Kept as a list: later cells index into `in_all` by position.
in_all = [z for z in set_lon if z in pops_incomes_counties_lookup]

In [10]:
# Number of zip codes present in all four datasets.
len(in_all)

29770

## Merging dataframes with common zipcode

In [11]:
def _common_zip_rows(frame):
    """Return the rows of `frame` whose zipcode is listed in `in_all`."""
    return frame[frame['zipcode'].isin(in_all)]


# Join the four tables on zipcode, one at a time (default inner joins).
first = _common_zip_rows(df_incomes).merge(_common_zip_rows(df_pops), on='zipcode')
second = first.merge(_common_zip_rows(df_longlat), on='zipcode')
complete = second.merge(_common_zip_rows(df_counties), on='zipcode')

# Income per inhabitant, rounded to a whole number.
# NOTE(review): the *1000 suggests tot_income is stored in thousands - confirm.
complete['av_income'] = (complete['tot_income'] * 1000 / complete['population']).round(0)

In [12]:
# Save the merged table. The DataFrame index is written as well (to_csv
# default), so it comes back as an extra unnamed first column when re-read.
complete.to_csv('./Data/complete.csv')

## Testing with random zipcodes 

In [13]:
def pick_random_working_zips(num, pool=None):
    """Draw `num` zip codes at random, with replacement.

    Parameters
    ----------
    num : int
        How many zip codes to draw. The same zip code may be drawn more
        than once.
    pool : sequence, optional
        Zip codes to draw from. Defaults to the notebook-level `in_all` list.

    Returns
    -------
    list
        `num` randomly chosen elements of the pool.

    Raises
    ------
    IndexError
        If `num` is positive and the pool is empty.
    """
    candidates = in_all if pool is None else pool
    # random.choice covers every position 0..len-1. The previous
    # random.randint(1, len(in_all)) is inclusive on both ends, so it could
    # index one past the end (IndexError) and never returned the first element.
    return [random.choice(candidates) for _ in range(num)]

In [14]:
# Rows of the merged table for 20 randomly drawn zip codes. No seed is set,
# so the selection differs on every run.
sampled_zips = pick_random_working_zips(20)
output_df = complete.loc[complete['zipcode'].isin(sampled_zips)].copy()
output_df.head()

Unnamed: 0,zipcode,tot_income,state,population,city,lat,long,county,av_income
300,35640,593404,AL,25434,HARTSELLE,34.43,-86.93,Morgan,23331.0
1961,72658,27358,AR,2223,NORFORK,36.2,-92.28,Baxter,12307.0
1962,72658,27358,AR,2223,NORFORK,36.2,-92.28,Marion,12307.0
3003,93702,398148,CA,46021,FRESNO,36.74,-119.75,Fresno,8651.0
3600,95629,29754,CA,676,FIDDLETOWN,38.52,-120.7,Amador,44015.0


In [15]:
# Occupation statistics for New Hampshire, highest 'perc' first.
is_new_hampshire = df_state_professions_ram['state'] == 'NH'
summer_stats = df_state_professions_ram.loc[is_new_hampshire].sort_values('perc', ascending=False)

In [16]:
# Display the NH occupation table built in the previous cell.
summer_stats

Unnamed: 0,occupation,state,tot_employement,perc,h_mean,annual_mean,10,25,med,75,90
20428,All,NH,649950,1000.0,24.54,51040,20020,26440,38900,61070,93580
20820,Office and Administrative Support,NH,110000,169.25,18.26,37990,21560,27730,35900,46080,58560
20800,Sales and Related,NH,79150,121.783,20.29,42210,17590,20690,27770,49550,83300
20750,Food Preparation and Serving Related,NH,58620,90.189,11.96,24870,16720,18310,21950,28970,38110
20962,Production,NH,44980,69.208,18.98,39480,23890,28760,36160,47220,60800
20585,"Education, Training, and Library",NH,43880,67.518,25.45,52930,23060,31360,47720,66970,82840
20663,Healthcare Practitioners and Technical,NH,39180,60.281,43.74,90980,36480,51640,69770,96190,180960
20429,Management,NH,36840,56.687,56.55,117620,52910,71820,101490,146170,201600
21049,Transportation and Material Moving,NH,34140,52.525,17.4,36200,19790,23680,32010,43030,54630
20461,Business and Financial Operations,NH,28550,43.929,34.77,72330,38690,49520,65010,84100,110900


In [17]:
def summary(output):
    """Print a short text report about a selection of zip code rows.

    `output` is a DataFrame with 'tot_income', 'state', 'city' and
    'population' columns. The report starts with a blank line, then gives the
    mean (truncated to an integer) and median of total income, the distinct
    states and cities, and the mean (truncated) and median of population.
    Nothing is returned.
    """
    print('')
    print('Average total income : {!s}'.format(int(output['tot_income'].mean())))
    print('Median total income : {!s}'.format(output['tot_income'].median()))
    print('States concerned : {!s}'.format(output['state'].unique()))
    print('Cities concerned : {!s}'.format(output['city'].unique()))
    print('Average population : {!s}'.format(int(output['population'].mean())))
    print('Median population : {!s}'.format(output['population'].median()))

In [18]:
# Text report for the randomly selected zip codes.
summary(output_df)


Average total income : 368601
Median total income : 92138.0
States concerned : ['AL' 'AR' 'CA' 'FL' 'IL' 'LA' 'ME' 'MA' 'MN' 'MS' 'MO' 'NV' 'NJ' 'NC'
 'OH' 'WI']
Cities concerned : ['HARTSELLE' 'NORFORK' 'FRESNO' 'FIDDLETOWN' 'MIAMI' 'VICTORIA'
 'VILLE PLATTE' 'CASCO' 'SANDISFIELD' 'CHARLTON' 'BABBITT' 'BRANDON'
 'BRECKENRIDGE' 'LAS VEGAS' 'ROEBLING' 'POINT PLEASANT BEACH' 'KANNAPOLIS'
 'WACO' 'NORTH BALTIMORE' 'SPRINGBROOK']
Average population : 11519
Median population : 3927.0


### Visualization

Graphic to show which zip codes were selected.

In [121]:
# Base map created with a fixed centre ([40, -100]) and zoom level 4.
map_zip = folium.Map(location=[40, -100], zoom_start=4)

# One circle marker per selected row, placed at the row's lat/long.
for marker_lat, marker_long in zip(output_df['lat'], output_df['long']):
    folium.CircleMarker(location=[marker_lat, marker_long], radius=10).add_to(map_zip)

map_zip
# map_zip.save('zipcode_map.html')

## Getting centroids from GeoJSON

In [32]:
def getting_centroids(extension):
    """Read one GeoJSON file and return a table of its zip code centroids.

    `extension` is the file name inside '../State-zip-code-GeoJSON/'
    (resolved against the current working directory). For every entry of the
    file's 'features' list, the 'ZCTA5CE10', 'INTPTLAT10' and 'INTPTLON10'
    properties are collected, with the values taken as they appear in the file.

    Returns a DataFrame with columns 'zipcode', 'latitude', 'longitude'.
    """
    geojson_path = '../State-zip-code-GeoJSON/' + extension
    features = pd.read_json(geojson_path)['features']
    rows = [
        [props['ZCTA5CE10'], props['INTPTLAT10'], props['INTPTLON10']]
        for props in (feature['properties'] for feature in features)
    ]
    return pd.DataFrame(rows, columns=['zipcode', 'latitude', 'longitude'])

In [21]:
import os

In [29]:
# NOTE: os.chdir changes the working directory for the rest of the kernel
# session, so every relative path used after this cell resolves against the
# GeoJSON folder.
files = '../State-zip-code-GeoJSON'
os.chdir(files)

# GeoJSON file names, plus the upper-cased first two characters of each name
# (used as the state label).
extensions = [entry for entry in os.listdir('.') if entry.endswith('.json')]
extension_names = [entry[:2].upper() for entry in extensions]

In [44]:
# One (centroid table, state label) pair per GeoJSON file.
list_of_centroids = [
    (getting_centroids(file_name), state_label)
    for file_name, state_label in zip(extensions, extension_names)
]

In [58]:
# Total number of centroid rows across all per-state tables.
number_of_zips = sum(len(state_frame) for state_frame, _ in list_of_centroids)

In [59]:
# Show the total row count computed in the previous cell.
number_of_zips

33092

In [49]:
# One row per GeoJSON file: the 'centroids' cell holds that file's whole
# centroid DataFrame (a nested frame), 'state' holds its two-letter label.
centroids_df = pd.DataFrame(list_of_centroids,columns = ['centroids','state'])

In [64]:
# Centroid table stored in the first row.
centroids_df.loc[0, 'centroids']

Unnamed: 0,zipcode,latitude,longitude
0,94601,+37.7755447,-122.2187049
1,94501,+37.7737968,-122.2781230
2,94560,+37.5041413,-122.0323587
3,94587,+37.6031556,-122.0186382
4,94580,+37.6757312,-122.1330170
5,94514,+37.8263717,-121.6225460
6,94703,+37.8639059,-122.2756401
7,95601,+38.4266990,-120.8257170
8,95669,+38.4826191,-120.8994719
9,95901,+39.2239387,-121.4940499


In [60]:
# First rows of the centroid table stored in the second row.
centroids_df.loc[1, 'centroids'].head()

Unnamed: 0,zipcode,latitude,longitude
0,82052,41.1185509,-105.3086125
1,82731,44.8247692,-105.2921039
2,82325,41.2469594,-106.7131155
3,82225,42.9857099,-104.3239521
4,82430,43.8154543,-108.1849348


In [62]:
# Each cell of centroids_df['centroids'] is itself a DataFrame. Writing
# centroids_df directly would store only the (truncated) text repr of each
# nested frame, which cannot be read back as data. Flatten to one row per
# zip code, tagged with its state, before saving.
per_state_frames = [
    state_frame.assign(state=state_label)
    for state_frame, state_label in zip(centroids_df['centroids'], centroids_df['state'])
]
if per_state_frames:
    centroids_flat = pd.concat(per_state_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; still write a file with a header row.
    centroids_flat = pd.DataFrame(columns=['zipcode', 'latitude', 'longitude', 'state'])

# Written relative to the current working directory.
centroids_flat.to_csv('centroids.csv', index=False)