#### Notebook Description


## Importing libraries


In [2]:
import pandas as pd
import random
# folium is optional: it is only needed for the map visualization cell.
try:
    import folium
except ImportError:
    # Catch only a missing package; a bare `except:` would also hide
    # KeyboardInterrupt and unrelated errors raised while importing.
    print('Folium not installed.  Install with !pip install folium to see map visualization.')

## Importing and cleaning datasets 

In [131]:
# Income table: drop the 'Unnamed: 0' column (presumably a saved index -- confirm)
# and rename 'income' to 'tot_income'.
df_incomes = (
    pd.read_csv('./Data/zip_income_3col')
    .drop('Unnamed: 0', axis=1)
    .rename(columns={'income': 'tot_income'})
)
list(df_incomes.columns)

['zipcode', 'tot_income', 'state']

In [132]:
# Population table: keep only the zip code and the 'y-2016' column,
# renamed to the shared 'zipcode' key and 'population'.
pops_column_names = {'zip_code': 'zipcode', 'y-2016': 'population'}
df_pops = pd.read_csv('./Data/pop-by-zip-code.csv')[['zip_code', 'y-2016']]
df_pops = df_pops.rename(columns=pops_column_names)
list(df_pops.columns)

['zipcode', 'population']

In [133]:
# Coordinates table: keep zip code, city and lat/long, discard rows with any
# missing value, then lower-case the column names and cast index labels to str.
df_longlat = (
    pd.read_csv('./Data/zipcode_database.csv')[['Zipcode', 'City', 'Lat', 'Long']]
    .dropna()
    .rename(
        index=str,
        columns={'Zipcode': 'zipcode', 'Long': 'long', 'Lat': 'lat', 'City': 'city'},
    )
)
list(df_longlat.columns)

['zipcode', 'city', 'lat', 'long']

In [134]:
# County table: only the first two columns are used; they are relabelled
# 'zipcode' and 'county', and the ' County' suffix is stripped from the names.
df_counties = pd.read_csv('./Data/zip_to_counties.csv').iloc[:, :2].copy()
df_counties.columns = ['zipcode', 'county']
df_counties['county'] = df_counties['county'].str.replace(' County', '')
list(df_counties.columns)

['zipcode', 'county']

## Comparing the zip codes common to all datasets

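Looking at how many zip codes the datasets have in common, we find 9751 shared with zip_to_counties. This could indicate that the zip codes they have in common are the important ones. (Note: the recorded output of the cell below reports 29770 zip codes common to all four datasets, so the 9751 figure may be stale.)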

In [90]:
# One set of distinct zip codes per dataset, for fast membership tests.
set_inc, set_pop, set_cou, set_lon = (
    set(frame['zipcode'])
    for frame in (df_incomes, df_pops, df_counties, df_longlat)
)

In [91]:
# Zip codes shared by progressively more datasets; `in_all` holds those present
# in all four. Set intersection replaces the previous `x in <list>` membership
# tests, which scanned a list for every candidate (quadratic time).
# Results stay lists (later code indexes into `in_all`) and are sorted so the
# order is deterministic across runs.
in_pops_and_incomes = sorted(set_inc & set_pop)
in_pops_incomes_and_counties = sorted(set_cou.intersection(in_pops_and_incomes))
in_all = sorted(set_lon.intersection(in_pops_incomes_and_counties))

In [92]:
# Number of zip codes found in all four datasets.
len(in_all)

29770

## Merging dataframes with common zipcode

In [107]:
# Join the four tables on 'zipcode', each one first restricted to the zip codes
# listed in `in_all`. The joins are chained: incomes+pops -> +lat/long -> +counties.
first = df_incomes[df_incomes['zipcode'].isin(in_all)].merge(
    df_pops[df_pops['zipcode'].isin(in_all)],
    on='zipcode',
)
second = first.merge(
    df_longlat[df_longlat['zipcode'].isin(in_all)],
    on='zipcode',
)
complete = second.merge(
    df_counties[df_counties['zipcode'].isin(in_all)],
    on='zipcode',
)

# Per-capita income, rounded to a whole number.
# NOTE(review): the factor 1000 suggests tot_income is stored in thousands -- confirm.
# A population of 0 yields inf (or NaN) here rather than raising.
income_per_person = complete['tot_income'] * 1000 / complete['population']
complete['av_income'] = income_per_person.round(0)

## Testing with random zipcodes 

In [113]:
def pick_random_working_zips(num, zips=None):
    """Return up to `num` zip codes drawn at random, without repetition.

    Parameters
    ----------
    num : int
        Number of zip codes requested. Values below 0 are treated as 0.
    zips : sequence, optional
        Pool to draw from. Defaults to the notebook-level `in_all` list.

    Returns
    -------
    list
        `num` entries taken from distinct positions of the pool, or the whole
        pool (shuffled) if it holds fewer than `num` entries.

    Notes
    -----
    The previous implementation indexed with `random.randint(1, len(in_all))`,
    whose upper bound is inclusive: it could raise IndexError and could never
    return the first element. It also drew with replacement, so the same zip
    code could be picked more than once.
    """
    if zips is None:
        zips = in_all
    pool = list(zips)
    # Clamp so that random.sample never receives an invalid sample size.
    count = max(0, min(num, len(pool)))
    return random.sample(pool, count)

In [114]:
# Rows of `complete` whose zipcode is among 20 randomly drawn working zip codes.
# .copy() detaches the sample from `complete`.
output_df = complete.loc[
    lambda frame: frame['zipcode'].isin(pick_random_working_zips(20))
].copy()
output_df.head()

Unnamed: 0,zipcode,tot_income,state,population,city,lat,long,county,av_income
1010,99789,9261,AK,347,NUIQSUT,69.83,-152.14,North Slope Borough,26689.0
1563,71962,11253,AR,875,OKOLONA,34.0,-93.33,Clark,12861.0
2695,92661,451311,CA,3419,NEWPORT BEACH,33.6,-117.9,Orange,132001.0
5780,33973,142783,FL,14281,LEHIGH ACRES,26.59,-81.72,Lee,9998.0
5908,34604,245441,FL,10510,BROOKSVILLE,28.48,-82.42,Hernando,23353.0


In [119]:
def summary(output):
    """Print summary statistics for a frame of zip code rows.

    Reads the 'tot_income', 'state', 'city' and 'population' columns of
    `output` and prints, after one blank line: the truncated mean and the
    median of total income, the distinct states and cities, and the truncated
    mean and the median of population.
    """
    def report_lines():
        # Lazily produced so each value is computed right before it is printed.
        yield ''
        yield 'Average total income : ' + str(int(output['tot_income'].mean()))
        yield 'Median total income : ' + str(output['tot_income'].median())
        yield 'States concerned : ' + str(output['state'].unique())
        yield 'Cities concerned : ' + str(output['city'].unique())
        yield 'Average population : ' + str(int(output['population'].mean()))
        yield 'Median population : ' + str(output['population'].median())

    for line in report_lines():
        print(line)

In [120]:
# Print the income/population statistics for the randomly sampled zip codes.
summary(output_df)


Average total income : 176460
Median total income : 67079.0
States concerned : ['AK' 'AR' 'CA' 'FL' 'IN' 'MN' 'MO' 'NY' 'NC' 'OH' 'OK' 'PA' 'TN' 'TX'
 'WA']
Cities concerned : ['NUIQSUT' 'OKOLONA' 'NEWPORT BEACH' 'LEHIGH ACRES' 'BROOKSVILLE' 'THAYER'
 'GROVE CITY' 'CUSHING' 'SAINT LOUIS' 'MARYVILLE' 'HOLLEY' 'STOKESDALE'
 'CUMBERLAND' 'CARNEY' 'NORMALVILLE' 'JENNERS' 'MAINESBURG' 'KNOXVILLE'
 'VERNON' 'STEVENSON']
Average population : 6262
Median population : 2464.0


### Visualization

Graphic to show which zip codes were selected.

In [121]:
# Base map centred on [40, -100] at zoom level 4
# (presumably chosen to frame the continental US -- confirm).
map_zip = folium.Map(location=[40, -100], zoom_start=4)

# One circle marker per sampled zip code, placed at its lat/long.
for marker_lat, marker_lon in zip(output_df["lat"], output_df["long"]):
    marker = folium.CircleMarker(location=[marker_lat, marker_lon], radius=10)
    marker.add_to(map_zip)

map_zip
# map_zip.save('zipcode_map.html')