#### Notebook Description


## Index


In [3]:
import pandas as pd
import random

try:
    import folium
except:
    print('Folium not installed.  Install with !pip install folium to see map visualization.')

In [6]:
df_incomes = pd.read_csv('./Data/zip_income_3col')
df_incomes = df_incomes.drop('Unnamed: 0', axis=1)

In [7]:
df_pops = pd.read_csv('./Data/pop-by-zip-code.csv')
df_pops = df_pops[['zip_code', 'y-2016']]

In [8]:
df_longlat = pd.read_csv('./Data/zipcode_database.csv')
df_longlat = df_longlat[['Zipcode', 'City', 'Lat', 'Long']]
df_longlat = df_longlat.dropna()
df_longlat.rename(index=str, columns={'Zipcode': 'zipcode', 'Long': 'long', 'Lat': 'lat'}, inplace=True)

In [9]:
df_counties = pd.read_csv('./Data/zip_to_counties.csv')
df_counties.drop(df_counties.columns[2:],axis=1,inplace = True)
df_counties.columns = ['zipcode','county']
df_counties['county'] = df_counties['county'].str.replace(' County','')

#### Comparing datasets common zipcodes

Taking a look on how many zip codes work on the whole dataframe, we have 9751 common ones with zip_to_counties. This could indicate that the zipcodes they have in common are important. 

In [14]:
no_in_pops = [i for i in df_incomes['zipcode'] if i not in df_pops['zip_code']]
no_in_incomes = [i for i in df_pops['zip_code'] if i not in df_incomes['zipcode']]
in_pops_and_incomes = [i for i in df_incomes['zipcode'] if i in df_pops['zip_code']]
in_pops_incomes_and_counties = [i for i in df_counties['zipcode'] if i in in_pops_and_incomes]
in_all = [i for i in df_longlat['zipcode'] if i in in_pops_incomes_and_counties]

In [15]:
len(in_pops_and_incomes),len(set(in_pops_incomes_and_counties)),len(in_all) # set is because multiple counties per zip and vice versa

(9842, 9751, 9751)

In [23]:
def pick_random_working_zips(num):
    return [in_all[random.randint(1,len(in_all))] for i in range(num)]

### Input Zip Codes

Get list of zip codes to return data from.

In [41]:
def get_counties(output):
    list_of_counties = [ list(df_counties[df_counties['zipcode'] == i]['county']) for i in output['zipcode'] ]
    output['counties'] = list_of_counties
def summary(output):
    print('')
    print('Average income : ' + str(int(output['income'].mean())))
    print('Median income : ' + str(output['income'].median()))
    print('states concerned : ' + str(output['state'].unique()))
    print('Average population : ' + str(int(output['population'].mean())))    
    print('Median population : ' + str(output['population'].median()))

In [43]:
output_dict = {'zipcode': {},
               'income': {},
               'state': {},
               'population': {}, 
               'long': {},
               'lat': {}}
output_df = pd.DataFrame(output_dict)
print('Enter zip codes for review.  When all are added, type done as input.')
while True:
    new_zip = input('Zip code: ')
    if new_zip.lower() == 'done':
        break
    try:
        output_df = output_df.append(df_incomes[(df_incomes.zipcode==int(new_zip))], sort=False)
        pop = df_pops.loc[df_pops['zip_code'] == int(new_zip), ['y-2016']].values[0]
        long = df_longlat.loc[df_longlat['zipcode'] == int(new_zip), ['long']].values[0]
        lat = df_longlat.loc[df_longlat['zipcode'] == int(new_zip), ['lat']].values[0]
        output_df.loc[output_df['zipcode'] == int(new_zip), ['population']] = pop
        output_df.loc[output_df['zipcode'] == int(new_zip), ['long']] = long
        output_df.loc[output_df['zipcode'] == int(new_zip), ['lat']] = lat
        
    except:
        print('Invalid zip code entered or database is missing information for this code.')
        

output_df = output_df.reset_index(drop=True)
get_counties(output_df)
summary(output_df)
output_df

Enter zip codes for review.  When all are added, type done as input.
Zip code: 8610
Zip code: 15411
Zip code: 24139
Zip code: 15563
Zip code: 20650
Zip code: 13428
Zip code: 4460
Zip code: 32310
Zip code: 31903
Zip code: 29642
Zip code: done

Average income : 287680
Median income : 114052.5
states concerned : ['NJ' 'PA' 'VA' 'MD' 'NY' 'ME' 'FL' 'GA' 'SC']
Average population : 12044
Median population : 8800.5


Unnamed: 0,zipcode,income,state,population,long,lat,counties
0,8610.0,778231.0,NJ,29911.0,-74.76,40.22,"[Burlington, Mercer]"
1,15411.0,13718.0,PA,720.0,-79.33,39.74,[Somerset]
2,24139.0,10688.0,VA,569.0,-79.46,36.98,"[Bedford, Pittsylvania]"
3,15563.0,70407.0,PA,3222.0,-78.95,40.1,[Somerset]
4,20650.0,613676.0,MD,14379.0,-76.64,38.29,[St. Mary's]
5,13428.0,29343.0,NY,1889.0,-74.57,42.91,[Montgomery]
6,4460.0,21442.0,ME,1145.0,-68.53,45.6,[Penobscot]
7,32310.0,202810.0,FL,16827.0,-84.28,30.45,[Leon]
8,31903.0,157698.0,GA,20559.0,-84.87,32.51,[Muscogee]
9,29642.0,978788.0,SC,31225.0,-82.58,34.82,"[Anderson, Pickens]"


In [38]:
pick_random_working_zips(10)

[8610, 15411, 24139, 15563, 20650, 13428, 4460, 32310, 31903, 29642]

### Visualization

Graphic to show which zip codes were selected.

In [40]:
map_zip = folium.Map(location=[40, -100], zoom_start=4)
output_df.apply(lambda row:folium.CircleMarker(location=[row["lat"], row["long"]], 
                                              radius=10).add_to(map_zip), axis=1)
map_zip
# map_zip.save('zipcode_map.html')