#### Notebook Description


## Index


In [3]:
import pandas as pd
import random

try:
    import folium
except:
    print('Folium not installed.  Install with !pip install folium to see map visualization.')

In [6]:
df_incomes = pd.read_csv('./Data/zip_income_3col')
df_incomes = df_incomes.drop('Unnamed: 0', axis=1)

In [7]:
df_pops = pd.read_csv('./Data/pop-by-zip-code.csv')
df_pops = df_pops[['zip_code', 'y-2016']]

In [8]:
df_longlat = pd.read_csv('./Data/zipcode_database.csv')
df_longlat = df_longlat[['Zipcode', 'City', 'Lat', 'Long']]
df_longlat = df_longlat.dropna()
df_longlat.rename(index=str, columns={'Zipcode': 'zipcode', 'Long': 'long', 'Lat': 'lat'}, inplace=True)

In [9]:
df_counties = pd.read_csv('./Data/zip_to_counties.csv')
df_counties.drop(df_counties.columns[2:],axis=1,inplace = True)
df_counties.columns = ['zipcode','county']
df_counties['county'] = df_counties['county'].str.replace(' County','')

#### Comparing datasets common zipcodes

Taking a look on how many zip codes work on the whole dataframe, we have 9751 common ones with zip_to_counties. This could indicate that the zipcodes they have in common are important. 

In [14]:
no_in_pops = [i for i in df_incomes['zipcode'] if i not in df_pops['zip_code']]
no_in_incomes = [i for i in df_pops['zip_code'] if i not in df_incomes['zipcode']]
in_pops_and_incomes = [i for i in df_incomes['zipcode'] if i in df_pops['zip_code']]
in_pops_incomes_and_counties = [i for i in df_counties['zipcode'] if i in in_pops_and_incomes]
in_all = [i for i in df_longlat['zipcode'] if i in in_pops_incomes_and_counties]

In [15]:
len(in_pops_and_incomes),len(set(in_pops_incomes_and_counties)),len(in_all) # set is because multiple counties per zip and vice versa

(9842, 9751, 9751)

In [23]:
def pick_random_working_zips(num):
    return [in_all[random.randint(1,len(in_all))] for i in range(num)]

In [24]:
pick_random_working_zips(5)

[21538, 4915, 13856, 17565, 23454]

### Input Zip Codes

Get list of zip codes to return data from.

In [29]:
def get_counties(output):
    list_of_counties = [ list(df_counties[df_counties['zipcode'] == i]['county']) for i in output['zipcode'] ]
    output['counties'] = list_of_counties

In [30]:
output_dict = {'zipcode': {},
               'income': {},
               'state': {},
               'population': {}, 
               'long': {},
               'lat': {}}
output_df = pd.DataFrame(output_dict)
print('Enter zip codes for review.  When all are added, type done as input.')
while True:
    new_zip = input('Zip code: ')
    if new_zip.lower() == 'done':
        break
    try:
        output_df = output_df.append(df_incomes[(df_incomes.zipcode==int(new_zip))], sort=False)
        pop = df_pops.loc[df_pops['zip_code'] == int(new_zip), ['y-2016']].values[0]
        long = df_longlat.loc[df_longlat['zipcode'] == int(new_zip), ['long']].values[0]
        lat = df_longlat.loc[df_longlat['zipcode'] == int(new_zip), ['lat']].values[0]
        output_df.loc[output_df['zipcode'] == int(new_zip), ['population']] = pop
        output_df.loc[output_df['zipcode'] == int(new_zip), ['long']] = long
        output_df.loc[output_df['zipcode'] == int(new_zip), ['lat']] = lat
        
    except:
        print('Invalid zip code entered or database is missing information for this code.')
        

output_df = output_df.reset_index(drop=True)
get_counties(output_df)
output_df

Enter zip codes for review.  When all are added, type done as input.
Zip code: 21538
Zip code: 4915
Zip code: 13856
Zip code: 17565
Zip code: 23454
Zip code: done


Unnamed: 0,zipcode,income,state,population,long,lat,counties
0,21538.0,10328.0,MD,442.0,-79.18,39.38,[Garrett]
1,4915.0,219852.0,ME,8818.0,-69.02,44.42,[Waldo]
2,13856.0,116116.0,NY,6258.0,-75.13,42.16,[Delaware]
3,17565.0,85446.0,PA,2788.0,-76.31,39.9,[Lancaster]
4,23454.0,2256450.0,VA,60390.0,-76.04,36.73,[Virginia Beach city]


### Visualization

Graphic to show which zip codes were selected.

In [31]:
map_zip = folium.Map(location=[40, -100], zoom_start=4)
output_df.apply(lambda row:folium.CircleMarker(location=[row["lat"], row["long"]], 
                                              radius=10).add_to(map_zip), axis=1)
map_zip
# map_zip.save('zipcode_map.html')

###  Analysis output
1. Sum total income lost among all zip codes
2. Sum total population affected
3. List all states affected

In [32]:
def summary(output):
    print('average income per zip code : ' + str(int(output['income'].mean())))
    print('median income per zip code : ' + str(output['income'].median()))
    print('states concerned : ' + str(output['state'].unique()))
    print('mean population per zip code : ' + str(int(output['population'].mean())))    
    print('median population per zip code : ' + str(output['population'].median()))

In [33]:
summary(output_df)

average income per zip code : 537638
median income per zip code : 116116.0
states concerned : ['MD' 'ME' 'NY' 'PA' 'VA']
mean population per zip code : 15739
median population per zip code : 6258.0
