#### Notebook Description


## Index


In [54]:
import pandas as pd
import random

In [55]:
df_incomes = pd.read_csv('./Data/zip_income_3col')
df_incomes = df_incomes.drop('Unnamed: 0', axis=1)

In [56]:
df_pops = pd.read_csv('./Data/pop-by-zip-code.csv')
df_pops = df_pops[['zip_code', 'y-2016']]

In [89]:
df_counties = pd.read_csv('./Data/zip_to_counties.csv')
df_counties.drop(df_counties.columns[2:],axis=1,inplace = True)
df_counties.columns = ['zipcode','county']
df_counties['county'] = df_counties['county'].str.replace(' County','')

#### Comparing datasets common zipcodes

Taking a look on how many zip codes work on the whole dataframe, we have 9751 common ones with zip_to_counties. This could indicate that the zipcodes they have in common are important. 

In [90]:
no_in_pops = [i for i in df_incomes['zipcode'] if i not in df_pops['zip_code']]
no_in_incomes = [i for i in df_pops['zip_code'] if i not in df_incomes['zipcode']]
in_pops_and_incomes = [i for i in df_incomes['zipcode'] if i in df_pops['zip_code']]
in_pops_incomes_and_counties = [i for i in df_counties['zipcode'] if i in in_both]

In [93]:
len(in_pops_and_incomes),len(set(in_pops_incomes_and_counties)) # set is because multiple counties per zip and vice versa

(9842, 9751)

In [95]:
def pick_random_working_zips(num):
    return [commons_to_all[random.randint(1,len(commons_to_all))] for i in range(num)]

In [96]:
pick_random_working_zips(5)

[30741, 8812, 29212, 20620, 12601]

### Input Zip Codes

Get list of zip codes to return data from.

In [82]:
output_dict = {'zipcode': {},
               'income': {},
               'state': {},
               'population': {}}
output_df = pd.DataFrame(output_dict)
print('Enter zip codes for review.  When all are added, type done as input.')
while True:
    new_zip = input('Zip code: ')
    if new_zip.lower() == 'done':
        break
    try:
        output_df = output_df.append(df_incomes[(df_incomes.zipcode==int(new_zip))], sort=False)
        pop = df_pops.loc[df_pops['zip_code'] == int(new_zip), ['y-2016']].values[0]
        output_df.loc[output_df['zipcode'] == int(new_zip), ['population']] = pop
    except:
        print('Invalid zip code entered or database is missing information for this code.')
output_df = output_df.reset_index(drop=True)
output_df

Enter zip codes for review.  When all are added, type done as input.
Zip code: 15697
Zip code: 27928
Zip code: 23422
Zip code: 8801
Zip code: 28352
Zip code: done


Unnamed: 0,zipcode,income,state,population
0,15697.0,66792.0,PA,2940.0
1,27928.0,30942.0,NC,1888.0
2,23422.0,10304.0,VA,147.0
3,8801.0,548275.0,NJ,9104.0
4,28352.0,351279.0,NC,25851.0


### Visualization

Graphic to show which zip codes were selected.

###  Analysis output
1. Sum total income lost among all zip codes
2. Sum total population affected
3. List all states affected

In [83]:
def summary(output):
    print('average income per zip code : ' + str(int(output['income'].mean())))
    print('median income per zip code : ' + str(output['income'].median()))
    print('states concerned : ' + str(output['state'].unique()))
    print('mean population per zip code : ' + str(int(output['population'].mean())))    
    print('median population per zip code : ' + str(output['population'].median()))

In [84]:
summary(output_df)

average income per zip code : 201518
median income per zip code : 66792.0
states concerned : ['PA' 'NC' 'VA' 'NJ']
mean population per zip code : 7986
median population per zip code : 2940.0


### Getting counties corresponding to each zip code

In [86]:
def get_counties(output):
    list_of_counties = [ list(zip_to_counties[zip_to_counties['zipcode'] == i]['county']) for i in output['zipcode'] ]
    output['counties'] = list_of_counties

In [87]:
get_counties(output_df)

In [88]:
output_df

Unnamed: 0,zipcode,income,state,population,counties
0,15697.0,66792.0,PA,2940.0,[Westmoreland]
1,27928.0,30942.0,NC,1888.0,"[Tyrrell, Washington]"
2,23422.0,10304.0,VA,147.0,[Accomack]
3,8801.0,548275.0,NJ,9104.0,[Hunterdon]
4,28352.0,351279.0,NC,25851.0,[Scotland]
