#### Notebook Description


## Index


In [3]:
import pandas as pd


In [4]:
df_incomes = pd.read_csv('./Data/zip_income_3col')
df_incomes = df_incomes.drop('Unnamed: 0', axis=1)

In [5]:
df_pops = pd.read_csv('./Data/pop-by-zip-code.csv')
df_pops = df_pops[['zip_code', 'y-2016']]

In [6]:
not_in_pops = [i for i in df_incomes['zipcode'] if i not in df_pops['zip_code']]
in_both = [i for i in df_incomes['zipcode'] if i in df_pops['zip_code']]
not_in_incomes = [i for i in df_pops['zip_code'] if i not in df_incomes['zipcode']]

In [7]:
len(not_in_incomes),len(not_in_pops),len(in_both)

(23549, 20132, 9842)

### Input Zip Codes

Get list of zip codes to return data from.

In [34]:
output_dict = {'zipcode': {},
               'income': {},
               'state': {},
               'population': {}}
output_df = pd.DataFrame(output_dict)
print('Enter zip codes for review.  When all are added, type done as input.')
while True:
    new_zip = input('Zip code: ')
    if new_zip.lower() == 'done':
        break
    try:
        output_df = output_df.append(df_incomes[(df_incomes.zipcode==int(new_zip))], sort=False)
        pop = df_pops.loc[df_pops['zip_code'] == int(new_zip), ['y-2016']].values[0]
        output_df.loc[output_df['zipcode'] == int(new_zip), ['population']] = pop
    except:
        print('Invalid zip code entered or database is missing information for this code.')
output_df = output_df.reset_index(drop=True)
output_df

Enter zip codes for review.  When all are added, type done as input.
Zip code: 36005
Zip code: 36006
Zip code: don
Invalid zip code entered or database is missing information for this code.
Zip code: done


Unnamed: 0,zipcode,income,state,population
0,36005.0,28829.0,AL,1680.0
1,36006.0,24941.0,AL,1342.0


### Visualization

Graphic to show which zip codes were selected.

###  Analysis output
1. Sum total income lost among all zip codes
2. Sum total population affected
3. List all states affected

In [9]:
def summary(output):
    print('average income per zip code : ' + str(output['income'].mean()))
    print('median income per zip code : ' + str(output['income'].median()))
    print('states concerned : ' + str(output['state'].unique()))
    print('mean population per zip code : ' + str(output['population'].mean()))    
    print('median population per zip code : ' + str(output['population'].median()))

In [10]:
summary(output_df)

average income per zip code : 28829.0
median income per zip code : 28829.0
states concerned : ['AL']
mean population per zip code : 1680.0
median population per zip code : 1680.0


### Getting counties corresponding to each zip code

dada import, selection and getting the county str out of the county feature because unnecessary

In [48]:
zip_to_counties = pd.read_csv('./Data/zip_to_counties.csv')
zip_to_counties.drop(zip_to_counties.columns[2:],axis=1,inplace = True)
zip_to_counties.columns = ['zipcode','county']
zip_to_counties['county'] = zip_to_counties['county'].str.replace(' County','')

Once again just to look how many zip codes are common in our dataframes, on 9842 common zip codes for our two dataframes df_incomes and df_pops we have 9751 common ones with zip_to_counties. This could indicate that the zipcodes they have in common are important. 

In [49]:
commons = [i for i in zip_to_counties['zipcode'] if i in df_incomes['zipcode']]
commons_to_all = [i for i in zip_to_counties['zipcode'] if i in in_both]
len(commons),len(set(commons_to_all))

(13095, 9751)

In [50]:
def get_counties(output):
    list_of_counties = [ list(zip_to_counties[zip_to_counties['zipcode'] == i]['county']) for i in output['zipcode'] ]
    output['counties'] = list_of_counties

In [51]:
get_counties(output_df)

In [52]:
output_df

Unnamed: 0,zipcode,income,state,population,counties
0,36005.0,28829.0,AL,1680.0,"[Bullock, Pike]"
1,36006.0,24941.0,AL,1342.0,"[Autauga, Chilton]"
