# 2 - Census Data.
### Extracting a state (Mass) from the raw USA census data.

## Preprocessing

- Claire emailed me census data from ACS at Zipcode level
- Open R12298078_SL860.csv in Excel
    - Delete row number 2
    - Format Numbers (special - zipcode 5 digit) on column AQ 'ZIP Code Tabulation Area'
    - Also delete ' (5 digit)' from the header (This seems to help with importing)
- Save and close

## USZipcode python package
- https://uszipcode.readthedocs.io/index.html

In [110]:
import pandas as pd
import uszipcode
print('uszipcode', uszipcode.__version__)
from uszipcode import SearchEngine, SimpleZipcode, Zipcode

uszipcode 0.2.2


In [111]:
# Full name of the state whose zipcodes will be extracted.
state = 'Massachusetts'

In [112]:
# Look up every zipcode that uszipcode knows for the chosen state.
# Pass simple_zipcode=False to SearchEngine() to return more census data.
search = SearchEngine()
# `returns` defaults to 5; returns=0 returns all matches.
results = search.by_state(state, returns=0)
n_zipcodes = len(results)
print("There are", n_zipcodes, "zipcodes in", state)

There are 496 zipcodes in Massachusetts


In [113]:
# Turn each search result into a plain dict, then build one DataFrame
# from the whole list in a single call.
temp_df = [entry.to_dict() for entry in results]

zipcode_results = pd.DataFrame(temp_df)

In [114]:
# Capitalise every column label (e.g. 'zipcode' -> 'Zipcode'), then preview.
zipcode_results.columns = [label.capitalize() for label in zipcode_results.columns]
zipcode_results.head()

Unnamed: 0,Zipcode,Zipcode_type,Major_city,Post_office_city,Common_city_list,County,State,Lat,Lng,Timezone,...,Land_area_in_sqmi,Water_area_in_sqmi,Housing_units,Occupied_housing_units,Median_home_value,Median_household_income,Bounds_west,Bounds_east,Bounds_north,Bounds_south
0,1001,Standard,Agawam,"Agawam, MA",[Agawam],Hampden County,MA,42.07,-72.63,Eastern,...,11.44,0.86,7557.0,7215.0,213000.0,58733.0,-72.667902,-72.582535,42.100467,42.030795
1,1002,Standard,Amherst,"Amherst, MA","[Amherst, Cushman, Pelham]",Hampshire County,MA,42.38,-72.52,Eastern,...,55.04,1.65,10388.0,9910.0,338900.0,54422.0,-72.546776,-72.355041,42.437947,42.301437
2,1005,Standard,Barre,"Barre, MA",[Barre],Worcester County,MA,42.42,-72.12,Eastern,...,44.24,0.26,2044.0,1904.0,208500.0,68644.0,-72.205174,-72.007388,42.484473,42.356423
3,1007,Standard,Belchertown,"Belchertown, MA",[Belchertown],Hampshire County,MA,42.3,-72.4,Eastern,...,52.64,2.68,5839.0,5595.0,260000.0,71875.0,-72.472287,-72.331642,42.358762,42.185812
4,1008,Standard,Blandford,"Blandford, MA",[Blandford],Hampden County,MA,42.2,-73.0,Eastern,...,53.8,1.96,586.0,503.0,247200.0,71635.0,-73.034916,-72.87218,42.25134,42.113028


### Import US Census Data

In [115]:
# Load the census CSV, reading the ZCTA column as text rather than
# letting pandas infer its type.
usa_census = pd.read_csv(
    '~/Desktop/new_insights/datasets/census/R12298078_SL860.csv',
    dtype={'ZIP Code Tabulation Area': str},
    low_memory=False,
)

In [116]:
# Rename the ZCTA column to 'Zipcode', then display that column.
usa_census = usa_census.rename(
    {'ZIP Code Tabulation Area': 'Zipcode'},
    axis='columns',
)
usa_census['Zipcode']

0        00601
1        00602
2        00603
3        00606
4        00610
         ...  
33115    99923
33116    99925
33117    99926
33118    99927
33119    99929
Name: Zipcode, Length: 33120, dtype: object

In [117]:
# Keep only the state's zipcodes as a one-column frame, then join the
# census rows onto it by 'Zipcode' (default join type).
temp_df = zipcode_results['Zipcode'].to_frame()
massachusetts_census = temp_df.merge(usa_census, on='Zipcode')

In [118]:
# Number of rows in the merged state-level census table.
massachusetts_census.shape[0]

485

In [119]:
# Preview the first five rows of the merged table.
massachusetts_census.head(n=5)

Unnamed: 0,Zipcode,FIPS,Geographic Identifier,Name of Area,Qualifying Name,State/U.S.-Abbreviation (USPS),Summary Level,Geographic Component,File Identification,Logical Record Number,...,Own Children under 18 Years,Own Children under 18 Years: Children Living with Single Parents,Households.2,Households: 1-Person Household,Households: 2-Person Household,Households: 3-Person Household,Households: 4-Person Household,Households: 5-Person Household,Households: 6-Person Household,Households: 7-or-More Person Household
0,1001,1001001,86000US01001,01001 ZCTA5,01001 ZCTA5,us,860,0,ACSSF,10707,...,2706,627,7460,2826,2460,1010,785,237,83,59
1,1002,1001002,86000US01002,01002 ZCTA5,01002 ZCTA5,us,860,0,ACSSF,10708,...,3625,1289,9976,2726,3185,1632,1800,455,103,75
2,1005,1001005,86000US01005,01005 ZCTA5,01005 ZCTA5,us,860,0,ACSSF,10710,...,813,184,1785,503,534,322,232,119,35,40
3,1007,1001007,86000US01007,01007 ZCTA5,01007 ZCTA5,us,860,0,ACSSF,10711,...,3153,553,5558,1022,2097,854,1143,364,78,0
4,1008,1001008,86000US01008,01008 ZCTA5,01008 ZCTA5,us,860,0,ACSSF,10712,...,181,45,548,122,236,114,58,18,0,0


In [121]:
# Write the state-level census table to disk without the DataFrame index.
massachusetts_census.to_csv(
    '~/Desktop/new_insights/datasets/census/massachusetts_census_data.csv',
    index=False,
)

### Double check that the zipcodes that are in the state (i.e. MA) but not in the Census Data all have population = 0.

In [122]:
# List every state zipcode (from uszipcode) that has no row in the merged
# census table, printing each one as "> <zipcode>".
# NOTE(review): this only lists the missing zipcodes; it does not look up
# their population, so the "population = 0" claim still has to be confirmed
# separately.
# The lookup set is built once instead of re-creating a list on every
# iteration, and the loop variable is not called `zip`, which would shadow
# the builtin for the rest of the kernel session.
census_zipcodes = set(massachusetts_census['Zipcode'])
for zipcode in zipcode_results['Zipcode']:
    if zipcode not in census_zipcodes:
        print('>', zipcode)

> 01144
> 01152
> 01195
> 01380
> 01655
> 02031
> 02133
> 02222
> 02283
> 02284
> 02636
