# Moving Social Characteristic CSVs into a dataframe

In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Directory containing the social-characteristics CSV files that the cells below
# list and load. The path is relative, so it is resolved against the working
# directory the notebook is run from.
path = 'felipe_csv/social_characteristics'

### Listing out the files in the directory

In [3]:
# os.listdir returns directory entries in arbitrary order; sorting makes the
# file list (and anything built by iterating over it) identical on every run.
files = sorted(os.listdir(path))
files

['acs5_ZCTA501001.csv',
 'acs5_ZCTA501002.csv',
 'acs5_ZCTA501003.csv',
 'acs5_ZCTA501005.csv',
 'acs5_ZCTA501007.csv',
 'acs5_ZCTA501008.csv',
 'acs5_ZCTA501009.csv',
 'acs5_ZCTA501010.csv',
 'acs5_ZCTA501011.csv',
 'acs5_ZCTA501012.csv',
 'acs5_ZCTA501013.csv',
 'acs5_ZCTA501020.csv',
 'acs5_ZCTA501022.csv',
 'acs5_ZCTA501026.csv',
 'acs5_ZCTA501027.csv',
 'acs5_ZCTA501028.csv',
 'acs5_ZCTA501029.csv',
 'acs5_ZCTA501030.csv',
 'acs5_ZCTA501031.csv',
 'acs5_ZCTA501032.csv',
 'acs5_ZCTA501033.csv',
 'acs5_ZCTA501034.csv',
 'acs5_ZCTA501035.csv',
 'acs5_ZCTA501036.csv',
 'acs5_ZCTA501037.csv',
 'acs5_ZCTA501038.csv',
 'acs5_ZCTA501039.csv',
 'acs5_ZCTA501040.csv',
 'acs5_ZCTA501050.csv',
 'acs5_ZCTA501053.csv',
 'acs5_ZCTA501054.csv',
 'acs5_ZCTA501056.csv',
 'acs5_ZCTA501057.csv',
 'acs5_ZCTA501060.csv',
 'acs5_ZCTA501062.csv',
 'acs5_ZCTA501063.csv',
 'acs5_ZCTA501066.csv',
 'acs5_ZCTA501068.csv',
 'acs5_ZCTA501069.csv',
 'acs5_ZCTA501070.csv',
 'acs5_ZCTA501071.csv',
 'acs5_ZCTA50107

### Creating logic to put all of the CSVs into a single dataframe

To start, I grabbed a couple of rows of information I thought could be helpful in the regression (households, educational attainment, and broadband internet access), but we can add columns as necessary. Given the format that Felipe put the CSVs into, I extracted the zip code from each file name and then took the population estimate for each variable. If needed, we can repeat this process for the margin of error percentage and estimate.

In [4]:
def first_estimate(df, row_label):
    """Return the 'Estimate' value of the first row labelled `row_label`.

    Rows are matched on the 'Unnamed: 0' column. The lookup is positional
    (`.iloc[0]`), so it does not depend on where the row sits in the file.
    NaN is returned when no row matches, which guarantees that every file
    contributes exactly one value per variable and keeps the per-variable
    lists aligned with `zipcodes`.
    """
    matches = df.loc[df['Unnamed: 0'] == row_label, 'Estimate']
    return matches.iloc[0] if not matches.empty else np.nan


dct = {}
zipcodes = []
total_households = []
no_diploma = []
assoc_degree = []
bach_degree = []
grad_degree = []
broadband_households = []
dfs = []
skipped_files = []  # directory entries whose names do not match the expected pattern

for filename in files:
    # The pattern captures the four digits that follow 'ZCTA50', so the stored
    # code omits the leading '0' (e.g. 'acs5_ZCTA501001.csv' -> '1001').
    zip_code = re.search(r'ZCTA50(\d{4})\.csv', filename)
    if zip_code is None:
        # Stray entries (checkpoint folders, hidden files, ...) are reported
        # below instead of crashing the whole load with an AttributeError.
        skipped_files.append(filename)
        continue

    df = pd.read_csv(os.path.join(path, filename))

    zipcodes.append(zip_code.group(1))
    dfs.append(df)
    total_households.append(first_estimate(df, 'Estimate Households By Type Total Households'))
    no_diploma.append(first_estimate(df, 'Estimate Educational Attainment Population 25 Years And Over 9Th To 12Th Grade, No Diploma'))
    assoc_degree.append(first_estimate(df, "Estimate Educational Attainment Population 25 Years And Over Associate'S Degree"))
    bach_degree.append(first_estimate(df, "Estimate Educational Attainment Population 25 Years And Over Bachelor'S Degree"))
    grad_degree.append(first_estimate(df, "Estimate Educational Attainment Population 25 Years And Over Graduate Or Professional Degree"))
    broadband_households.append(first_estimate(df, 'Estimate Computers And Internet Use Total Households With A Broadband Internet Subscription'))

if skipped_files:
    print(f'Skipped {len(skipped_files)} file(s) with unexpected names: {skipped_files}')

# One column per collected variable; every list has one entry per loaded file.
dct['Zip Code'] = zipcodes
dct['Total Households'] = total_households
dct['Population Over 25 With No High School Diploma'] = no_diploma
dct['Population Over 25 With Associate Degree'] = assoc_degree
dct['Population Over 25 With Bachelor Degree'] = bach_degree
dct['Population Over 25 With Graduate/Professional Degree'] = grad_degree
dct['Households With Computer Access and Broadband Internet'] = broadband_households

social_char_estimates = pd.DataFrame.from_dict(dct)
social_char_estimates

Unnamed: 0,Zip Code,Total Households,Population Over 25 With No High School Diploma,Population Over 25 With Associate Degree,Population Over 25 With Bachelor Degree,Population Over 25 With Graduate/Professional Degree,Households With Computer Access and Broadband Internet
0,1001,6791.0,694.0,1394.0,2808.0,1789.0,5781.0
1,1002,9985.0,153.0,689.0,3291.0,6527.0,9273.0
2,1003,15.0,0.0,46.0,1.0,32.0,10.0
3,1005,1761.0,16.0,276.0,642.0,647.0,1617.0
4,1007,5917.0,164.0,1328.0,2573.0,2524.0,5580.0
...,...,...,...,...,...,...,...
534,2777,6752.0,779.0,1461.0,2402.0,1532.0,5959.0
535,2779,2294.0,103.0,396.0,1347.0,662.0,2094.0
536,2780,20625.0,2920.0,3032.0,5890.0,2185.0,17168.0
537,2790,6667.0,709.0,1383.0,2041.0,1695.0,5907.0
