## Load Data

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Files to Load
race = "Race.csv"
educ = "Education_age_Sex.csv"
social = "Social_characteristics.csv"
income = "Household_Income_Details.csv"
hispanic = "Hispanic_Population.csv"
gender = "Population_age_sex.csv"


def load_census_table(path, encoding="ISO-8859-1"):
    """Read one CSV file and return it without its first row.

    The first row (index label 0) is discarded because, per the original
    "Drop Column Labels" step, it carries column labels rather than data.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    encoding : str, default "ISO-8859-1"
        Text encoding passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the remaining rows keep their original index labels,
        so the index is not reset.
    """
    # drop() returns a new frame, so nothing is mutated in place.
    return pd.read_csv(path, encoding=encoding).drop(index=0)


# Read Data (the label row is removed inside the helper)
df_race = load_census_table(race)
df_educ = load_census_table(educ)
df_social = load_census_table(social)
df_income = load_census_table(income)
df_hispanic = load_census_table(hispanic)
df_gender = load_census_table(gender)

#df_race.head()
#df_educ.head()
#df_social.head()
#df_income.head()
#df_hispanic.head()

## Clean Data

### 1) Education

In [2]:
# Education series: source column -> readable name.
# The mapping is the single source of truth for both the selection and the
# rename; its insertion order is the column order of the resulting frame.
EDUC_COLUMNS = {
    'GEO.id': 'GEO_id',
    'GEO.id2': 'GEO_id2',
    'GEO.display-label': 'GEO_label',
    'HC01_EST_VC02': 'pop_1824',
    'HC01_EST_VC03': 'edu_1824_lessthan_high_school',
    'HC01_EST_VC04': 'edu_1824_high_school',
    'HC01_EST_VC05': 'edu_1824_some_college',
    'HC01_EST_VC06': 'edu_1824_bachelor',
    'HC01_EST_VC08': 'pop_25',
    'HC01_EST_VC09': 'edu_25_lessthan9thgrade',
    'HC01_EST_VC10': 'edu_25_high_school_no_diploma',
    'HC01_EST_VC11': 'edu_25_high_school',
    'HC01_EST_VC12': 'edu_25_some_college',
    'HC01_EST_VC13': 'edu_25_associate',
    'HC01_EST_VC14': 'edu_25_bachelor',
    'HC01_EST_VC15': 'edu_25_grad_school',
    'HC01_EST_VC42': 'edu_white_high_school',
    'HC01_EST_VC43': 'edu_white_bachelor',
    'HC01_EST_VC46': 'edu_black_high_school',
    'HC01_EST_VC47': 'edu_black_bachelor',
    'HC01_EST_VC50': 'edu_native_high_school',
    'HC01_EST_VC51': 'edu_native_bachelor',
    'HC01_EST_VC54': 'edu_asian_high_school',
    'HC01_EST_VC55': 'edu_asian_bachelor',
    'HC01_EST_VC70': 'edu_hispanic_high_school',
    'HC01_EST_VC71': 'edu_hispanic_bachelor',
}

# Keep the selected columns and rename them in one chained expression.
# rename() returns a new frame, which avoids renaming in place on a frame
# derived by column selection (that pattern can raise SettingWithCopyWarning,
# depending on the pandas version).
df_keep_educ = df_educ[list(EDUC_COLUMNS)].rename(columns=EDUC_COLUMNS)

#df_keep_educ.head()

### 2) Race

In [3]:
# Race series: source column -> readable name.
# The mapping drives both the selection and the rename; its insertion order
# is the column order of the resulting frame.
RACE_COLUMNS = {
    'GEO.id': 'GEO_id',
    'GEO.id2': 'GEO_id2',
    'GEO.display-label': 'GEO_label',
    'HD01_VD03': 'pop_white',
    'HD01_VD04': 'pop_black',
    'HD01_VD05': 'pop_native',
    'HD01_VD06': 'pop_asian',
    'HD01_VD07': 'pop_pacific',
    'HD01_VD08': 'pop_other',
}

# Keep the selected columns and rename them in one chained expression.
# rename() returns a new frame instead of renaming in place on a frame
# derived by column selection (which can raise SettingWithCopyWarning,
# depending on the pandas version).
df_keep_race = df_race[list(RACE_COLUMNS)].rename(columns=RACE_COLUMNS)

#df_keep_race.head()

### 3) Hispanic

In [4]:
# Hispanic population: source column -> readable name.
# The mapping drives both the selection and the rename; its insertion order
# is the column order of the resulting frame.
HISPANIC_COLUMNS = {
    'GEO.id': 'GEO_id',
    'GEO.id2': 'GEO_id2',
    'GEO.display-label': 'GEO_label',
    'HD01_VD01': 'pop_total',
    'HD01_VD02': 'pop_not_hispanic',
    'HD01_VD12': 'pop_hispanic',
}

# Keep the selected columns and rename them in one chained expression.
# rename() returns a new frame instead of renaming in place on a frame
# derived by column selection (which can raise SettingWithCopyWarning,
# depending on the pandas version).
df_keep_hispanic = df_hispanic[list(HISPANIC_COLUMNS)].rename(columns=HISPANIC_COLUMNS)

#df_keep_hispanic.head()

### 3) Gender

In [5]:
# Gender & age series: source column -> readable name.
# The mapping drives both the selection and the rename; its insertion order
# is the column order of the resulting frame.
GENDER_COLUMNS = {
    'GEO.id': 'GEO_id',
    'GEO.id2': 'GEO_id2',
    'GEO.display-label': 'GEO_label',
    'HD01_VD06': 'pop_male',
    'HD01_VD25': 'pop_female',
}

# Keep the selected columns and rename them in one chained expression.
# rename() returns a new frame instead of renaming in place on a frame
# derived by column selection (which can raise SettingWithCopyWarning,
# depending on the pandas version).
df_keep_gender = df_gender[list(GENDER_COLUMNS)].rename(columns=GENDER_COLUMNS)

#df_keep_gender.head()      

### 4) Merge Race, Hispanic & Gender

In [6]:
# Combine the race, Hispanic and gender frames into one population frame.
# Both joins are left joins on the shared geography columns, so every row of
# the race frame is kept.
geo_keys = ['GEO_id', 'GEO_id2', 'GEO_label']
df_keep_pop = (
    df_keep_race
    .merge(df_keep_hispanic, how='left', on=geo_keys)
    .merge(df_keep_gender, how='left', on=geo_keys)
)

#df_keep_pop.head()

### 5) Computer & Internet Access

In [7]:
# Computers and internet use: source column -> readable name.
# The mapping drives both the selection and the rename; its insertion order
# is the column order of the resulting frame.
SOCIAL_COLUMNS = {
    'GEO.id': 'GEO_id',
    'GEO.id2': 'GEO_id2',
    'GEO.display-label': 'GEO_label',
    'HC01_VC216': 'hh_access',
    'HC03_VC216': 'hh_access_pct',
    'HC01_VC217': 'hh_computer',
    'HC03_VC217': 'hh_computer_pct',
    'HC01_VC218': 'hh_broadband',
    'HC03_VC218': 'hh_broadband_pct',
}

# Keep the selected columns and rename them in one chained expression.
# rename() returns a new frame instead of renaming in place on a frame
# derived by column selection (which can raise SettingWithCopyWarning,
# depending on the pandas version).
df_keep_social = df_social[list(SOCIAL_COLUMNS)].rename(columns=SOCIAL_COLUMNS)

#df_keep_social.head()

### 6) Household Income Breakdown

In [8]:
# Household income breakdown: source column -> readable name.
# The mapping drives both the selection and the rename; its insertion order
# is the column order of the resulting frame.
INCOME_COLUMNS = {
    'GEO.id': 'GEO_id',
    'GEO.id2': 'GEO_id2',
    'GEO.display-label': 'GEO_label',
    'HC01_EST_VC02': 'hh_inc_less10k',
    'HC01_EST_VC03': 'hh_inc_10_14k',
    'HC01_EST_VC04': 'hh_inc_15_24k',
    'HC01_EST_VC05': 'hh_inc_25_34k',
    'HC01_EST_VC06': 'hh_inc_35_49k',
    'HC01_EST_VC07': 'hh_inc_50_74k',
    'HC01_EST_VC08': 'hh_inc_75_99k',
    'HC01_EST_VC09': 'hh_inc_100_149k',
    'HC01_EST_VC10': 'hh_inc_150_199k',
    'HC01_EST_VC11': 'hh_inc_200k',
    'HC01_EST_VC13': 'hh_inc_median',
    'HC01_EST_VC15': 'hh_inc_mean',
}

# Keep the selected columns and rename them in one chained expression.
# rename() returns a new frame instead of renaming in place on a frame
# derived by column selection (which can raise SettingWithCopyWarning,
# depending on the pandas version).
df_keep_income = df_income[list(INCOME_COLUMNS)].rename(columns=INCOME_COLUMNS)

#df_keep_income.head()

## Inspect Data

In [9]:
# Number of distinct GEO_label values (counties) in the education frame;
# dropna=False keeps a missing label counted as one value, as unique() does.
df_keep_educ['GEO_label'].nunique(dropna=False)

3142

In [10]:
# Number of distinct GEO_label values (counties) in the race frame;
# dropna=False keeps a missing label counted as one value, as unique() does.
df_keep_race['GEO_label'].nunique(dropna=False)

3142

In [11]:
# Number of distinct GEO_label values (counties) in the Hispanic frame;
# dropna=False keeps a missing label counted as one value, as unique() does.
df_keep_hispanic['GEO_label'].nunique(dropna=False)

3142

In [12]:
# Number of distinct GEO_label values (counties) in the gender frame;
# dropna=False keeps a missing label counted as one value, as unique() does.
df_keep_gender['GEO_label'].nunique(dropna=False)

3142

In [13]:
# Number of distinct GEO_label values (counties) in the merged population frame;
# dropna=False keeps a missing label counted as one value, as unique() does.
df_keep_pop['GEO_label'].nunique(dropna=False)

3142

In [14]:
# Number of distinct GEO_label values (counties) in the computer & internet frame;
# dropna=False keeps a missing label counted as one value, as unique() does.
df_keep_social['GEO_label'].nunique(dropna=False)

3142

In [15]:
# Number of distinct GEO_label values (counties) in the income frame;
# dropna=False keeps a missing label counted as one value, as unique() does.
df_keep_income['GEO_label'].nunique(dropna=False)

3142

In [16]:
# Percentage of missing values for each column
#df_keep_race.isnull().mean().round(4) * 100

## Export to CSV

In [None]:
# Write the cleaned education frame and the merged population frame to CSV,
# leaving the DataFrame index out of the files.
# NOTE(review): "1_Eductation.csv" looks like a misspelling of "Education"; the
# name is left unchanged because other code may read this exact path — confirm
# before renaming.
# NOTE(review): df_keep_social and df_keep_income are not exported in the lines
# visible here — confirm whether they are written elsewhere.
df_keep_educ.to_csv("1_Eductation.csv", index=False)
df_keep_pop.to_csv("2_Population.csv", index=False)