In [3]:
import pandas as pd
import os
import glob
import re


In [4]:
# Load every education-attainment CSV found in the working directory and stack
# them into one frame, tagging each file's rows with the year in its file name.
path = os.getcwd()
# glob returns matches in arbitrary order; sort so the concatenation order
# (and therefore which file supplies the first rows) is reproducible.
csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

dfs = []
# loop over the list of csv files
for f in csv_files:
    # os.path.basename honours the host OS path separator, whereas splitting
    # on a literal backslash only isolates the file name on Windows.
    filename = os.path.basename(f)
    # the last run of digits in the file name is used as the year
    year = re.findall(r"\d+", filename)[-1]

    # read the csv file and transpose it so the CSV's columns become rows
    df = pd.read_csv(f).T
    df['year'] = year
    dfs.append(df)

df = pd.concat(dfs)

In [5]:
# Base column positions (first group) of the three columns kept per group,
# for the male and female halves of the table respectively.
# NOTE(review): presumably high school / bachelor's / graduate, judging by the
# headers displayed further down — confirm against the raw CSV layout.
m_ind = [5, 8, 9]
f_ind = [46, 49, 50]

def get_indexes(ind, step=8, n_groups=5):
    '''
    Get the column indexes to retrieve data for male and female education attainment

    The base positions are repeated for every group, each repetition shifted
    by a further `step` columns.

    Input:
        ind (iterable of int): Initial indexes, either male or female
        step (int): column offset between consecutive groups (default 8)
        n_groups (int): total number of groups, the first one included (default 5)

    Return: a new list of column indexes: ind, ind + step, ..., ind + (n_groups - 1) * step.
            The input is never modified.
    '''
    base = list(ind)
    # i == 0 reproduces the base positions themselves
    return [j + step * i for i in range(n_groups) for j in base]

In [6]:
# Male selection: column 1 first, then the expanded positions, then the
# last column (-1), which is the 'year' column added while loading.
m_indexes = [1, *get_indexes(m_ind), -1]

In [7]:
# Female selection: column 42 first, then the expanded positions, then the
# last column (-1), which is the 'year' column added while loading.
f_indexes = [42, *get_indexes(f_ind), -1]

In [8]:
# Positional column selection for each sex from the combined frame.
female_df, male_df = (df.iloc[:, positions] for positions in (f_indexes, m_indexes))

In [9]:
# Working names for the cleaning steps; female_df / male_df stay bound to the
# unfiltered selections.
df_f_test, df_m_test = female_df, male_df

In [10]:
# Keep only the rows whose label contains 'Estimate'.
f_is_estimate = df_f_test.index.str.contains('Estimate')
m_is_estimate = df_m_test.index.str.contains('Estimate')
df_f_test = df_f_test.loc[f_is_estimate]
df_m_test = df_m_test.loc[m_is_estimate]

In [11]:
def remove_after_exclamation(s):
    '''
    Drop the first "!" and everything after it from a row name,
    i.e., change Alabama!!Estimate to Alabama

    Input (str): row label
    Return (str): the text before the first "!"; the label is returned
                  unchanged when it contains no "!"
    '''
    # partition also handles a lone trailing "!", which the pattern '!.+'
    # would leave in place because it needs at least one character after it
    return s.partition('!')[0]

In [12]:
# Strip thousands separators and cast every value to int.
# Applied column-wise (the default axis) rather than row-wise: the result is
# the same, but it takes one vectorised pass per column instead of one Python
# call per row. regex=False makes the literal-comma replacement explicit.
df_f_test = df_f_test.apply(lambda col: col.str.replace(',', "", regex=False).astype(int))
df_m_test = df_m_test.apply(lambda col: col.str.replace(',', "", regex=False).astype(int))

In [13]:
# The first row of the unfiltered female selection holds the column labels;
# use it as the header of the cleaned frame.
f_header = female_df.iloc[0]
df_f_test = df_f_test.set_axis(f_header, axis=1)
df_f_test.head()


Label (Grouping),Female:,"High school graduate, GED, or alternative",Bachelor's degree,Graduate or professional degree,"High school graduate, GED, or alternative.1",Bachelor's degree.1,Graduate or professional degree.1,"High school graduate, GED, or alternative.2",Bachelor's degree.2,Graduate or professional degree.2,"High school graduate, GED, or alternative.3",Bachelor's degree.3,Graduate or professional degree.3,"High school graduate, GED, or alternative.4",Bachelor's degree.4,Graduate or professional degree.4,2011
Alabama!!Estimate,1924729,64909,18799,861,71356,60688,27274,82644,54489,32133,214381,91312,59346,147533,28493,21819,2011
Alaska!!Estimate,257346,10911,2585,149,11713,11574,3914,9840,10513,5735,26418,16008,9987,7653,4094,2612,2011
Arizona!!Estimate,2461657,86021,21024,1025,91490,84819,31847,88814,77234,41957,189800,132571,80621,166062,62661,37865,2011
Arkansas!!Estimate,1146736,39434,9264,611,51024,31156,16241,56426,33851,13288,140476,49234,29697,100553,17035,13153,2011
California!!Estimate,14418147,511666,181329,11167,505996,648272,264850,470695,556044,319893,955290,897584,530535,654063,340495,197228,2011


In [14]:
# The first row of the unfiltered male selection holds the column labels;
# use it as the header of the cleaned frame.
m_header = male_df.iloc[0]
df_m_test = df_m_test.set_axis(m_header, axis=1)
df_m_test.head()

Label (Grouping),Male:,"High school graduate, GED, or alternative",Bachelor's degree,Graduate or professional degree,"High school graduate, GED, or alternative.1",Bachelor's degree.1,Graduate or professional degree.1,"High school graduate, GED, or alternative.2",Bachelor's degree.2,Graduate or professional degree.2,"High school graduate, GED, or alternative.3",Bachelor's degree.3,Graduate or professional degree.3,"High school graduate, GED, or alternative.4",Bachelor's degree.4,Graduate or professional degree.4,2011
Alabama!!Estimate,1753623,76434,13223,418,92241,44248,18932,89687,43725,21042,203297,84960,55710,84311,35573,30788,2011
Alaska!!Estimate,277149,16955,1297,198,18625,8145,1920,15220,7147,5071,31704,15450,10316,7200,4222,3753,2011
Arizona!!Estimate,2396824,111295,18379,958,114173,77159,26501,100965,73019,34735,186241,136595,84341,99042,75488,61785,2011
Arkansas!!Estimate,1080057,53151,6752,1138,68710,29781,9581,68945,26103,11717,142494,46910,29195,64588,20833,17045,2011
California!!Estimate,14006329,636395,123828,9922,682654,541289,196639,561569,495083,303495,954170,872195,584038,366557,346460,312220,2011


In [15]:
# Sum together all columns that share the same header label.
# DataFrame.groupby(axis=1) is deprecated since pandas 2.1; grouping the
# transposed frame by its row labels and transposing back is the equivalent.
df_f_groupby = df_f_test.T.groupby(level=0).sum().T
df_m_groupby = df_m_test.T.groupby(level=0).sum().T

In [16]:
# Sanity check: the female and male grouped frames should have the same shape.
df_f_groupby.shape, df_m_groupby.shape

((520, 5), (520, 5))

In [17]:
# Preview the grouped female columns from position 2 onward.
df_f_groupby.iloc[:, 2:]

Label (Grouping),Bachelor's degree,Graduate or professional degree,"High school graduate, GED, or alternative"
Alabama!!Estimate,253781,141433,580823
Alaska!!Estimate,44774,22397,66535
Arizona!!Estimate,378309,193315,622187
Arkansas!!Estimate,140540,72990,387913
California!!Estimate,2623724,1323673,3097710
...,...,...,...
Washington!!Estimate,647285,369016,639419
West Virginia!!Estimate,94536,62724,274374
Wisconsin!!Estimate,470096,227887,661076
Wyoming!!Estimate,38757,20850,57739


In [18]:
# Female-to-male ratio of the row totals over grouped columns 2-4.
female_total = df_f_groupby.iloc[:, 2:5].sum(axis=1)
male_total = df_m_groupby.iloc[:, 2:5].sum(axis=1)
df_edu = (female_total / male_total).to_frame()
df_edu.head()

Unnamed: 0,0
Alabama!!Estimate,1.091045
Alaska!!Estimate,0.908187
Arizona!!Estimate,0.994282
Arkansas!!Estimate,1.007538
California!!Estimate,1.008387


In [21]:
# The year column is the last one selected (position -1). Its header is
# whatever year the first loaded file carried, so look the label up instead of
# hard-coding '2011', which breaks as soon as another file is loaded first.
year_label = df_f_test.columns[-1]
df_edu['year'] = df_f_groupby[year_label]
df_edu.columns = ['education', 'year']
df_edu.head()

Unnamed: 0,education,year
Alabama!!Estimate,1.091045,2011
Alaska!!Estimate,0.908187,2011
Arizona!!Estimate,0.994282,2011
Arkansas!!Estimate,1.007538,2011
California!!Estimate,1.008387,2011


In [22]:
# tidy the row labels by passing each one through remove_after_exclamation
df_edu = df_edu.rename(index=remove_after_exclamation)

In [23]:
# move the row labels into a regular column named "state"
df_edu = df_edu.reset_index().rename(columns={'index': "state"})

In [24]:
# Confirm the column names after the index was turned into the 'state' column.
df_edu.columns

Index(['state', 'education', 'year'], dtype='object')

In [25]:
# Names of the 50 states to keep, five per line.
state_names = [
    "Alaska", "Alabama", "Arkansas", "Arizona", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana",
    "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland",
    "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
    "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
    "New Jersey", "New Mexico", "Nevada", "New York", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
    "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming",
]


In [26]:
# keep only the rows whose state appears in state_names
is_listed_state = df_edu['state'].isin(state_names)
df_edu = df_edu.loc[is_listed_state]

In [27]:
# renumber the rows 0..n-1 after filtering, discarding the old index
df_edu = df_edu.reset_index(drop=True)

In [28]:
# Display the cleaned frame (state, education ratio, year).
df_edu

Unnamed: 0,state,education,year
0,Alabama,1.091045,2011
1,Alaska,0.908187,2011
2,Arizona,0.994282,2011
3,Arkansas,1.007538,2011
4,California,1.008387,2011
...,...,...,...
495,Virginia,1.031591,2020
496,Washington,0.985292,2020
497,West Virginia,1.006788,2020
498,Wisconsin,1.032818,2020


In [30]:
import seaborn as sns

In [29]:
# Write the cleaned frame two directories up, under cleaned_data/.
# The path is built with os.path.join so it resolves on any OS; a literal
# backslash path is only a directory path on Windows.
output_path = os.path.join('..', '..', 'cleaned_data', 'clean_education.csv')
df_edu.to_csv(output_path, encoding='utf-8', index=False)