In [1]:
import pandas as pd
import os
import glob
import re

In [2]:
# Access all education attainment csv and combine into one.
# Each input file is expected to carry its survey year as the last run of
# digits in its file name.
path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "*.csv"))

dfs = []
# loop over the list of csv files
for f in csv_files:
    # basename() strips the directory with the running platform's separator;
    # splitting on "\\" only worked on Windows.
    filename = os.path.basename(f)
    digits = re.findall(r"\d+", filename)
    if not digits:
        # No year in the name, e.g. clean_education.csv written by the final
        # cell into this same folder. Skip it instead of failing with
        # IndexError when the notebook is re-run.
        continue
    year = digits[-1]

    # read the csv file and transpose it so every original column becomes a row
    yearly = pd.read_csv(f).T
    yearly['year'] = year  # kept as a string, exactly as parsed from the name
    dfs.append(yearly)

# NOTE: glob order is intentionally left untouched; the header row used by
# later cells comes from whichever file is listed first.
df = pd.concat(dfs)

In [3]:
# Column positions (after transposing the csv) of the first occurrence of
# "High school graduate", "Bachelor's degree" and
# "Graduate or professional degree" in the male and female sections.
m_ind = [5, 8, 9]
f_ind = [46, 49, 50]

def get_indexes(ind, step=8, repeats=4):
    '''
    Get the column indexes to retrieve data for male and female education attainment

    The starting indexes are repeated at a fixed offset: the result contains
    `ind` itself followed by `ind` shifted by step, 2*step, ..., repeats*step.
    (Presumably each shift is the next age bracket of the table; confirm
    against the raw csv layout.)

    Input (lst): Initial list of indexes, either male or female
    step (int): distance between two consecutive repetitions (default 8)
    repeats (int): number of shifted copies appended after `ind` (default 4)

    Return: a new list of column indexes; `ind` is not modified
    '''
    base = list(ind)  # materialise once so any iterable can be reused per shift
    return [j + step * i for i in range(repeats + 1) for j in base]

In [4]:
# Male selection: column 1 (the "Male:" total) first, then the attainment
# columns, and finally -1 (the trailing year column).
m_indexes = [1] + get_indexes(m_ind) + [-1]

In [5]:
# Female selection: column 42 (the "Female:" total) first, then the attainment
# columns, and finally -1 (the trailing year column).
f_indexes = [42] + get_indexes(f_ind) + [-1]

In [6]:
# Slice the combined frame by position into one frame per gender.
male_df = df.iloc[:, m_indexes]
female_df = df.iloc[:, f_indexes]

In [7]:
# Display the selected male columns; rows are still strings and include the
# label and margin-of-error rows at this point.
male_df

Unnamed: 0,1,5,8,9,13,16,17,21,24,25,29,32,33,37,40,41,year
Label (Grouping),Male:,High school graduate (includes equ...,Bachelor's degree,Graduate or professional degree,High school graduate (includes equ...,Bachelor's degree,Graduate or professional degree,High school graduate (includes equ...,Bachelor's degree,Graduate or professional degree,High school graduate (includes equ...,Bachelor's degree,Graduate or professional degree,High school graduate (includes equ...,Bachelor's degree,Graduate or professional degree,2018
Alabama!!Estimate,1799902,90367,11509,523,92048,55678,18900,93912,44339,24387,220514,91475,57247,103337,51090,45746,2018
Alabama!!Margin of Error,"±2,714","±4,298","±1,877",±368,"±5,083","±4,070","±2,257","±4,952","±3,452","±2,391","±6,551","±4,457","±3,823","±4,349","±2,654","±2,882",2018
Alaska!!Estimate,291715,17448,2406,0,22293,8485,3242,16983,8592,4459,30842,15083,10293,10553,6930,6300,2018
Alaska!!Margin of Error,±933,"±2,009",±896,±153,"±2,424","±1,534",±854,"±1,966","±1,202",±958,"±2,245","±1,636","±1,506","±1,480","±1,033",±993,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wisconsin!!Margin of Error,"±2,592","±4,480","±2,872",±711,"±4,011","±4,141","±2,674","±3,880","±3,982","±2,275","±5,407","±3,957","±2,896","±3,442","±2,983","±2,458",2016
Wyoming!!Estimate,226138,9675,1512,125,13716,8498,1478,11282,6158,3658,26724,10992,8080,11615,7082,6167,2016
Wyoming!!Margin of Error,"±1,675","±1,472",±717,±219,"±1,874","±1,220",±522,"±1,645","±1,399",±948,"±2,285","±1,488","±1,195","±1,271","±1,312","±1,028",2016
Puerto Rico!!Estimate,1266884,54728,11157,120,64899,36360,9336,65779,33861,14056,128551,58544,29618,72481,27369,17335,2016


In [8]:
# Working names for the cleaning steps below; female_df / male_df stay bound
# to the unfiltered selections.
df_f_test, df_m_test = female_df, male_df

In [9]:
# Keep only rows whose label contains "Estimate"; this drops the
# "Margin of Error" rows and the "Label (Grouping)" header row.
f_is_estimate = df_f_test.index.str.contains('Estimate')
m_is_estimate = df_m_test.index.str.contains('Estimate')
df_f_test = df_f_test[f_is_estimate]
df_m_test = df_m_test[m_is_estimate]

In [10]:
def remove_after_exclamation(s):
    '''
    Drop the first "!" in a row name and everything after it,
    i.e., change Alabama!!Estimate to Alabama

    Input (str): row label such as "Alabama!!Estimate"

    Return (str): the text before the first "!"; the input is returned
    unchanged when it contains no "!"
    '''
    # partition() also strips a lone trailing "!", which the former pattern
    # '!.+' left in place because it required at least one following character.
    return s.partition('!')[0]

In [11]:
def _row_to_int(row):
    '''Strip thousands separators from a row of strings and cast it to int.'''
    return row.str.replace(',', "").astype(int)

# number from str to int, applied row by row (the year column is converted too)
df_f_test = df_f_test.apply(_row_to_int, axis=1)
df_m_test = df_m_test.apply(_row_to_int, axis=1)

In [12]:
# Use the first row of the unfiltered female frame (the "Label (Grouping)"
# row) as column labels. Its last cell holds that file's year value, so the
# year column ends up labelled with it (2018 in the output shown).
f_header = female_df.iloc[0]
df_f_test.columns = f_header
df_f_test.head()

Label (Grouping),Female:,High school graduate (includes equivalency),Bachelor's degree,Graduate or professional degree,High school graduate (includes equivalency).1,Bachelor's degree.1,Graduate or professional degree.1,High school graduate (includes equivalency).2,Bachelor's degree.2,Graduate or professional degree.2,High school graduate (includes equivalency).3,Bachelor's degree.3,Graduate or professional degree.3,High school graduate (includes equivalency).4,Bachelor's degree.4,Graduate or professional degree.4,2018
Alabama!!Estimate,1980784,70944,18343,832,82245,64862,28408,71769,60320,37318,196065,115011,69283,170092,45244,35396,2018
Alaska!!Estimate,262899,10444,2191,182,11931,11120,5251,9132,11242,6728,21607,17693,11123,9791,7299,4844,2018
Arizona!!Estimate,2728146,104771,26016,2542,100451,106200,41646,85225,93346,52804,196877,153462,96966,189591,88540,69086,2018
Arkansas!!Estimate,1181838,44810,12489,1193,49590,38666,19055,49062,34936,21323,125761,57148,34885,104462,28651,18350,2018
California!!Estimate,15461706,515217,217370,14842,560087,810562,357159,477879,609460,404584,999730,1007824,608985,698818,495572,319610,2018


In [13]:
# Use the first row of the unfiltered male frame (the "Label (Grouping)"
# row) as column labels. Its last cell holds that file's year value, so the
# year column ends up labelled with it (2018 in the output shown).
m_header = male_df.iloc[0]
df_m_test.columns = m_header
df_m_test.head()

Label (Grouping),Male:,High school graduate (includes equivalency),Bachelor's degree,Graduate or professional degree,High school graduate (includes equivalency).1,Bachelor's degree.1,Graduate or professional degree.1,High school graduate (includes equivalency).2,Bachelor's degree.2,Graduate or professional degree.2,High school graduate (includes equivalency).3,Bachelor's degree.3,Graduate or professional degree.3,High school graduate (includes equivalency).4,Bachelor's degree.4,Graduate or professional degree.4,2018
Alabama!!Estimate,1799902,90367,11509,523,92048,55678,18900,93912,44339,24387,220514,91475,57247,103337,51090,45746,2018
Alaska!!Estimate,291715,17448,2406,0,22293,8485,3242,16983,8592,4459,30842,15083,10293,10553,6930,6300,2018
Arizona!!Estimate,2655396,121360,21091,1513,135477,94904,32449,103217,77701,45666,202546,143868,91055,123109,105041,90358,2018
Arkansas!!Estimate,1114008,53799,8308,591,72454,32389,11905,62312,29445,14935,145250,48547,25729,74394,31512,22638,2018
California!!Estimate,15017561,658875,158991,11825,742824,730435,283779,583043,551990,343902,1049396,930440,612638,429405,480166,429141,2018


In [14]:
# Sum columns that share the same label, so each attainment level becomes a
# single column. groupby(axis=1) is deprecated since pandas 2.1; transposing,
# grouping the rows and transposing back is the recommended equivalent and
# also works on older versions.
df_f_groupby = df_f_test.T.groupby(level=0).sum().T
df_m_groupby = df_m_test.T.groupby(level=0).sum().T

In [15]:
# Sanity check: both grouped frames should have the same (rows, columns) shape.
df_f_groupby.shape, df_m_groupby.shape

((520, 5), (520, 5))

In [16]:
# Convert three indicators into one that holistically captures the education
# gender gap, including high school and higher education: female total
# divided by male total, per row.
# NOTE(review): columns are picked by position (2:5), presumably the three
# attainment columns; confirm against df_f_groupby.columns.
f_attained = df_f_groupby.iloc[:, 2:5].sum(axis=1)
m_attained = df_m_groupby.iloc[:, 2:5].sum(axis=1)
df_edu = pd.DataFrame(f_attained / m_attained)
df_edu.head()

Unnamed: 0,0
Alabama!!Estimate,1.06499
Alaska!!Estimate,0.857659
Arizona!!Estimate,1.013077
Arkansas!!Estimate,1.009733
California!!Estimate,1.012611


In [17]:
# Attach the survey year to every ratio row. The year is the last selected
# column of df_f_test; taking it by position avoids relying on a column
# labelled '2018', which only exists when the first csv read is the 2018 file.
# Rows are in the same order as df_edu, so the raw values are assigned as-is.
df_edu['year'] = df_f_test.iloc[:, -1].to_numpy()
df_edu.columns = ['education','year']
df_edu.head()

Unnamed: 0,education,year
Alabama!!Estimate,1.06499,2018
Alaska!!Estimate,0.857659,2018
Arizona!!Estimate,1.013077,2018
Arkansas!!Estimate,1.009733,2018
California!!Estimate,1.012611,2018


In [18]:
# Reduce each row label to the bare state name by running it through
# remove_after_exclamation.
df_edu.index = [remove_after_exclamation(label) for label in df_edu.index]

In [19]:
# Move the index into a regular column and name that column "state".
df_edu = df_edu.reset_index().rename(columns={'index': "state"})

In [20]:
# Confirm the frame now has the columns state, education and year.
df_edu.columns

Index(['state', 'education', 'year'], dtype='object')

In [21]:
# Names of the 50 U.S. states used to filter the rows (Puerto Rico and any
# other areas in the data are not listed).
state_names = [
    "Alaska", "Alabama", "Arkansas", "Arizona", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana",
    "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland",
    "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
    "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
    "New Jersey", "New Mexico", "Nevada", "New York", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
    "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming",
]


In [22]:
# Keep only the rows whose state is one of the 50 names in state_names.
is_listed_state = df_edu['state'].isin(state_names)
df_edu = df_edu[is_listed_state]

In [23]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
df_edu = df_edu.reset_index(drop=True)

In [24]:
# Display a copy ordered alphabetically by state; df_edu itself is not changed.
df_edu.sort_values(by='state')

Unnamed: 0,state,education,year
0,Alabama,1.064990,2018
350,Alabama,1.062572,2015
300,Alabama,1.068706,2014
250,Alabama,1.091045,2011
200,Alabama,1.060205,2013
...,...,...,...
149,Wyoming,0.913917,2020
99,Wyoming,0.912927,2019
49,Wyoming,0.921088,2018
449,Wyoming,0.921088,2017


In [25]:
# Example: check the education gender gap trend for a single state.
# NOTE(review): the displayed 2017 and 2018 values are identical (0.921088);
# verify that the 2017 input file is not a copy of the 2018 one.
is_wyoming = df_edu['state'].eq('Wyoming')
df_edu[is_wyoming]

Unnamed: 0,state,education,year
49,Wyoming,0.921088,2018
99,Wyoming,0.912927,2019
149,Wyoming,0.913917,2020
199,Wyoming,0.885097,2012
249,Wyoming,0.879804,2013
299,Wyoming,0.93798,2011
349,Wyoming,0.908773,2014
399,Wyoming,0.90438,2015
449,Wyoming,0.921088,2017
499,Wyoming,0.905729,2016


In [26]:
# Save the cleaned table without the row index.
# NOTE: this lands in the working directory, the same folder the loading cell
# scans for "*.csv".
df_edu.to_csv('clean_education.csv', index=False, encoding='utf-8')