In [1]:
import pandas as pd
import os
import glob
import re


In [2]:
# Load every education-attainment CSV in the working directory and stack them
# into a single frame (one transposed table per file, tagged with its year).
path = os.getcwd()
csv_files = glob.glob(os.path.join(path, "*.csv"))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError("No CSV files found in " + path)

dfs = []
# NOTE: files are processed in the order glob returns them (not sorted on
# purpose): later cells take the header row, including the label of the year
# column, from the first frame that was loaded.
for csv_path in csv_files:
    # basename is separator-agnostic, unlike splitting on "\\" (Windows only)
    filename = os.path.basename(csv_path)

    # The year is taken to be the last run of digits in the file name
    digit_runs = re.findall(r"\d+", filename)
    if not digit_runs:
        raise ValueError("Cannot infer a year from file name: " + filename)
    year = digit_runs[-1]

    # Transpose so that the CSV's columns become rows, then tag with the year
    yearly = pd.read_csv(csv_path).T
    yearly['year'] = year
    dfs.append(yearly)

df = pd.concat(dfs)

In [3]:
# Base column positions for the male block; in the recorded output these map to
# high school graduate, bachelor's degree and graduate/professional degree.
m_ind = [5, 8, 9]
# The female block sits 41 columns further right -> [46, 49, 50]
f_ind = [pos + 41 for pos in m_ind]

def get_indexes(ind, step=8, repeats=5):
    '''
    Get the column indexes to retrieve data for male or female education attainment.

    The base positions are repeated for `repeats` consecutive column groups,
    each group starting `step` columns after the previous one.

    Input:
        ind (iterable of int): initial positions, either male or female
        step (int): distance between the same position in adjacent groups
            (default 8)
        repeats (int): total number of groups, including the initial one
            (default 5)

    Return: a new list of column indexes, group by group; `ind` is not modified
    '''
    # list() accepts any iterable and leaves the caller's sequence untouched
    base = list(ind)
    return [pos + step * group for group in range(repeats) for pos in base]

In [4]:
# Column 1 (shown as "Male:" in the recorded output) goes first, then the
# expanded education positions, and finally -1 for the trailing year column.
m_indexes = [1, *get_indexes(m_ind), -1]

In [5]:
# Column 42 (shown as "Female:" in the recorded output) goes first, then the
# expanded education positions, and finally -1 for the trailing year column.
f_indexes = [42, *get_indexes(f_ind), -1]

In [6]:
# Pick the selected columns by position, once per sex
male_df = df.iloc[:, m_indexes]
female_df = df.iloc[:, f_indexes]

In [7]:
# Working names for the cleaning steps below (plain rebinding, not copies)
df_f_test, df_m_test = female_df, male_df

In [8]:
# Keep only the rows whose label contains 'Estimate'
f_is_estimate = df_f_test.index.str.contains('Estimate')
m_is_estimate = df_m_test.index.str.contains('Estimate')
df_f_test = df_f_test[f_is_estimate]
df_m_test = df_m_test[m_is_estimate]

In [9]:
def remove_after_exclamation(s):
    '''
    Cut a row name at its first "!", i.e., change Alabama!!Estimate to Alabama

    Input (str): a row label
    Return: the text before the first "!" (on each line of the string);
            a label without any "!" is returned unchanged
    '''
    # ".*" rather than ".+" so that a lone trailing "!" is stripped as well
    return re.sub(r'!.*', '', s)

In [10]:
def _commas_to_int(row):
    '''Strip thousands separators from one row of strings and cast it to int.'''
    return row.str.replace(',', "").astype(int)

# Applied row by row (axis=1) to both frames; every cell, the year included,
# must still be a string at this point.
df_f_test = df_f_test.apply(_commas_to_int, axis=1)
df_m_test = df_m_test.apply(_commas_to_int, axis=1)

In [11]:
# Row 0 of the selection (the "Label (Grouping)" row in the recorded output)
# supplies the column labels for the female estimates.
f_header = female_df.iloc[0]
df_f_test.columns = f_header
df_f_test.head()


Label (Grouping),Female:,High school graduate (includes equivalency),Bachelor's degree,Graduate or professional degree,High school graduate (includes equivalency).1,Bachelor's degree.1,Graduate or professional degree.1,High school graduate (includes equivalency).2,Bachelor's degree.2,Graduate or professional degree.2,High school graduate (includes equivalency).3,Bachelor's degree.3,Graduate or professional degree.3,High school graduate (includes equivalency).4,Bachelor's degree.4,Graduate or professional degree.4,2020
Alabama!!Estimate,1992175,69800,19768,1782,77515,70099,30935,70208,62809,39699,193390,109335,71903,174091,51128,39805,2020
Alaska!!Estimate,263271,11276,2205,163,14592,13159,4653,9623,10790,7329,21797,17955,11658,12527,7315,6054,2020
Arizona!!Estimate,2805863,106396,33650,2114,103928,111405,43754,85798,94769,59629,195738,164064,102201,191143,99812,75440,2020
Arkansas!!Estimate,1190236,42558,13544,734,52406,40411,18641,47271,35915,23091,124807,60344,33049,109236,29873,21067,2020
California!!Estimate,15405196,507125,228087,16832,545062,841760,371150,457152,623081,427710,957760,1009691,620443,705859,525932,342893,2020


In [12]:
# Row 0 of the selection (the "Label (Grouping)" row in the recorded output)
# supplies the column labels for the male estimates.
m_header = male_df.iloc[0]
df_m_test.columns = m_header
df_m_test.head()

Label (Grouping),Male:,High school graduate (includes equivalency),Bachelor's degree,Graduate or professional degree,High school graduate (includes equivalency).1,Bachelor's degree.1,Graduate or professional degree.1,High school graduate (includes equivalency).2,Bachelor's degree.2,Graduate or professional degree.2,High school graduate (includes equivalency).3,Bachelor's degree.3,Graduate or professional degree.3,High school graduate (includes equivalency).4,Bachelor's degree.4,Graduate or professional degree.4,2020
Alabama!!Estimate,1808099,85209,13617,1161,98887,55780,19468,87413,47428,27220,203895,93706,57291,108943,56389,44049,2020
Alaska!!Estimate,290360,18420,1765,166,22550,9682,2722,14903,8388,5030,30500,14895,10849,10994,7977,6511,2020
Arizona!!Estimate,2728556,129369,25016,1917,141838,98192,34858,105988,82275,50733,201828,149744,94893,125727,110967,95962,2020
Arkansas!!Estimate,1118805,55446,9618,701,72873,32978,12752,63631,29273,15411,140303,47408,27380,76285,32446,22656,2020
California!!Estimate,14984186,657609,169188,12861,749072,757796,296247,562755,572076,363978,1010975,936950,626884,442750,497541,442741,2020


In [13]:
# Sum the columns that share a label (the same education level repeated across
# column groups). DataFrame.groupby(axis=1) is deprecated in pandas >= 2.1, so
# group the transposed frame by its row labels and transpose back instead.
df_f_groupby = df_f_test.T.groupby(level=0).sum().T
df_m_groupby = df_m_test.T.groupby(level=0).sum().T

In [14]:
# Sanity check: display the (rows, columns) shape of both grouped frames
df_f_groupby.shape, df_m_groupby.shape

((520, 5), (520, 5))

In [15]:
# Female-to-male ratio per education level, rounded to 3 decimals.
# Both frames are sliced with the same positional range: the grouped frames
# have 5 columns, and positions 2-4 are the three education levels shown in
# the recorded output.
edu_cols = slice(2, 5)
df_edu = (df_f_groupby.iloc[:, edu_cols] / df_m_groupby.iloc[:, edu_cols]).round(3)

Label (Grouping),Bachelor's degree,Graduate or professional degree,High school graduate (includes equivalency)
Alabama!!Estimate,1.173,1.234,1.001
Alaska!!Estimate,1.204,1.181,0.717
Arizona!!Estimate,1.080,1.017,0.969
Arkansas!!Estimate,1.187,1.224,0.921
California!!Estimate,1.101,1.021,0.927
...,...,...,...
Washington!!Estimate,1.010,0.931,0.972
West Virginia!!Estimate,1.091,1.111,0.997
Wisconsin!!Estimate,1.157,0.998,0.961
Wyoming!!Estimate,1.104,1.046,0.843


In [16]:
# The year column is labelled with whatever the header row held in its last
# (year) position, i.e. the year of the first file that was loaded. Look that
# label up rather than hard-coding '2020', which only works for one load order.
year_label = female_df.iloc[0, -1]
df_edu['year'] = df_f_groupby[year_label]
df_edu.tail()

Label (Grouping),Bachelor's degree,Graduate or professional degree,High school graduate (includes equivalency),year
Washington!!Estimate,1.009735,0.931379,0.971722,2011
West Virginia!!Estimate,1.091435,1.111455,0.997463,2011
Wisconsin!!Estimate,1.156701,0.997529,0.96052,2011
Wyoming!!Estimate,1.103849,1.045777,0.843445,2011
Puerto Rico!!Estimate,1.664651,1.464186,0.942815,2011


In [17]:
# Reduce each row label to the part before its first "!"
state_labels = df_edu.index.map(remove_after_exclamation)
df_edu.index = state_labels

In [18]:
# Move the row labels into a regular column and call it "state"
df_edu = df_edu.reset_index().rename(columns={'index': "state"})

In [19]:
# Display the column labels; in the recorded output the education labels still
# carry their leading whitespace.
df_edu.columns

Index(['state', '            Bachelor's degree',
       '            Graduate or professional degree',
       '            High school graduate (includes equivalency)', 'year'],
      dtype='object', name='Label (Grouping)')

In [20]:
# Names kept by the state filter further down. The list holds the 50 states
# only: "District of Columbia" and "Puerto Rico" are not in it.
state_names = [
    "Alaska", "Alabama", "Arkansas", "Arizona", "California",
    "Colorado", "Connecticut", "Delaware", "Florida", "Georgia",
    "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana",
    "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland",
    "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi",
    "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire",
    "New Jersey", "New Mexico", "Nevada", "New York", "Ohio",
    "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina",
    "South Dakota", "Tennessee", "Texas", "Utah", "Virginia",
    "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming",
]


In [21]:
# Confirm the number of entries in state_names (recorded output: 50)
len(state_names)

50

In [22]:
# Keep only the rows whose "state" value appears in state_names
in_state_list = df_edu['state'].isin(state_names)
df_edu = df_edu[in_state_list]

In [23]:
# Write the result without the row index. The file name has no ".csv"
# extension, so it is not matched by the "*.csv" glob that loads the inputs.
output_name = 'education'
df_edu.to_csv(output_name, encoding='utf-8', index=False)