# In this step, we'll process graduation data from the federal files
## In most cases, this is a straight "pull" from the data, but there are a few possible modifications:

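- If the sample from the most recent year is too small, pool up to 3 years of data
- For HBCUs, boost the African American/Hispanic rate by 15 percentage points (or by half the distance to 100%, whichever is less)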
- For a handful of schools, adjust down to reflect the true Noble rate of success
- Add in a handful of estimates

In [1]:
import pandas as pd
import numpy as np
import os

# Settings for this step -- update these when the input files change
work_location = 'inputs'            # folder the notebook switches into
directory_file = 'hd2017.csv'       # "hd" directory file
base_dir = 'base_dir.csv'           # directory produced by the previous step
noble_attending = '../../raw_inputs/noble_attending.csv'
gr_output = 'grad_rates.csv'        # final output of this step
# Graduation rate files, keyed by how far back they are from the latest year
gr_files = {
    'latest': 'gr2017.csv',
    'one_removed': 'gr2016.csv',
    'two_removed': 'gr2015.csv',
}

In [2]:
# Make `work_location` the working directory; the relative file names configured
# above are resolved from there for the rest of the notebook
os.chdir(work_location)

In [3]:
# Load each year's grad rate file into a dict keyed by 'latest'/'one_removed'/'two_removed'
years = ['latest','one_removed','two_removed']
# Source column -> friendlier name for the three count columns we keep
count_columns = {'GRTOTLT':'Total','GRBKAAT':'Black','GRHISPT':'Hisp'}
gr_dfs = {}
for year in years:
    year_counts = pd.read_csv(
        gr_files[year],
        index_col=['UNITID'],
        usecols=['UNITID', 'GRTYPE'] + list(count_columns),
        na_values='.',
        dtype={source_col: float for source_col in count_columns},
        encoding='latin-1',
    ).rename(columns=count_columns)
    # Combined African American + Hispanic count
    year_counts['AA_H'] = year_counts['Black'] + year_counts['Hisp']
    gr_dfs[year] = year_counts
gr_dfs['latest'].head()

Unnamed: 0_level_0,GRTYPE,Total,Black,Hisp,AA_H
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100654,2,839.0,807.0,6.0,813.0
100654,3,201.0,195.0,1.0,196.0
100654,4,336.0,313.0,5.0,318.0
100654,6,839.0,807.0,6.0,813.0
100654,8,839.0,807.0,6.0,813.0


In [4]:
# The GRTYPE column tells us what each row counts:
#   8  = adjusted cohort, bachelor's seeking (completions: 12=6yr, 13=4yr, 14=5yr; transfers=16)
#   29 = associate's seeking (completions: 30=3yr 35=2yr; transfers=33)
# A unitid is kept for a year when it has both a starting cohort row and a
# completions row for at least one of the two tracks
valid_unitids = {}
for year in years:
    df = gr_dfs[year]
    grtype = df['GRTYPE']
    ids_with = {code: set(df.index[grtype == code]) for code in (8, 12, 29, 30)}
    bachelors_ids = ids_with[8] & ids_with[12]
    associates_ids = ids_with[29] & ids_with[30]
    valid_unitids[year] = list(bachelors_ids | associates_ids)
print('%d, %d' % (len(gr_dfs['latest']), len(valid_unitids['latest'])))

54714, 3921


In [5]:
# We'll use the basic "hd" directory to form the base of the final year output
def create_year_df(df, source_df1, source_df2):
    """Apply function: fetch one college's Total/Black/Hisp/AA_H values.

    `df` is the row being processed (its name is the lookup key). The key is
    looked up in `source_df1` first and `source_df2` second; the first source
    containing it supplies the four values. If neither contains it, four NaNs
    are returned.
    """
    wanted = ['Total','Black','Hisp','AA_H']
    unitid = df.name
    for source in (source_df1, source_df2):
        if unitid in source.index:
            return source.loc[unitid][wanted]
    return [np.nan]*len(wanted)

# Read the "hd" directory once; each year filters it down to the colleges
# that have valid data for that year
hd_df = pd.read_csv(directory_file, index_col=['UNITID'],
                    usecols=['UNITID','INSTNM'],encoding='latin-1')

# (output column prefix, bachelor's GRTYPE, associate's GRTYPE)
count_sources = [('Cl', 12, 30),   # completions
                 ('St', 8, 29),    # starting (adjusted) cohorts
                 ('Xf', 16, 33)]   # transfers
groups = ['Total','Black','Hisp','AA_H']

year_dfs = {}
for year in years:
    gr_df = gr_dfs[year]
    # .copy() so the column assignments below write to an independent frame
    # rather than to a filtered slice of hd_df
    dir_df = hd_df[hd_df.index.isin(valid_unitids[year])].copy()

    # Pull the counts for each kind of row, preferring the bachelor's row
    # and falling back to the associate's row (see create_year_df)
    for prefix, bach_type, assoc_type in count_sources:
        dir_df[[prefix+'_'+group for group in groups]] = dir_df.apply(
            create_year_df, axis=1, result_type="expand",
            args=(gr_df[gr_df.GRTYPE == bach_type], gr_df[gr_df.GRTYPE == assoc_type]))

    # Finally, calculate within year stats
    for group in groups:
        dir_df['GR_'+group]=dir_df['Cl_'+group]/dir_df['St_'+group]
        dir_df['Xfr_'+group]=dir_df['Xf_'+group]/dir_df['St_'+group]
        # Standard error of the graduation proportion: sqrt(p*(1-p)/n)
        dir_df['CI_'+group]=np.sqrt(dir_df['GR_'+group]*(1-dir_df['GR_'+group])/dir_df['St_'+group])

    # A positive count over a zero cohort yields +/-inf; store those as missing.
    # replace() returns a new frame, so the result has to be kept.
    dir_df = dir_df.replace([np.inf, -np.inf], np.nan)

    year_dfs[year]=dir_df
year_dfs['latest'].head()

Unnamed: 0_level_0,INSTNM,Cl_Total,Cl_Black,Cl_Hisp,Cl_AA_H,St_Total,St_Black,St_Hisp,St_AA_H,Xf_Total,...,CI_Total,GR_Black,Xfr_Black,CI_Black,GR_Hisp,Xfr_Hisp,CI_Hisp,GR_AA_H,Xfr_AA_H,CI_AA_H
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100654,Alabama A & M University,201.0,195.0,1.0,196.0,839.0,807.0,6.0,813.0,336.0,...,0.014736,0.241636,0.387856,0.015069,0.166667,0.833333,0.152145,0.241082,0.391144,0.015002
100663,University of Alabama at Birmingham,832.0,180.0,20.0,200.0,1576.0,381.0,36.0,417.0,436.0,...,0.012575,0.472441,0.349081,0.025577,0.555556,0.277778,0.082817,0.479616,0.342926,0.024465
100690,Amridge University,1.0,0.0,1.0,1.0,9.0,2.0,1.0,3.0,8.0,...,0.104757,0.0,1.0,0.0,1.0,0.0,0.0,0.333333,0.666667,0.272166
100706,University of Alabama in Huntsville,317.0,25.0,10.0,35.0,652.0,86.0,26.0,112.0,186.0,...,0.019574,0.290698,0.418605,0.048965,0.384615,0.384615,0.095411,0.3125,0.410714,0.043798
100724,Alabama State University,304.0,286.0,9.0,295.0,1098.0,1048.0,17.0,1065.0,,...,0.013503,0.272901,,0.01376,0.529412,,0.121058,0.276995,,0.013713


In [6]:
# Keep a local copy of each single-year table for reference
for year_key, year_file in [('latest', 'grad2017.csv'),
                            ('one_removed', 'grad2016.csv'),
                            ('two_removed', 'grad2015.csv')]:
    year_dfs[year_key].to_csv(year_file, na_rep="N/A")

## The above code created three DFs for the most recent three years
## Each DF has the in year counting stats and rates for graduation
### Now we need to create a final set of statistics based on these:
- Adj6yrGrad (overall number after adjustments)
- Adj6yrAAH (African American/Hispanic number after adjustments)
- 6yrGrad (overall number, no adjustments)
- 6yrAAH (AA/H no adjustments)
- 6yrAA
- 6yrH
- Xfer
- XferAAH
- XferAA
- XferH


In [7]:
# Start from the 'base_dir' file created in the last step, keeping only the
# columns needed here (indexed by UNITID)
dir_df = pd.read_csv(
    base_dir,
    encoding='latin-1',
    usecols=['UNITID','INSTNM','Type','HBCU'],
    index_col=['UNITID'],
)
dir_df.head()

Unnamed: 0_level_0,INSTNM,HBCU,Type
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100654,Alabama A & M University,Yes,4 year
100663,University of Alabama at Birmingham,No,4 year
100690,Amridge University,No,4 year
100706,University of Alabama in Huntsville,No,4 year
100724,Alabama State University,Yes,4 year


In [8]:
def bump15(x):
    """Helper: add 0.15 to a rate, or half the remaining gap to 1 once the rate is above 0.7"""
    return x + (1-x)*.5 if x > .7 else x + .15
    
# Single-year rate columns, in the order they are unpacked in set_gradrates
_RATE_FIELDS = ['GR_Total','GR_AA_H','GR_Black','GR_Hisp',
                'Xfr_Total','Xfr_AA_H','Xfr_Black','Xfr_Hisp']
# Raw counts (completions, starting cohorts, transfers) summed when pooling years
_COUNT_FIELDS = ['Cl_Total','Cl_Black','Cl_Hisp','Cl_AA_H',
                 'St_Total','St_Black','St_Hisp','St_AA_H',
                 'Xf_Total','Xf_Black','Xf_Hisp','Xf_AA_H']
# Group suffixes in the same order as the rates in _RATE_FIELDS
_GROUPS = ['Total','AA_H','Black','Hisp']


def _pooled_rates(counts, prefix):
    """Return counts[prefix+group]/counts['St_'+group] for each group (NaN unless the cohort is > 0)"""
    return [counts[prefix+group]/counts['St_'+group] if counts['St_'+group]>0 else np.nan
            for group in _GROUPS]


def set_gradrates(df, year_dfs, latest_year=2017):
    """Apply function to decide how to set the specific values specified above

    Parameters
    ----------
    df : the row being processed; its name is the UNITID and it must carry
        the 'HBCU' and 'Type' fields.
    year_dfs : dict with keys 'latest', 'one_removed' and 'two_removed', each a
        per-year DataFrame indexed by UNITID holding the Cl_/St_/Xf_ counts
        and the GR_/Xfr_/CI_ statistics.
    latest_year : int, calendar year of the 'latest' table. It is only used to
        build the GR_Source label (e.g. '2017', '2016-2017', '2015-2017').

    Returns
    -------
    list of 11 values: the source label followed by, each rounded to two
    decimals, adjusted overall rate, adjusted AA/H rate, overall rate,
    AA/H rate, AA rate, Hispanic rate, and the transfer rates for overall,
    AA/H, AA and Hispanic.
    """
    ix = df.name
    latest = year_dfs['latest']
    one_removed = year_dfs['one_removed']
    two_removed = year_dfs['two_removed']
    hbcu_bump = False

    # First we see if there is actual data for the latest year
    if ix in latest.index:
        ty = latest.loc[ix]
        gr_source = '%d' % latest_year
        gr_6yr,gr_6yr_aah,gr_6yr_aa,gr_6yr_h,xf,xf_aah,xf_aa,xf_h = ty.reindex(_RATE_FIELDS)

        # If there's data in the latest year, we'll check how robust and add in prior years if necessary
        ci, ci_aah = ty.reindex(['CI_Total','CI_AA_H'])
        # For HBCUs with a tight enough AA/H interval, the AA/H rate is bumped
        # by the lesser of 15% or half the distance to 100% (applied below)
        if (df.HBCU == 'Yes') and (ci_aah <= 0.04):
            hbcu_bump = True
        # Otherwise, pool in prior years if the confidence intervals are too wide
        elif (ci >0.015) or (ci_aah >0.05):
            calc_data = ty.reindex(_COUNT_FIELDS)

            # The year two back is only added when the year one back exists
            if ix in one_removed.index:
                gr_source = '%d-%d' % (latest_year-1, latest_year)
                calc_data = calc_data+one_removed.loc[ix].reindex(_COUNT_FIELDS)

                if ix in two_removed.index:
                    gr_source = '%d-%d' % (latest_year-2, latest_year)
                    calc_data = calc_data+two_removed.loc[ix].reindex(_COUNT_FIELDS)

            # Recompute every rate from the pooled counts
            gr_6yr,gr_6yr_aah,gr_6yr_aa,gr_6yr_h = _pooled_rates(calc_data, 'Cl_')
            xf,xf_aah,xf_aa,xf_h = _pooled_rates(calc_data, 'Xf_')

    # If there was no data in the most recent year, we go to the prior (and stick--no need to add prior prior)
    elif ix in one_removed.index:
        gr_source = '%d' % (latest_year-1)
        gr_6yr,gr_6yr_aah,gr_6yr_aa,gr_6yr_h,xf,xf_aah,xf_aa,xf_h = one_removed.loc[ix].reindex(_RATE_FIELDS)

    # If no data in the last two years, we'll go to prior prior (and stick--no need to check CI)
    elif ix in two_removed.index:
        gr_source = '%d' % (latest_year-2)
        gr_6yr,gr_6yr_aah,gr_6yr_aa,gr_6yr_h,xf,xf_aah,xf_aa,xf_h = two_removed.loc[ix].reindex(_RATE_FIELDS)

    # No data in any of the last 3 years
    else:
        gr_source = 'N/A'
        gr_6yr,gr_6yr_aah,gr_6yr_aa,gr_6yr_h,xf,xf_aah,xf_aa,xf_h = [np.nan]*8

    # The adjusted rates start from the unadjusted ones; only the HBCU case
    # above changes the AA/H value
    adj_6yr = gr_6yr
    adj_6yr_aah = bump15(gr_6yr_aah) if hbcu_bump else gr_6yr_aah

    # 2 year schools are given credit for half of their transfer rate
    # NOTE(review): a NaN transfer rate makes the adjusted rate NaN as well --
    # confirm that is the intended outcome for schools with no transfer row
    if df['Type'] == '2 year':
        adj_6yr = adj_6yr+0.5*xf
        adj_6yr_aah = adj_6yr_aah+0.5*xf_aah

    values = [adj_6yr,adj_6yr_aah,
              gr_6yr,gr_6yr_aah,gr_6yr_aa,gr_6yr_h,
              xf,xf_aah,xf_aa,xf_h]
    return [gr_source] + [np.round(value,decimals=2) for value in values]

# Output column names, listed in the order set_gradrates returns its values
new_columns = [
    'GR_Source',
    'Adj6yrGrad', 'Adj6yrAAH',
    '6yrGrad', '6yrAAH', '6yrAA', '6yrH',
    'Xfer', 'XferAAH', 'XferAA', 'XferH',
]
# Compute the values for every college, then attach them as new columns
grad_stats = dir_df.apply(set_gradrates, axis=1, result_type="expand", args=(year_dfs,))
dir_df[new_columns] = grad_stats
dir_df.head()

Unnamed: 0_level_0,INSTNM,HBCU,Type,GR_Source,Adj6yrGrad,Adj6yrAAH,6yrGrad,6yrAAH,6yrAA,6yrH,Xfer,XferAAH,XferAA,XferH
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100654,Alabama A & M University,Yes,4 year,2017,0.24,0.39,0.24,0.24,0.24,0.17,0.4,0.39,0.39,0.83
100663,University of Alabama at Birmingham,No,4 year,2017,0.53,0.48,0.53,0.48,0.47,0.56,0.28,0.34,0.35,0.28
100690,Amridge University,No,4 year,2016-2017,0.19,0.33,0.19,0.33,0.2,1.0,0.81,0.67,0.8,0.0
100706,University of Alabama in Huntsville,No,4 year,2015-2017,0.49,0.33,0.49,0.33,0.33,0.34,0.3,0.42,0.41,0.42
100724,Alabama State University,Yes,4 year,2017,0.28,0.43,0.28,0.28,0.27,0.53,,,,


In [9]:
# Write the directory plus the new grad rate columns to `gr_output`;
# missing values are written out as the text 'N/A'
dir_df.to_csv(gr_output,na_rep='N/A')

# A few more manual steps
## These should eventually be moved to code, but here are a few more checks:
1. Add a correction for schools where we have a lot of historic results. Historically, this has meant reducing grad rates for schools by 1/3 of the difference between Noble retention and university retention (typically at only 3-4 schools)
2. Increase grad rates for partner colleges (15%)
3. Double check schools known to report oddly: Robert Morris University-Illinois specifically
4. Look for major shifts in grad rate at schools many Noble students attend and consider shifting to a 3-year average

In all of these cases, change the grad rates and the "GR_Source" to designate that a non-standard practice was followed