## Here, we're going to combine multiple files

### This file is a quick port of the prior year's process, which was automated with a script and YAML file

There are filenames in the yaml file that __need to be updated based on this year's files__ (specifically by incrementing the year)

In [1]:
# The cells here are mostly just transcriptions of the merge_files.py script in the root directory of this repo

import yaml
import pandas as pd
import numpy as np
import os

SETTINGS_FILE = '../raw_inputs/inputs.yaml'

In [2]:
#os.chdir('..')

In [3]:
####### Function definitions ###############
### converter functions ###
def safe2int(x):
    '''converts to int if possible, otherwise is a string'''
    try:
        return int(x)
    except:
        return x

def safe2f(x):
    '''converts to float if possible, otherwise is a string'''
    try:
        return float(x)
    except:
        return x

def p2f(x):
    '''converts percent string to float number'''
    return None if x == 'N/A' else float(x.strip('%'))/100

# This dictionary lets the yaml input translate to one of the functions above
CONVERTERS = {'safe2int': safe2int,
              'safe2f': safe2f,
              'p2f': p2f,
              }

def safe_divide(x):
    '''Divides the first element by the second'''
    if np.isnan(x[0]) or np.isnan(x[1]) or x[1] == 0:
        return np.nan
    else:
        return x[0]/x[1]


In [4]:
### Substantive functions
### These are used for processing input files based on settings in the yaml file

def read_input_file(filename, params):
    """Reads an input file into a DataFrame based on the provided parameters"""
    converters = params['converters'].copy()
    for key, value in converters.items():
        converters[key] = CONVERTERS[value]  # now references actual function
    df = pd.read_csv(filename,
                       index_col=params['index_col'],
                       encoding=params['encoding'],
                       na_values=params['na_values'],
                       converters=converters)
    if 'reduce' in params: # reduce table to those matching this parameter
        var, value = params['reduce'].popitem()
        return df.loc[df[var] == value]
    else:
        return df

def insert_calc_column(df, input_df, ix, final_label, specs):
    '''Processes the insertion of a non-direct pull column into df using
    input_df as the raw table read from a file'''
    if specs['type'] == 'direct test':
        df.insert(ix, final_label, input_df[specs['source']])
    elif specs['type'] == 'remap':
        df.insert(ix, final_label, input_df[specs['source']].replace(
            specs['mapping']))
    elif specs['type'] == 'division':
        operands = specs['source'].split(sep='/')
        new_col = input_df[operands].apply(safe_divide, axis=1)
        df.insert(ix, final_label, new_col)
    elif specs['type'] == 'constant':
        df[final_label] = specs['source']
        
def process_read_instructions(df_in, columns):
    '''Reduces the raw DF (from file) down to a smaller DF based on the
    descriptions from the yaml file'''
    # First, handle the 'direct pull' columns for the base of the DF
    pull_columns = []
    pull_column_labels = []
    for column in columns: # iterates over the list of columns
        for label in column: # iterates over the single key in the dict
            if column[label]['type'] == 'direct pull':
                pull_column_labels.append(label)
                pull_columns.append(column[label]['source'])
    df = df_in[pull_columns]
    df.columns = pull_column_labels

    # Second, insert the more complicated columns in the correct place
    for i in range(len(columns)):
        column = columns[i]
        for label in column: # iterates over the single key in the dict
            if column[label]['type'] != 'direct pull':
                insert_calc_column(df, df_in, i-1, label, column[label])
    return df


In [5]:
# This is the settings file specifying the details of the merge
# Edit it directly to fix the year of the latest files (there is an "INCREMENT" comment where this is required)
with open(SETTINGS_FILE) as f:
    print(f.read())

########################################################
# Master instructions for creation of directory
# Adds filenames in sequence along with the columns to use
# for each
#
# Most of the keys here are based on the arguments for
# the pandas.read_csv function
#
# Where relevant, an "INCREMENT" comment appears next
# to each line with a dated filename that needs to be
# incremented based on the latest file (downloaded in
# step 0)
########################################################

output_file: inputs/College Directory.xlsx
output_sheet_name: 'College Directory 2019'  # INCREMENT each year

input_details:
    - inputs/base_dir.csv:
        - file_setup:
            index_col: 0
            encoding: cp1252
            na_values: N/A
            converters:
                UNITID: safe2int
                ZIP: safe2int
        - UNITID:
            type: index
            source: UNITID
        - College Name:
            type: direct pull
            source: INSTNM
        - Co

## From here below actually runs all of this setup

In [6]:
####### Main script ####################
print('Loading configuration file...')
with open(SETTINGS_FILE, 'r') as ymlfile:
    cfg = yaml.load(ymlfile)

df = None

Loading configuration file...


In [7]:
for file in cfg['input_details']:
    filename, details = file.popitem()
    print('Reading file ({})'.format(filename),flush=True)
    
    this_df = read_input_file(filename, details[0]['file_setup'])
    sliced_df = process_read_instructions(this_df, details[1:])
    
    if type(df) is not pd.DataFrame:
        df = sliced_df
    else:
        df = pd.concat([df, sliced_df], axis=1, join_axes=[df.index])

Reading file (inputs/base_dir.csv)
Reading file (inputs/ic2017.csv)
Reading file (inputs/adm2017.csv)
Reading file (inputs/sfa1617.csv)
Reading file (inputs/ef2017a.csv)
Reading file (inputs/grad_rates.csv)
Reading file (inputs/ef2017d.csv)
Reading file (inputs/hd2017.csv)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Reading file (../raw_inputs/extra_directory_info.csv)


---

## After the raw data has been loaded and combined, there are a few manipulations

### In order, we'll do:

1. Calculation of AdjACT25, 50, and 75 fields (which add converted SAT to low ACT schools)
2. Calculation of AdjSAT25, 50, and 75 fields (which add converted ACT to low SAT schools)
3. Size Range
4. Estimated AA_H retention--based on a formula and the gap between AA/H and Overall grad rates and Retention (will be 0-100)

In [8]:
# First, we'll create "AdjACT25, 50, and 75" fields for admissions analyses
# Next AdjACT25, 50, and 75
admissions_file = 'inputs/adm2017.csv'  # INCREMENT
sat_to_act_file = '../raw_inputs/sat_to_act.csv'
adm_fields = ['APPLCN','ADMSSN',
              'SATPCT','ACTPCT',
              'SATVR25','SATVR75','SATMT25','SATMT75',
              'ACTCM25','ACTCM75']
adm_df = pd.read_csv(admissions_file, index_col=['UNITID'],
                     usecols=['UNITID']+adm_fields,
                     na_values='.',
                     encoding='latin-1')
sat_to_act = pd.read_csv(sat_to_act_file, index_col=['SAT'],dtype={'SAT':int,'ACT':int},encoding='cp1252')
adm_df['pct_accepted'] = adm_df.ADMSSN/adm_df.APPLCN

def calculate_adjact25_50_75(df):
    """
    Estimates the median 'ACT' based on 25th to 75th percentile range of either ACT
    or converted SAT
    """
    adj25, adj50, adj75 = (np.nan, np.nan, np.nan)
    if df.ACTPCT >= 20 and np.isfinite(df.ACTCM25) and np.isfinite(df.ACTCM75): #reasonable number of ACT
        adj25 = df.ACTCM25
        adj75 = df.ACTCM75
        adj50 = (adj25 + adj75)/2
    elif df.SATPCT >= 20 and (np.isfinite(df.SATVR25) and np.isfinite(df.SATMT25) and
                              np.isfinite(df.SATVR75) and np.isfinite(df.SATMT75)): #same threshold for SAT
        sat25 = int(np.round(df.SATVR25+df.SATMT25,decimals=-1))
        sat75 = int(np.round(df.SATVR75+df.SATMT75,decimals=-1))
        adj25 = sat_to_act.ACT[sat25]
        adj75 = sat_to_act.ACT[sat75]
        adj50 = (adj25 + adj75)/2
    return (adj25, adj50, adj75)

adm_df[['AdjACT25','AdjACT50','AdjACT75']] = adm_df.apply(calculate_adjact25_50_75,axis=1,result_type='expand')

In [9]:
# Visually inspect the first few converted rows (looking for AdjACT25, 50, 75 values:
adm_df[adm_df['ACTPCT']<20].head()

Unnamed: 0_level_0,APPLCN,ADMSSN,SATPCT,ACTPCT,SATVR25,SATVR75,SATMT25,SATMT75,ACTCM25,ACTCM75,pct_accepted,AdjACT25,AdjACT50,AdjACT75
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
111966,61,55.0,38.0,0.0,510.0,620.0,565.0,588.0,,,0.901639,21.0,23.0,25.0
118693,2524,1085.0,34.0,11.0,498.0,585.0,510.0,580.0,18.0,25.0,0.429873,19.0,21.5,24.0
119173,2352,1910.0,89.0,15.0,470.0,570.0,450.0,550.0,16.0,21.0,0.812075,17.0,19.5,22.0
120537,786,223.0,81.0,17.0,450.0,550.0,460.0,550.0,17.0,26.0,0.283715,16.0,19.0,22.0
128498,933,596.0,93.0,0.0,410.0,540.0,400.0,510.0,,,0.6388,14.0,17.0,20.0


In [10]:
# Now add them to the main DF
for var in ['AdjACT25','AdjACT50','AdjACT75']:
    df[var] = df.index.map(lambda x: adm_df[var].get(x, default=np.nan))

In [11]:
# Second, we'll follow exactly the same process to create similar variables for SAT
admissions_file = 'inputs/adm2017.csv'  # INCREMENT
act_to_sat_file = '../raw_inputs/act_to_sat.csv'
adm_fields = ['APPLCN','ADMSSN',
              'SATPCT','ACTPCT',
              'SATVR25','SATVR75','SATMT25','SATMT75',
              'ACTCM25','ACTCM75']
adm_df = pd.read_csv(admissions_file, index_col=['UNITID'],
                     usecols=['UNITID']+adm_fields,
                     na_values='.',
                     encoding='latin-1')
act_to_sat = pd.read_csv(act_to_sat_file, index_col=['ACT'],dtype={'SAT':int,'ACT':int},encoding='cp1252')
adm_df['pct_accepted'] = adm_df.ADMSSN/adm_df.APPLCN

def calculate_adjsat25_50_75(df):
    """
    Estimates the median 'SAT' based on 25th to 75th percentile range of either SAT
    or converted ACT
    """
    adj25, adj50, adj75 = (np.nan, np.nan, np.nan)
    if df.SATPCT >= 20 and (np.isfinite(df.SATVR25) and np.isfinite(df.SATMT25) and
                              np.isfinite(df.SATVR75) and np.isfinite(df.SATMT75)):  # reasonable threshold
        adj25 = int(np.round(df.SATVR25+df.SATMT25,decimals=-1))
        adj75 = int(np.round(df.SATVR75+df.SATMT75,decimals=-1))
        adj50 = (adj25 + adj75)/2
    elif df.ACTPCT >= 20 and np.isfinite(df.ACTCM25) and np.isfinite(df.ACTCM75):  # same cutoff for ACT
        act25 = int(df.ACTCM25)
        act75 = int(df.ACTCM75)
        try:
            adj25 = int(act_to_sat.SAT[int(act25)])
        except KeyError as e:
            if act25 < min(act_to_sat.index):  # If they reported an impossibly low score, correct
                adj25 = act_to_sat.SAT[min(act_to_sat.index)]
            else:
                raise e
        adj75 = int(act_to_sat.SAT[act75])
        adj50 = int((adj25 + adj75)/2)
    
    return (adj25, adj50, adj75)

adm_df[['AdjSAT25','AdjSAT50','AdjSAT75']] = adm_df.apply(calculate_adjsat25_50_75,axis=1,result_type='expand')

In [12]:
# Visually inspect the first few converted rows (looking for AdjSAT25, 50, 75 values:
adm_df[adm_df['SATPCT']<20].head()

Unnamed: 0_level_0,APPLCN,ADMSSN,SATPCT,ACTPCT,SATVR25,SATVR75,SATMT25,SATMT75,ACTCM25,ACTCM75,pct_accepted,AdjSAT25,AdjSAT50,AdjSAT75
UNITID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
100654,8610,7772.0,1.0,88.0,365.0,485.0,360.0,495.0,16.0,19.0,0.902671,890.0,950.0,1010.0
100663,7555,6936.0,1.0,94.0,440.0,630.0,550.0,740.0,21.0,28.0,0.918068,1080.0,1195.0,1310.0
100706,4454,3618.0,2.0,96.0,550.0,660.0,530.0,670.0,25.0,31.0,0.812304,1210.0,1305.0,1400.0
100724,6842,6696.0,18.0,85.0,380.0,485.0,375.0,481.0,16.0,20.0,0.978661,890.0,965.0,1040.0
100751,38129,20321.0,19.0,81.0,530.0,640.0,520.0,640.0,23.0,32.0,0.532954,1140.0,1285.0,1430.0


In [13]:
# Now add them to the main DF
for var in ['AdjSAT25','AdjSAT50','AdjSAT75']:
    df[var] = df.index.map(lambda x: adm_df[var].get(x, default=np.nan))

In [14]:
# Third, we'll calculate Size Range
def return_size_range(num_ug):
    """Apply function to give a text response for Size Range"""
    if np.isnan(num_ug):
        return 'N/A'
    else:
        if num_ug >= 20000:
            return '20,000 and above'
        elif num_ug >= 10000:
            return '10,000-19,999'
        elif num_ug >= 5000:
            return '5,000-9,999'
        elif num_ug >= 1000:
            return '1,000-4,999'
        else:
            return 'Under 1,000'
df['Size Range'] = df['# of undergraduates'].apply(return_size_range)

In [15]:
# Fourth, estimated AA/H retention
def calc_aa_h_retention(x):
    """Apply function to estimate the URM retention"""
    overall_grad, aa_h_grad, overall_retention = x
    if np.isfinite(overall_grad) and np.isfinite(aa_h_grad) and np.isfinite(overall_retention):
        overall_6yr_loss = 1 - overall_grad  # This number comes as a percentage
        aa_h_6yr_loss = 1 - aa_h_grad  # This number comes as a percentage
        overall_1yr_loss = 100 - overall_retention  # This number comes as 0-100
        if (overall_6yr_loss > 0) & (aa_h_6yr_loss > 0):
            aa_h_loss_as_share_of_overall = aa_h_6yr_loss / overall_6yr_loss
            aa_h_retention = 100 - aa_h_loss_as_share_of_overall * overall_1yr_loss  # comes as 0-100
            if (aa_h_retention > 0):
                return int(np.round(aa_h_retention,decimals=0))
            else:
                return 0
    return overall_retention
df['Estimated AA_H Retention'] = df[['Adj 6yr Grad','Adj 6yr Grad AA_H','Retention']].apply(
    calc_aa_h_retention, axis=1)

In [16]:
# Finally, write the output
writer = pd.ExcelWriter(cfg['output_file'], engine='xlsxwriter')
wb = writer.book
df.to_excel(writer, sheet_name=cfg['output_sheet_name'], na_rep='N/A')
writer.save()

# From this point on, there is one manual change/addition:

- add the 3+ schools not in NCES (Naval Prep, Arrupe, UIC Honors)

### Two other stylistic steps:

- Format the directory for sharing (including an external version without the Noble numbers)

- (Step 5) Save off subsets of the data for use in college counseling tools--college lists, awards, Bot