In [123]:
import pandas as pd
import numpy as np
import sqlite3
import re

conn = sqlite3.connect("profiles.db")
cursor = conn.cursor()
cursor.execute("ATTACH DATABASE 'colleges.db' AS colleges")

<sqlite3.Cursor at 0x205ae73e5e0>

In [124]:
df = pd.read_sql_query("SELECT * FROM colleges.university_profiles AS c JOIN profiles AS p ON c.school = p.school",conn)
df_w_admission_rate = df.copy(deep=True)

In [125]:
def get_type_map(df_column):
    return {t:i for i, t in enumerate(set(df_column))}

def convert_sat_to_act(sat):
    if np.isnan(sat):
        return np.nan
    sat_scores = np.array([990, 1060, 1140, 1210, 1270, 1330, 1390, 1450, 1510, 1560, 1620, 1680, 1740, 1800, 1860, 1920, 1980, 2020, 2080, 2140, 2220, 2290, 2380, 2410])
    return np.where(sat < sat_scores)[0][0] + 13
    
def combine_test_scores(x):
    if not np.isnan(x['act']):
        return x['act']/36.0
    sat = np.array([x['sat_m'], x['sat_r'], x['sat_w']])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def convert_string(x):
    try:
        return int(x)
    except:
        return np.nan

def combine_avg_test_scores(x):
    if not np.isnan(convert_string(x['avg_act'])):
        return convert_string(x['avg_act'])/36.0
    sat = np.array([convert_string(x['sat_math']), convert_string(x['sat_reading']), convert_string(x['sat_writing'])])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def map_statuses(x):
    status_map = {"Denied": 0, "Deferred": 0, "Wait-Listed": 0, "Accepted": 2, "Will Attend": 2}
    return status_map[x['status']]

def convert_class_rank(x):
    s = str(x['class_rank'])
    if " of " in s:
        return float(s[:s.index(" of ")])/float(s[s.index(" of ") + 4:])
    if "Top " in s:
        return float(s[4:s.index("%")])/100.0
    if "Bottom " in s:
        return float(s[7:s.index("%")])/100.0
    return np.nan

def convert_instate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        instate = x['cost_attendance'].split("<br>")[0]
        return instate[instate.index("$")+1:]
    else:
        return x['cost_attendance']

def convert_outstate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        outstate = x['cost_attendance'].split("<br>")[1]
        return outstate[outstate.index("$")+1:]
    else:
        return x['cost_attendance']

def get_tuition(x):
    return x['in_state_tuition'] if x['state'] == x['hs_state'] else x['out_state_tuition']

def map_hs_types(x):
    hs_type_map = {"Public": 0, "Private": 1, "Parochial": 2, "Home": 3}
    return hs_type_map[x['hs_type']]

def map_ins_types(x):
    ins_type_map = {"Public": 0, "Private": 1, "Private for-profit": 1, "&nbsp;": np.nan}
    return ins_type_map[x['institution_type']]

In [126]:
df = df.loc[:,~df.columns.duplicated()]
df.fillna(value=pd.np.nan, inplace=True)
df['avg_act'] = df['avg_act'].map(lambda x: str(x).strip('-'))
df['test_score'] = df.apply(combine_test_scores, axis=1)
df['avg_test_score'] = df.apply(combine_avg_test_scores, axis=1)
df['class_rank'] = df.apply(convert_class_rank, axis=1)
df['institution_type'] = df.apply(map_ins_types, axis=1)
df['in_state_tuition'] = df.apply(convert_instate_tuition, axis=1)
df['out_state_tuition'] = df.apply(convert_outstate_tuition, axis=1)
#df['tuition'] = df.apply(get_tuition, axis=1)

for col in ['cost_attendance', 'city', 'sat_m', 'sat_r', 'sat_w', 'act', 'sat_math', 'sat_reading', 'sat_writing', 'avg_act', 'gpa_w']:
    df = df.drop(col, axis=1)

df = df.dropna()#subset=['test_score','avg_test_score','gpa_uw','class_rank','hs_state','hs_type','institution_type'])

df['status'] = df.apply(map_statuses, axis=1)
df['hs_type'] = df.apply(map_hs_types, axis=1)
replace_columns = ['hs_state', 'state', 'gender']
for col in replace_columns:
    df[col].replace(get_type_map(df[col]), inplace=True)

In [127]:
print(len(df.index))

44042


In [128]:
whole_df = df

def split_df_on(feature=None):
    split_dfs = {}
    if feature:
        for val in whole_df[feature].unique():
            split_dfs[val] = (whole_df.loc[whole_df[feature]==val])
    else:
        split_dfs = {'all': whole_df}
    return split_dfs
df.sample(n=100)

Unnamed: 0,school,state,avg_gpa,average_freshman_aid,admission_rate,faculty_total,international_percent,institution_type,female_percentage,year,...,gpa_uw,class_rank,status,eaed,legacy,athlete,test_score,avg_test_score,in_state_tuition,out_state_tuition
25778,University of Chicago,8,4.48,54438,9,1403,13.,1.0,48.9,2018,...,3.34,0.138333,0,0,0,0,0.888889,0.888889,75735,75735
62580,Emory University,17,3.76,44118,22,1066,15.,1.0,59.5,2010,...,3.75,0.035185,2,0,0,0,0.888889,0.861111,66986,66986
47857,Stanford University,1,3.94,53337,5,1613,9.1,1.0,48.5,2016,...,3.95,0.042857,0,0,0,0,0.805556,0.861111,66184,66184
55092,"University of California, Riverside",1,3.74,26448,57,976,2.9,0.0,54.3,2012,...,3.10,0.345000,2,0,0,0,0.916667,0.666667,36092,64106
8178,State University of New York College at Oneonta,5,3.43,9936,60,286,0.8,0.0,59.7,2011,...,3.10,0.395349,2,1,0,0,0.694444,0.638889,23996,33646
87548,Roanoke College,9,3.54,36817,67,165,3.2,1.0,57.7,2011,...,3.90,0.050314,2,0,0,0,0.694444,0.666667,61095,61095
77296,Florida Atlantic University,10,4.04,13295,60,826,3.0,0.0,55.9,2018,...,3.00,0.500000,2,0,0,0,0.638889,0.638889,22235,37791
52188,Florida State University,10,4.02,14486,49,1448,1.5,0.0,55.7,2013,...,3.70,0.077259,2,0,0,0,0.722222,0.777778,22625,37791
3303,Penn State University Park,22,3.58,9620,50,2679,11.,0.0,46.7,2013,...,3.50,0.199752,2,0,0,0,0.777778,0.777778,36344,51572
1785,Sarah Lawrence College,5,3.71,36059,53,119,12.,1.0,71.5,2009,...,3.80,0.250000,2,0,0,0,0.888889,0.805556,70266,70266


In [129]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression

split_on = None
for val, df_s in split_df_on(split_on).items():
    k = 1
    
    df0 = df_s[df.status == 0]
    df2 = df_s[df.status == 2].sample(n = int(k*len(df0)))
    
    df_final = pd.concat([df0, df2])
    schools = df_final['school']
    df_final = df_final.drop('school', axis=1)
    X = df_final.drop('status', axis=1)
    y = df_final['status']
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    mlp = MLPClassifier(hidden_layer_sizes=(21, 21, 21),max_iter=1000)
    mlp.fit(X_train,y_train)
    predictions = mlp.predict(X_test)
    print(confusion_matrix(y_test,predictions))
    print(classification_report(y_test,predictions))

    logisticRegr = LogisticRegression()
    logisticRegr.fit(X_train, y_train)
    score = logisticRegr.score(X_test, y_test)
    print(f"Logistic Regression Score for {split_on} {val}: " + str(score))



[[1782  391]
 [ 466 1756]]
             precision    recall  f1-score   support

          0       0.79      0.82      0.81      2173
          2       0.82      0.79      0.80      2222

avg / total       0.81      0.81      0.80      4395

Logistic Regression Score for None all: 0.7902161547212742


In [130]:
old_x = X.copy()
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [131]:

from collections import defaultdict
# Experimental Acceptance Rate
e_a_r = defaultdict(lambda: defaultdict(int))
total_applicants = 0
for res, school in zip(mlp.predict(X), schools):
    if res == 0:
        e_a_r[school]['reject'] += 1
    elif res == 2:
        e_a_r[school]['accept'] += 1
    e_a_r[school]['total'] += 1
    total_applicants += 1

print(total_applicants == len(schools))


True


In [132]:
e_a_r

defaultdict(<function __main__.<lambda>>,
            {'Abilene Christian University': defaultdict(int,
                         {'accept': 3, 'total': 3}),
             'Adams State University': defaultdict(int,
                         {'accept': 2, 'total': 2}),
             'Adelphi University': defaultdict(int, {'accept': 9, 'total': 9}),
             'Adrian College': defaultdict(int,
                         {'accept': 1, 'reject': 2, 'total': 3}),
             'Agnes Scott College': defaultdict(int,
                         {'accept': 12, 'total': 12}),
             'Alabama State University': defaultdict(int,
                         {'accept': 2, 'reject': 5, 'total': 7}),
             'Albany College of Pharmacy and Health Sciences': defaultdict(int,
                         {'accept': 3, 'reject': 1, 'total': 4}),
             'Albany State University': defaultdict(int,
                         {'accept': 3, 'reject': 4, 'total': 7}),
             'Albion College': defaultd

In [133]:
compare_vs_ground_truth = defaultdict(str)

for (index, row), school in zip(old_x.iterrows(), schools):
    accept_rate = None
    if e_a_r[school]['accept'] != 0 and e_a_r[school]['accept'] != 0:
        accept_rate = e_a_r[school]['accept']/e_a_r[school]['total']
    elif e_a_r[school]['accept'] == 0:
        accept_rate = 0
    elif e_a_r[school]['reject'] == 0:
        accept_rate = 1
    
    compare_vs_ground_truth[school] = [accept_rate, int(row['admission_rate'])/100]

In [134]:
compare_vs_ground_truth

defaultdict(str,
            {'Abilene Christian University': [1.0, 0.51],
             'Adams State University': [1.0, 0.99],
             'Adelphi University': [1.0, 0.73],
             'Adrian College': [0.3333333333333333, 0.56],
             'Agnes Scott College': [1.0, 0.66],
             'Alabama State University': [0.2857142857142857, 0.98],
             'Albany College of Pharmacy and Health Sciences': [0.75, 0.67],
             'Albany State University': [0.42857142857142855, 0.5],
             'Albion College': [1.0, 0.71],
             'Albright College': [0.8181818181818182, 0.5],
             'Alcorn State University': [1.0, 0.79],
             'Alfred University': [0.75, 0.63],
             'Allegheny College': [0.8461538461538461, 0.68],
             'Alma College': [1.0, 0.64],
             'Alvernia University': [0.6666666666666666, 0.74],
             'Alverno College': [1.0, 0.78],
             'American University': [0.376, 0.29],
             'Anderson University'

In [135]:
avg_error = 0
error_rates = {}
for (school, admission_stats), (school_2, comparisons) in zip(e_a_r.items(), compare_vs_ground_truth.items()):
    if school != school_2:
        print("there's a huge mistake somewhere")
    error = abs(comparisons[1]-comparisons[0])
    weight_error = admission_stats['total']/total_applicants
    avg_error += error*weight_error
    if admission_stats['total'] > 25:
        error_rates[school] = {'error': error, 'experimental': comparisons[0], 'real': comparisons[1]}
        error_rates[school].update(admission_stats)

In [136]:
avg_error

0.12252161547212746

In [143]:
sorted_errors = sorted(error_rates.items(), reverse=False, key=lambda item: item[1]['error'])
flatten_dict = []
for school, stats in sorted_errors:
    stats['school'] = school
    flatten_dict.append(stats)
flatten_dict

[{'accept': 20,
  'error': 0.0,
  'experimental': 0.5,
  'real': 0.5,
  'reject': 20,
  'school': 'University of San Diego',
  'total': 40},
 {'accept': 19,
  'error': 0.0007692307692307443,
  'experimental': 0.7307692307692307,
  'real': 0.73,
  'reject': 7,
  'school': 'Bryant University',
  'total': 26},
 {'accept': 31,
  'error': 0.00216216216216214,
  'experimental': 0.8378378378378378,
  'real': 0.84,
  'reject': 6,
  'school': 'University of Arizona',
  'total': 37},
 {'accept': 43,
  'error': 0.003076923076923088,
  'experimental': 0.8269230769230769,
  'real': 0.83,
  'reject': 9,
  'school': 'University of Oregon',
  'total': 52},
 {'accept': 70,
  'error': 0.0033333333333334103,
  'experimental': 0.6666666666666666,
  'real': 0.67,
  'reject': 35,
  'school': 'University of Vermont',
  'total': 105},
 {'accept': 23,
  'error': 0.00696969696969707,
  'experimental': 0.696969696969697,
  'real': 0.69,
  'reject': 10,
  'school': 'University of Rhode Island',
  'total': 33},
 {

In [148]:
output_df = pd.DataFrame(flatten_dict, columns=['school', 'error', 'experimental', 'real', 'accept', 'reject', 'total'])
output_df

Unnamed: 0,school,error,experimental,real,accept,reject,total
0,University of San Diego,0.000000,0.500000,0.50,20,20.0,40
1,Bryant University,0.000769,0.730769,0.73,19,7.0,26
2,University of Arizona,0.002162,0.837838,0.84,31,6.0,37
3,University of Oregon,0.003077,0.826923,0.83,43,9.0,52
4,University of Vermont,0.003333,0.666667,0.67,70,35.0,105
5,University of Rhode Island,0.006970,0.696970,0.69,23,10.0,33
6,Colorado State University,0.008710,0.838710,0.83,26,5.0,31
7,Hofstra University,0.009565,0.630435,0.64,29,17.0,46
8,Christopher Newport University,0.009730,0.729730,0.72,27,10.0,37
9,"California Polytechnic State University, San L...",0.013128,0.363128,0.35,65,114.0,179
