In [2]:
import pandas as pd
import numpy as np
import sqlite3
import re

conn = sqlite3.connect("profiles.db")
cursor = conn.cursor()
cursor.execute("ATTACH DATABASE 'colleges.db' AS colleges")

<sqlite3.Cursor at 0x7f5384734dc0>

In [3]:
df = pd.read_sql_query("SELECT * FROM colleges.university_profiles AS c JOIN profiles AS p ON c.school = p.school",conn)
df_w_admission_rate = df.copy(deep=True)

In [4]:
def get_type_map(df_column):
    return {t:i for i, t in enumerate(set(df_column))}

def convert_sat_to_act(sat):
    if np.isnan(sat):
        return np.nan
    sat_scores = np.array([990, 1060, 1140, 1210, 1270, 1330, 1390, 1450, 1510, 1560, 1620, 1680, 1740, 1800, 1860, 1920, 1980, 2020, 2080, 2140, 2220, 2290, 2380, 2410])
    return np.where(sat < sat_scores)[0][0] + 13
    
def combine_test_scores(x):
    if not np.isnan(x['act']):
        return x['act']/36.0
    sat = np.array([x['sat_m'], x['sat_r'], x['sat_w']])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def convert_string(x):
    try:
        return int(x)
    except:
        return np.nan

def combine_avg_test_scores(x):
    if not np.isnan(convert_string(x['avg_act'])):
        return convert_string(x['avg_act'])/36.0
    sat = np.array([convert_string(x['sat_math']), convert_string(x['sat_reading']), convert_string(x['sat_writing'])])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def map_statuses(x):
    status_map = {"Denied": 0, "Deferred": 0, "Wait-Listed": 0, "Accepted": 2, "Will Attend": 2}
    return status_map[x['status']]

def convert_class_rank(x):
    s = str(x['class_rank'])
    if " of " in s:
        return float(s[:s.index(" of ")])/float(s[s.index(" of ") + 4:])
    if "Top " in s:
        return float(s[4:s.index("%")])/100.0
    if "Bottom " in s:
        return float(s[7:s.index("%")])/100.0
    return np.nan

def convert_instate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        instate = x['cost_attendance'].split("<br>")[0]
        return instate[instate.index("$")+1:]
    else:
        return x['cost_attendance']

def convert_outstate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        outstate = x['cost_attendance'].split("<br>")[1]
        return outstate[outstate.index("$")+1:]
    else:
        return x['cost_attendance']

def get_tuition(x):
    return x['in_state_tuition'] if x['state'] == x['hs_state'] else x['out_state_tuition']

def map_hs_types(x):
    hs_type_map = {"Public": 0, "Private": 1, "Parochial": 2, "Home": 3}
    return hs_type_map[x['hs_type']]

def map_ins_types(x):
    ins_type_map = {"Public": 0, "Private": 1, "Private for-profit": 1, "&nbsp;": np.nan}
    return ins_type_map[x['institution_type']]

In [5]:
df = df.loc[:,~df.columns.duplicated()]
df.fillna(value=pd.np.nan, inplace=True)
df['avg_act'] = df['avg_act'].map(lambda x: str(x).strip('-'))
df['test_score'] = df.apply(combine_test_scores, axis=1)
df['avg_test_score'] = df.apply(combine_avg_test_scores, axis=1)
df['class_rank'] = df.apply(convert_class_rank, axis=1)
df['institution_type'] = df.apply(map_ins_types, axis=1)
df['in_state_tuition'] = df.apply(convert_instate_tuition, axis=1)
df['out_state_tuition'] = df.apply(convert_outstate_tuition, axis=1)
#df['tuition'] = df.apply(get_tuition, axis=1)

for col in ['cost_attendance', 'city', 'sat_m', 'sat_r', 'sat_w', 'act', 'sat_math', 'sat_reading', 'sat_writing', 'avg_act', 'gpa_w']:
    df = df.drop(col, axis=1)

df = df.dropna()#subset=['test_score','avg_test_score','gpa_uw','class_rank','hs_state','hs_type','institution_type'])

df['status'] = df.apply(map_statuses, axis=1)
df['hs_type'] = df.apply(map_hs_types, axis=1)
replace_columns = ['hs_state', 'state', 'gender']
for col in replace_columns:
    df[col].replace(get_type_map(df[col]), inplace=True)

In [6]:
print(len(df.index))

44042


In [7]:
whole_df = df

def split_df_on(feature=None):
    split_dfs = {}
    if feature:
        for val in whole_df[feature].unique():
            split_dfs[val] = (whole_df.loc[whole_df[feature]==val])
    else:
        split_dfs = {'all': whole_df}
    return split_dfs
df.sample(n=100)



Unnamed: 0,school,state,avg_gpa,average_freshman_aid,admission_rate,faculty_total,international_percent,institution_type,female_percentage,year,...,gpa_uw,class_rank,status,eaed,legacy,athlete,test_score,avg_test_score,in_state_tuition,out_state_tuition
62252,George Mason University,3,3.66,14717,81,1290,5.1,0.0,50.3,2009,...,3.60,0.250000,2,0,0,0,0.694444,0.750000,23752,47351
75350,University of Central Florida,21,3.93,8544,50,1572,1.8,0.0,54.3,2017,...,3.40,0.250000,0,0,0,0,0.638889,0.777778,22501,38600
76726,Georgia Institute of Technology,36,4.02,14735,23,1115,9.0,0.0,38.0,2017,...,4.00,0.003861,2,1,0,0,0.916667,0.888889,30604,51200
37334,Averett University,3,3.19,26770,62,58,5.9,1.0,45.1,2019,...,3.40,0.341379,2,0,0,0,0.500000,0.555556,46344,46344
67727,"University of California, Los Angeles",6,4.36,24405,18,1570,11.,0.0,57.5,2017,...,4.00,0.015730,2,0,0,0,0.916667,0.805556,30485,58499
46616,Scripps College,6,4.07,46467,33,94,5.2,1.0,100.0,2013,...,3.80,0.026961,0,0,0,0,0.694444,0.888889,71060,71060
67414,"University of California, Los Angeles",6,4.36,24405,18,1570,11.,0.0,57.5,2014,...,4.00,0.001290,2,0,0,0,0.944444,0.805556,30485,58499
33949,Harvard College,46,4.18,55354,5,975,11.,1.0,47.7,2018,...,3.90,0.061224,0,1,0,0,0.916667,0.944444,69600,69600
65539,"University of California, Berkeley",6,3.87,24668,15,1623,13.,0.0,52.1,2016,...,3.94,0.050360,0,0,0,0,0.777778,0.805556,35774,63788
76653,Georgia Institute of Technology,36,4.02,14735,23,1115,9.0,0.0,38.0,2014,...,3.83,0.078261,0,0,0,0,0.777778,0.888889,30604,51200


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression

split_on = None
for val, df_s in split_df_on(split_on).items():
    k = 1
    
    df0 = df_s[df.status == 0]
    df2 = df_s[df.status == 2].sample(n = int(k*len(df0)))
    
    df_final = pd.concat([df0, df2])
    df_final_copy = df_final.copy()
    X = df_final.drop('status', axis=1)
    y = df_final['status']
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    X_test_copy = X_test.copy()
    schools = X_test['school']
    X_test = X_test.drop('school', axis=1)
    X_train = X_train.drop('school', axis=1)
    X_test_copy['status'] = y_test
    print(X_test_copy[X_test_copy['school'] == 'Uni'])
    
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    mlp = MLPClassifier(hidden_layer_sizes=(21, 21, 21),max_iter=1000)
    mlp.fit(X_train,y_train)
    predictions = mlp.predict(X_test)
    print(confusion_matrix(y_test,predictions))
    print(classification_report(y_test,predictions))

    logisticRegr = LogisticRegression()
    logisticRegr.fit(X_train, y_train)
    score = logisticRegr.score(X_test, y_test)
    print(f"Logistic Regression Score for {split_on} {val}: " + str(score))



Empty DataFrame
Columns: [school, state, avg_gpa, average_freshman_aid, admission_rate, faculty_total, international_percent, institution_type, female_percentage, year, gender, hs_type, hs_state, gpa_uw, class_rank, eaed, legacy, athlete, test_score, avg_test_score, in_state_tuition, out_state_tuition, status]
Index: []

[0 rows x 23 columns]


  return self.partial_fit(X, y)


[[1847  395]
 [ 475 1678]]
              precision    recall  f1-score   support

           0       0.80      0.82      0.81      2242
           2       0.81      0.78      0.79      2153

   micro avg       0.80      0.80      0.80      4395
   macro avg       0.80      0.80      0.80      4395
weighted avg       0.80      0.80      0.80      4395

Logistic Regression Score for None all: 0.7822525597269625




In [9]:
old_x = X_test_copy.copy()
scaler = StandardScaler()
scaler.fit(X_train)
X = scaler.transform(X_test)

In [10]:

from collections import defaultdict
# Experimental Acceptance Rate
e_a_r = defaultdict(lambda: defaultdict(int))
total_applicants = 0
for res, school in zip(mlp.predict(X), schools):
    if res == 0:
        e_a_r[school]['experimental_reject'] += 1
    elif res == 2:
        e_a_r[school]['experimental_accept'] += 1
    e_a_r[school]['total'] += 1
    total_applicants += 1

for school in schools.unique():
    for student in df_final_copy[df_final_copy['school'] == school].iterrows():
        if student[1][15] == 0:
            e_a_r[school]['observed_reject'] += 1
        else:
            e_a_r[school]['observed_accept'] += 1

print(total_applicants == len(schools))


True


In [11]:
e_a_r

defaultdict(<function __main__.<lambda>()>,
            {'University of Southern California': defaultdict(int,
                         {'experimental_accept': 6,
                          'total': 76,
                          'experimental_reject': 70,
                          'observed_reject': 226,
                          'observed_accept': 68}),
             'University of Michigan': defaultdict(int,
                         {'experimental_reject': 61,
                          'total': 76,
                          'experimental_accept': 15,
                          'observed_reject': 202,
                          'observed_accept': 118}),
             'Goucher College': defaultdict(int,
                         {'experimental_accept': 7,
                          'total': 7,
                          'observed_reject': 3,
                          'observed_accept': 11}),
             'University of Pennsylvania': defaultdict(int,
                         {'experimental_rej

In [12]:
compare_vs_ground_truth = defaultdict(str)

for (index, row), school in zip(old_x.iterrows(), schools):
    accept_rate = None
    if e_a_r[school]['experimental_accept'] != 0 and e_a_r[school]['experimental_reject'] != 0:
        accept_rate = e_a_r[school]['experimental_accept']/e_a_r[school]['total']
    elif e_a_r[school]['experimental_accept'] == 0:
        accept_rate = 0
    elif e_a_r[school]['experimental_reject'] == 0:
        accept_rate = 1
    actual_accept_rate = None
    if e_a_r[school]['observed_accept'] != 0 and e_a_r[school]['observed_reject'] != 0:
        actual_accept_rate = e_a_r[school]['observed_accept']/(e_a_r[school]['observed_reject'] + e_a_r[school]['observed_accept'])
    elif e_a_r[school]['observed_accept'] == 0:
        actual_accept_rate = 0
    elif e_a_r[school]['observed_reject'] == 0:
        actual_accept_rate = 1
    compare_vs_ground_truth[school] = [accept_rate, actual_accept_rate, int(row['admission_rate'])/100]

In [13]:
compare_vs_ground_truth

defaultdict(str,
            {'University of Southern California': [0.07894736842105263,
              0.23129251700680273,
              0.16],
             'University of Michigan': [0.19736842105263158, 0.36875, 0.29],
             'Goucher College': [1, 0.7857142857142857, 0.79],
             'University of Pennsylvania': [0.009009009009009009,
              0.10738255033557047,
              0.09],
             'University of Arizona': [1, 1, 0.84],
             'University of California, Los Angeles': [0.10434782608695652,
              0.22365591397849463,
              0.18],
             'University of Colorado Denver': [1, 0.7142857142857143, 0.65],
             'Dominican University of California': [1, 1, 0.76],
             'Rensselaer Polytechnic Institute': [0.425,
              0.47058823529411764,
              0.43],
             'Stanford University': [0.017543859649122806,
              0.06593406593406594,
              0.05],
             'Fordham University': [0.4

In [14]:
avg_error = 0
error_rates = {}
for (school, admission_stats), (school_2, comparisons) in zip(e_a_r.items(), compare_vs_ground_truth.items()):
    if admission_stats['total'] > 25:
        if school != school_2:
            print("there's a huge mistake somewhere")
        error = abs(comparisons[1]-comparisons[0])
        weight_error = admission_stats['total']/total_applicants
        avg_error += error*weight_error
        error_rates[school] = {'error': error, 'experimental_rate': comparisons[0], 'observed_rate': comparisons[1], 'university_reported': comparisons[2]}
        error_rates[school].update(admission_stats)

In [15]:
avg_error

0.05096980559348792

In [16]:
sorted_errors = sorted(error_rates.items(), reverse=False, key=lambda item: item[1]['error'])
flatten_dict = []
for school, stats in sorted_errors:
    stats['school'] = school
    flatten_dict.append(stats)
flatten_dict

[{'error': 0.005172413793103403,
  'experimental_rate': 0.6551724137931034,
  'observed_rate': 0.65,
  'university_reported': 0.44,
  'experimental_reject': 10,
  'total': 29,
  'experimental_accept': 19,
  'observed_reject': 42,
  'observed_accept': 78,
  'school': 'University of Maryland'},
 {'error': 0.008333333333333304,
  'experimental_rate': 0.5,
  'observed_rate': 0.5083333333333333,
  'university_reported': 0.5,
  'experimental_accept': 18,
  'total': 36,
  'experimental_reject': 18,
  'observed_reject': 59,
  'observed_accept': 61,
  'school': 'University of Central Florida'},
 {'error': 0.010691103474608638,
  'experimental_rate': 0.18518518518518517,
  'observed_rate': 0.1958762886597938,
  'university_reported': 0.31,
  'experimental_reject': 22,
  'total': 27,
  'experimental_accept': 5,
  'observed_reject': 78,
  'observed_accept': 19,
  'school': 'Bucknell University'},
 {'error': 0.02347852950262591,
  'experimental_rate': 0.41025641025641024,
  'observed_rate': 0.43373

In [17]:
output_df = pd.DataFrame(flatten_dict, columns=['school', 'error', 'experimental_rate', 'observed_rate', 'university_reported', 'experimental_accept', 'experimental_reject', 'observed_accept', 'observed_reject'])
#output_df.sample(10).sort_values(by=['error'])
output_df

Unnamed: 0,school,error,experimental_rate,observed_rate,university_reported,experimental_accept,experimental_reject,observed_accept,observed_reject
0,University of Maryland,0.005172,0.655172,0.65,0.44,19,10,78,42
1,University of Central Florida,0.008333,0.5,0.508333,0.5,18,18,61,59
2,Bucknell University,0.010691,0.185185,0.195876,0.31,5,22,19,78
3,Fordham University,0.023479,0.410256,0.433735,0.46,16,23,72,94
4,Princeton University,0.035072,0.042553,0.077626,0.06,4,90,34,404
5,Rensselaer Polytechnic Institute,0.045588,0.425,0.470588,0.43,17,23,48,54
6,Stanford University,0.04839,0.017544,0.065934,0.05,3,168,42,595
7,Georgia Institute of Technology,0.051216,0.452381,0.503597,0.23,19,23,70,69
8,Boston University,0.067633,0.388889,0.456522,0.25,21,33,105,125
9,Florida State University,0.0754,0.515152,0.590551,0.49,17,16,75,52


In [19]:
arr = output_df.values
arr


array([['University of Maryland', 0.005172413793103403,
        0.6551724137931034, 0.65, 0.44, 19, 10, 78, 42],
       ['University of Central Florida', 0.008333333333333304, 0.5,
        0.5083333333333333, 0.5, 18, 18, 61, 59],
       ['Bucknell University', 0.010691103474608638, 0.18518518518518517,
        0.1958762886597938, 0.31, 5, 22, 19, 78],
       ['Fordham University', 0.02347852950262591, 0.41025641025641024,
        0.43373493975903615, 0.46, 16, 23, 72, 94],
       ['Princeton University', 0.035072379286894, 0.0425531914893617,
        0.0776255707762557, 0.06, 4, 90, 34, 404],
       ['Rensselaer Polytechnic Institute', 0.04558823529411765, 0.425,
        0.47058823529411764, 0.43, 17, 23, 48, 54],
       ['Stanford University', 0.04839020628494313, 0.017543859649122806,
        0.06593406593406594, 0.05, 3, 168, 42, 595],
       ['Georgia Institute of Technology', 0.05121616992120587,
        0.4523809523809524, 0.5035971223021583, 0.23, 19, 23, 70, 69],
       ['Bost