In [320]:
import pandas as pd
import numpy as np
import sqlite3
import re

conn = sqlite3.connect("profiles.db")
cursor = conn.cursor()
cursor.execute("ATTACH DATABASE 'colleges.db' AS colleges")

<sqlite3.Cursor at 0x18bf05543b0>

In [321]:
df = pd.read_sql_query("SELECT * FROM colleges.university_profiles AS c JOIN profiles AS p ON c.school = p.school",conn)
df_w_admission_rate = df.copy(deep=True)

In [322]:
def get_type_map(df_column):
    return {t:i for i, t in enumerate(set(df_column))}

def convert_sat_to_act(sat):
    if np.isnan(sat):
        return np.nan
    sat_scores = np.array([990, 1060, 1140, 1210, 1270, 1330, 1390, 1450, 1510, 1560, 1620, 1680, 1740, 1800, 1860, 1920, 1980, 2020, 2080, 2140, 2220, 2290, 2380, 2410])
    return np.where(sat < sat_scores)[0][0] + 13
    
def combine_test_scores(x):
    if not np.isnan(x['act']):
        return x['act']/36.0
    sat = np.array([x['sat_m'], x['sat_r'], x['sat_w']])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def convert_string(x):
    try:
        return int(x)
    except:
        return np.nan

def combine_avg_test_scores(x):
    if not np.isnan(convert_string(x['avg_act'])):
        return convert_string(x['avg_act'])/36.0
    sat = np.array([convert_string(x['sat_math']), convert_string(x['sat_reading']), convert_string(x['sat_writing'])])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def map_statuses(x):
    status_map = {"Denied": 0, "Deferred": 0, "Wait-Listed": 0, "Accepted": 2, "Will Attend": 2}
    return status_map[x['status']]

def convert_class_rank(x):
    s = str(x['class_rank'])
    if " of " in s:
        return float(s[:s.index(" of ")])/float(s[s.index(" of ") + 4:])
    if "Top " in s:
        return float(s[4:s.index("%")])/100.0
    if "Bottom " in s:
        return float(s[7:s.index("%")])/100.0
    return np.nan

def convert_instate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        instate = x['cost_attendance'].split("<br>")[0]
        return instate[instate.index("$")+1:]
    else:
        return x['cost_attendance']

def convert_outstate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        outstate = x['cost_attendance'].split("<br>")[1]
        return outstate[outstate.index("$")+1:]
    else:
        return x['cost_attendance']

def get_tuition(x):
    return x['in_state_tuition'] if x['state'] == x['hs_state'] else x['out_state_tuition']

def map_hs_types(x):
    hs_type_map = {"Public": 0, "Private": 1, "Parochial": 2, "Home": 3}
    return hs_type_map[x['hs_type']]

def map_ins_types(x):
    ins_type_map = {"Public": 0, "Private": 1, "Private for-profit": 1, "&nbsp;": np.nan}
    return ins_type_map[x['institution_type']]

In [323]:
df = df.loc[:,~df.columns.duplicated()]
df.fillna(value=pd.np.nan, inplace=True)
df['avg_act'] = df['avg_act'].map(lambda x: str(x).strip('-'))
df['test_score'] = df.apply(combine_test_scores, axis=1)
df['avg_test_score'] = df.apply(combine_avg_test_scores, axis=1)
df['class_rank'] = df.apply(convert_class_rank, axis=1)
df['institution_type'] = df.apply(map_ins_types, axis=1)
df['in_state_tuition'] = df.apply(convert_instate_tuition, axis=1)
df['out_state_tuition'] = df.apply(convert_outstate_tuition, axis=1)
#df['tuition'] = df.apply(get_tuition, axis=1)

for col in ['cost_attendance', 'city', 'sat_m', 'sat_r', 'sat_w', 'act', 'sat_math', 'sat_reading', 'sat_writing', 'avg_act', 'gpa_w']:
    df = df.drop(col, axis=1)

df = df.dropna()#subset=['test_score','avg_test_score','gpa_uw','class_rank','hs_state','hs_type','institution_type'])

df['status'] = df.apply(map_statuses, axis=1)
df['hs_type'] = df.apply(map_hs_types, axis=1)
replace_columns = ['hs_state', 'state', 'gender']
for col in replace_columns:
    df[col].replace(get_type_map(df[col]), inplace=True)

In [324]:
print(len(df.index))

44042


In [325]:
whole_df = df

def split_df_on(feature=None):
    split_dfs = {}
    if feature:
        for val in whole_df[feature].unique():
            split_dfs[val] = (whole_df.loc[whole_df[feature]==val])
    else:
        split_dfs = {'all': whole_df}
    return split_dfs
df.sample(n=100)

Unnamed: 0,school,state,avg_gpa,average_freshman_aid,admission_rate,faculty_total,international_percent,institution_type,female_percentage,year,...,gpa_uw,class_rank,status,eaed,legacy,athlete,test_score,avg_test_score,in_state_tuition,out_state_tuition
20625,Western New England University,17,3.40,26141,81,241,3.3,1.0,37.1,2017,...,3.50,0.250000,2,0,0,0,0.694444,0.666667,52547,52547
36440,University of Alabama,18,3.72,16254,53,1382,1.6,0.0,55.9,2019,...,3.55,0.222756,2,0,0,0,0.944444,0.750000,26572,45022
34084,Harvard College,17,4.18,55354,5,975,11.,1.0,47.7,2019,...,3.90,0.104348,0,0,0,0,0.805556,0.944444,69600,69600
73047,Elon University,8,4.00,19885,67,433,2.3,1.0,59.6,2017,...,3.83,0.039326,2,1,0,0,0.861111,0.750000,51048,51048
42778,"University of California, Irvine",21,3.97,23886,37,1290,16.,0.0,52.7,2009,...,3.83,0.019178,2,0,0,0,0.888889,0.805556,33826,61840
90981,Wheaton College,30,3.72,27369,85,222,2.9,1.0,54.1,2012,...,3.20,0.250000,0,0,0,0,0.916667,0.805556,49300,49300
23896,Mount St. Mary College,4,3.30,23758,93,89,0.5,1.0,70.2,2018,...,3.30,0.141026,2,0,0,0,0.583333,0.583333,49076,49076
71070,Fordham University,4,3.65,36410,46,735,8.5,1.0,57.5,2017,...,3.40,0.175439,0,0,0,0,0.611111,0.805556,72239,72239
82871,Virginia Polytechnic Institute and State Unive...,35,3.97,17549,70,1806,6.4,0.0,43.1,2015,...,3.90,0.005168,2,0,0,0,0.972222,0.750000,21920,39099
44816,Loyola Marymount University,21,3.75,28854,52,585,9.8,1.0,55.6,2013,...,3.50,0.146862,2,0,0,0,0.805556,0.777778,68764,68764


In [357]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression

split_on = None
for val, df_s in split_df_on(split_on).items():
    k = 1
    
    df0 = df_s[df.status == 0]
    df2 = df_s[df.status == 2].sample(n = int(k*len(df0)))
    
    df_final = pd.concat([df0, df2])
    schools = df_final['school']
    df_final = df_final.drop('school', axis=1)
    X = df_final.drop('status', axis=1)
    y = df_final['status']
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    mlp = MLPClassifier(hidden_layer_sizes=(21, 21, 21),max_iter=1000)
    mlp.fit(X_train,y_train)
    predictions = mlp.predict(X_test)
    print(confusion_matrix(y_test,predictions))
    print(classification_report(y_test,predictions))

    logisticRegr = LogisticRegression()
    logisticRegr.fit(X_train, y_train)
    score = logisticRegr.score(X_test, y_test)
    print(f"Logistic Regression Score for {split_on} {val}: " + str(score))



  return self.partial_fit(X, y)


[[1805  413]
 [ 475 1702]]
              precision    recall  f1-score   support

           0       0.79      0.81      0.80      2218
           2       0.80      0.78      0.79      2177

   micro avg       0.80      0.80      0.80      4395
   macro avg       0.80      0.80      0.80      4395
weighted avg       0.80      0.80      0.80      4395

Logistic Regression Score for None all: 0.7904436860068259




In [358]:
old_x = X.copy()
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

from collections import defaultdict
experimental_acceptance_rates = defaultdict(lambda: defaultdict(int))
for res, school in zip(mlp.predict(X), schools):
    if res == 2:
        experimental_acceptance_rates[school]['reject'] += 1
    elif res == 0:
        experimental_acceptance_rates[school]['accept'] += 1


  return self.partial_fit(X, y)
  after removing the cwd from sys.path.


In [359]:
experimental_acceptance_rates

defaultdict(<function __main__.<lambda>()>,
            {'Albany College of Pharmacy and Health Sciences': defaultdict(int,
                         {'reject': 3}),
             'Alfred University': defaultdict(int, {'accept': 1, 'reject': 8}),
             'Allegheny College': defaultdict(int,
                         {'reject': 12, 'accept': 2}),
             'DeSales University': defaultdict(int,
                         {'accept': 2, 'reject': 2}),
             'Alvernia University': defaultdict(int,
                         {'accept': 1, 'reject': 1}),
             'Saint Anselm College': defaultdict(int,
                         {'accept': 2, 'reject': 4}),
             'Assumption College': defaultdict(int,
                         {'accept': 4, 'reject': 3}),
             "St. John's University": defaultdict(int,
                         {'accept': 8, 'reject': 56}),
             "Saint Joseph's University": defaultdict(int,
                         {'reject': 21, 'accept': 1})

In [360]:
compare_vs_ground_truth = defaultdict(str)

for (index, row), school in zip(old_x.iterrows(), schools):
    accept_rate = None
    if experimental_acceptance_rates[school]['accept'] != 0 and experimental_acceptance_rates[school]['accept'] != 0:
        accept_rate = experimental_acceptance_rates[school]['accept']/(experimental_acceptance_rates[school]['accept']+experimental_acceptance_rates[school]['reject'])
    elif experimental_acceptance_rates[school]['accept'] == 0:
        accept_rate = 0
    elif experimental_acceptance_rates[school]['reject'] == 0:
        accept_rate = 1
    
    compare_vs_ground_truth[school] = [accept_rate, int(row['admission_rate'])/100]

In [361]:
compare_vs_ground_truth

defaultdict(str,
            {'Albany College of Pharmacy and Health Sciences': [0, 0.67],
             'Alfred University': [0.1111111111111111, 0.63],
             'Allegheny College': [0.14285714285714285, 0.68],
             'DeSales University': [0.5, 0.74],
             'Alvernia University': [0.5, 0.74],
             'Saint Anselm College': [0.3333333333333333, 0.76],
             'Assumption College': [0.5714285714285714, 0.79],
             "St. John's University": [0.125, 0.68],
             "Saint Joseph's University": [0.045454545454545456, 0.77],
             'St. Lawrence University': [0.8333333333333334, 0.48],
             'Ramapo College of New Jersey': [0.2222222222222222, 0.57],
             'College of St. Rose': [0, 0.83],
             'St. Thomas Aquinas College': [0.25, 0.79],
             'Salve Regina University': [0, 0.72],
             'Sarah Lawrence College': [0.3333333333333333, 0.53],
             'University of Scranton': [0.21739130434782608, 0.76],
   