In [7]:
import pandas as pd
import numpy as np
import sqlite3
import re

conn = sqlite3.connect("profiles.db")
cursor = conn.cursor()
cursor.execute("ATTACH DATABASE 'colleges.db' AS colleges")

<sqlite3.Cursor at 0x7f5a824f5f80>

In [8]:
df = pd.read_sql_query("SELECT * FROM colleges.university_profiles AS c JOIN profiles AS p ON c.school = p.school",conn)

In [9]:
def get_type_map(df_column):
    return {t:i for i, t in enumerate(set(df_column))}

def convert_sat_to_act(sat):
    if np.isnan(sat):
        return np.nan
    sat_scores = np.array([990, 1060, 1140, 1210, 1270, 1330, 1390, 1450, 1510, 1560, 1620, 1680, 1740, 1800, 1860, 1920, 1980, 2020, 2080, 2140, 2220, 2290, 2380, 2410])
    return np.where(sat < sat_scores)[0][0] + 13
    
def combine_test_scores(x):
    if not np.isnan(x['act']):
        return x['act']/36.0
    sat = np.array([x['sat_m'], x['sat_r'], x['sat_w']])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def convert_string(x):
    try:
        return int(x)
    except:
        return np.nan

def combine_avg_test_scores(x):
    if not np.isnan(convert_string(x['avg_act'])):
        return convert_string(x['avg_act'])/36.0
    sat = np.array([convert_string(x['sat_math']), convert_string(x['sat_reading']), convert_string(x['sat_writing'])])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def map_statuses(x):
    status_map = {"Denied": 0, "Deferred": np.nan, "Wait-Listed": np.nan, "Accepted": 1, "Will Attend": 1}
    return status_map[x['status']]

def convert_class_rank(x):
    s = str(x['class_rank'])
    if " of " in s:
        return float(s[:s.index(" of ")])/float(s[s.index(" of ") + 4:])
    if "Top " in s:
        return float(s[4:s.index("%")])/100.0
    if "Bottom " in s:
        return float(s[7:s.index("%")])/100.0
    return np.nan

def convert_instate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        instate = x['cost_attendance'].split("<br>")[0]
        return instate[instate.index("$")+1:]
    else:
        return x['cost_attendance']

def convert_outstate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        outstate = x['cost_attendance'].split("<br>")[1]
        return outstate[outstate.index("$")+1:]
    else:
        return x['cost_attendance']

def get_tuition(x):
    return x['in_state_tuition'] if x['state'] == x['hs_state'] else x['out_state_tuition']

def map_hs_types(x):
    hs_type_map = {"Public": 0, "Private": 1, "Parochial": 2, "Home": 3, np.nan: np.nan}
    return hs_type_map[x['hs_type']]

def map_ins_types(x):
    ins_type_map = {"Public": 0, "Private": 1, "Private for-profit": 1, "&nbsp;": np.nan}
    return ins_type_map[x['institution_type']]

In [10]:
df = df.loc[:,~df.columns.duplicated()]
df.fillna(value=pd.np.nan, inplace=True)
df['avg_act'] = df['avg_act'].map(lambda x: str(x).strip('-'))
df['test_score'] = df.apply(combine_test_scores, axis=1)
df['avg_test_score'] = df.apply(combine_avg_test_scores, axis=1)
df['class_rank'] = df.apply(convert_class_rank, axis=1)
df['institution_type'] = df.apply(map_ins_types, axis=1)
df['status'] = df.apply(map_statuses, axis=1)
df['hs_type'] = df.apply(map_hs_types, axis=1)
df['in_state_tuition'] = df.apply(convert_instate_tuition, axis=1)
df['out_state_tuition'] = df.apply(convert_outstate_tuition, axis=1)
df['tuition'] = df.apply(get_tuition, axis=1)

c0 = .5
c1 = .4
c2 = .1
df['split_metric'] = c0*df['test_score'] + c1*df['gpa_uw'] + c2*(1-df['class_rank'])

for col in [',in_state_tuition',',out_state_tuition','cost_attendance','school', ',city', 'sat_m', 'sat_r', 'sat_w', 'act', 'sat_math', 'sat_reading', 'sat_writing', 'avg_act', ',gpa_w']:
    try:
        df = df.drop(col, axis=1)
    except:
        pass
    
df = df.dropna()
replace_columns = ['hs_state', 'state', 'gender', 'city']
for col in replace_columns:
    try:
        df[col].replace(get_type_map(df[col]), inplace=True)
    except:
        pass

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import precision_recall_fscore_support as score

k = 1
df0 = df[df.status == 0]
df2 = df[df.status == 1].sample(n = int(k*len(df0)))
df_final = pd.concat([df0, df2])
df_final = df_final.sort_values(by=['split_metric'])
X = df_final.drop('status', axis=1)
X = X.drop('split_metric', axis=1)
y = df_final['status']

mlps = []
clusters = 3
cluster_tables = []
for i in range(0,len(X)-clusters,int(len(X)/clusters)):
    cluster_tables.append(X[i:i+int(len(X)/clusters)])
    X_train, X_test, y_train, y_test = train_test_split(X[i:i+int(len(X)/clusters)], y[i:i+int(len(X)/clusters)])
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    mlp = MLPClassifier(hidden_layer_sizes=(8,8,8),max_iter=1000)
    mlp.fit(X_train,y_train)
    pred = mlp.predict(X_test)
    mlps.append(score(y_test,pred))

# logisticRegr = LogisticRegression()
# logisticRegr.fit(X_train, y_train)
# pred = logisticRegr.predict(X_test)
# logs.append(score(y_test,pred))

# ridgeRegr = RidgeClassifier()
# ridgeRegr.fit(X_train, y_train)
# pred = ridgeRegr.predict(X_test)
# rids.append(score(y_test,pred))

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [67]:
totals = (sum([x[3][0] for x in mlps]),sum(x[3][1] for x in mlps))
sums = [([x[0][0]*x[3][0], x[0][1]*x[3][1]],[x[1][0]*x[3][0], x[1][1]*x[3][1]],[x[2][0]*x[3][0], x[2][1]*x[3][1]]) for x in mlps]
sums = ((sum([x[0][0] for x in sums])+sum([x[0][1] for x in sums]))/sum(totals),
        (sum([x[1][0] for x in sums])+sum([x[1][1] for x in sums]))/sum(totals),
        (sum([x[2][0] for x in sums])+sum([x[2][1] for x in sums]))/sum(totals))
print(sums)

(0.8520595957387145, 0.8500905797101449, 0.8501707424542374)


In [69]:
for df in cluster_tables:
    print(df['gpa_uw'].mean())
    print(df['class_rank'].mean())
    print(df['test_score'].mean())

3.1969942196531793
0.326322319798781
0.6755827571876534
3.7178272696361785
0.13615349686921002
0.7886867656503833
3.9439102346140773
0.049140972157810485
0.8930352487815937


In [65]:
mlps

[(array([0.75749319, 0.78590786]),
  array([0.77871148, 0.7651715 ]),
  array([0.7679558 , 0.77540107]),
  array([357, 379])),
 (array([0.86703601, 0.904     ]),
  array([0.89684814, 0.87596899]),
  array([0.88169014, 0.88976378]),
  array([349, 387])),
 (array([0.86823529, 0.86495177]),
  array([0.89781022, 0.82769231]),
  array([0.88277512, 0.84591195]),
  array([411, 325]))]

In [55]:
save

Unnamed: 0,state,city,avg_gpa,average_freshman_aid,admission_rate,faculty_total,international_percent,institution_type,female_percentage,year,...,gpa_uw,class_rank,eaed,legacy,athlete,test_score,avg_test_score,in_state_tuition,out_state_tuition,tuition
52580,31,500,3.26,14067,53,808,1.9,0.0,49.8,2011,...,2.60,0.500000,0,0,0,0.611111,0.611111,27622,37087,27622
31704,18,406,3.47,24020,77,654,3.8,0.0,54.7,2014,...,2.90,0.534091,0,0,0,0.694444,0.694444,33758,49138,49138
46190,28,200,3.79,17500,46,2055,14.,0.0,52.9,2018,...,3.24,0.500000,0,0,0,0.777778,0.805556,25948,49986,49986
93525,39,178,3.63,15835,81,1484,10.,0.0,51.7,2020,...,3.32,0.250000,0,0,0,0.583333,0.638889,17936,39580,39580
25161,23,384,3.36,14438,68,797,2.6,0.0,56.9,2017,...,3.03,0.500000,0,0,0,0.694444,0.638889,24076,35236,24076
52139,9,202,4.02,14486,49,1448,1.5,0.0,55.7,2012,...,2.63,0.428571,0,0,0,0.805556,0.777778,22625,37791,22625
85583,11,285,3.72,13222,51,2245,4.7,0.0,45.4,2020,...,2.81,0.287926,0,0,0,0.833333,0.805556,23976,43320,23976
28100,50,455,3.54,9672,74,997,2.0,0.0,59.8,2018,...,2.80,0.556250,0,0,0,0.833333,0.666667,27142,36606,27142
63191,22,136,3.44,15330,74,1329,1.8,0.0,52.1,2013,...,0.00,0.053571,0,1,1,0.027778,0.722222,29704,46381,46381
82826,12,37,3.97,17549,70,1806,6.4,0.0,43.1,2013,...,3.30,0.543417,0,0,0,0.555556,0.750000,21920,39099,21920
