In [8]:
import pandas as pd
import numpy as np
import sqlite3
import re

conn = sqlite3.connect("profiles.db")
cursor = conn.cursor()
cursor.execute("ATTACH DATABASE 'colleges.db' AS colleges")

<sqlite3.Cursor at 0x7fc0a8fa2500>

In [9]:
df = pd.read_sql_query("SELECT * FROM colleges.university_profiles AS c JOIN profiles AS p ON c.school = p.school",conn)

In [10]:
def get_type_map(df_column):
    return {t:i for i, t in enumerate(set(df_column))}

def convert_sat_to_act(sat):
    if np.isnan(sat):
        return np.nan
    sat_scores = np.array([990, 1060, 1140, 1210, 1270, 1330, 1390, 1450, 1510, 1560, 1620, 1680, 1740, 1800, 1860, 1920, 1980, 2020, 2080, 2140, 2220, 2290, 2380, 2410])
    return np.where(sat < sat_scores)[0][0] + 13
    
def combine_test_scores(x):
    if not np.isnan(x['act']):
        return x['act']/36.0
    sat = np.array([x['sat_m'], x['sat_r'], x['sat_w']])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def convert_string(x):
    try:
        return int(x)
    except:
        return np.nan

def combine_avg_test_scores(x):
    if not np.isnan(convert_string(x['avg_act'])):
        return convert_string(x['avg_act'])/36.0
    sat = np.array([convert_string(x['sat_math']), convert_string(x['sat_reading']), convert_string(x['sat_writing'])])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def map_statuses(x):
    status_map = {"Denied": 0, "Deferred": 0, "Wait-Listed": 0, "Accepted": 2, "Will Attend": 2}
    return status_map[x['status']]

def convert_class_rank(x):
    s = str(x['class_rank'])
    if " of " in s:
        return float(s[:s.index(" of ")])/float(s[s.index(" of ") + 4:])
    if "Top " in s:
        return float(s[4:s.index("%")])/100.0
    if "Bottom " in s:
        return float(s[7:s.index("%")])/100.0
    return np.nan

def convert_instate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        instate = x['cost_attendance'].split("<br>")[0]
        return instate[instate.index("$")+1:]
    else:
        return x['cost_attendance']

def convert_outstate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        outstate = x['cost_attendance'].split("<br>")[1]
        return outstate[outstate.index("$")+1:]
    else:
        return x['cost_attendance']

def get_tuition(x):
    return x['in_state_tuition'] if x['state'] == x['hs_state'] else x['out_state_tuition']

def map_hs_types(x):
    hs_type_map = {"Public": 0, "Private": 1, "Parochial": 2, "Home": 3}
    return hs_type_map[x['hs_type']]

def map_ins_types(x):
    ins_type_map = {"Public": 0, "Private": 1, "Private for-profit": 1, "&nbsp;": np.nan}
    return ins_type_map[x['institution_type']]

In [11]:
df = df.loc[:,~df.columns.duplicated()]
df.fillna(value=pd.np.nan, inplace=True)
df['avg_act'] = df['avg_act'].map(lambda x: str(x).strip('-'))
df['test_score'] = df.apply(combine_test_scores, axis=1)
df['avg_test_score'] = df.apply(combine_avg_test_scores, axis=1)
df['class_rank'] = df.apply(convert_class_rank, axis=1)
df['institution_type'] = df.apply(map_ins_types, axis=1)
df['in_state_tuition'] = df.apply(convert_instate_tuition, axis=1)
df['out_state_tuition'] = df.apply(convert_outstate_tuition, axis=1)
#df['tuition'] = df.apply(get_tuition, axis=1)

for col in ['admission_rate','cost_attendance','school', 'city', 'sat_m', 'sat_r', 'sat_w', 'act', 'sat_math', 'sat_reading', 'sat_writing', 'avg_act', 'gpa_w']:
    df = df.drop(col, axis=1)
df = df.dropna()#subset=['test_score','avg_test_score','gpa_uw','class_rank','hs_state','hs_type','institution_type'])

df['status'] = df.apply(map_statuses, axis=1)
df['hs_type'] = df.apply(map_hs_types, axis=1)
replace_columns = ['hs_state', 'state', 'gender']
for col in replace_columns:
    df[col].replace(get_type_map(df[col]), inplace=True)

In [12]:
print(len(df.index))

44081


In [13]:
df.sample(n=100)

Unnamed: 0,state,avg_gpa,average_freshman_aid,faculty_total,international_percent,institution_type,female_percentage,year,gender,hs_type,...,gpa_uw,class_rank,status,eaed,legacy,athlete,test_score,avg_test_score,in_state_tuition,out_state_tuition
56248,1,3.60,16455,1359,5.5,0.0,51.2,2019,1,0,...,4.00,0.076923,2,0,1,0,0.666667,0.722222,25090,40858
76738,35,4.02,14735,1115,9.0,0.0,38.0,2017,2,0,...,3.90,0.020305,2,0,0,0,0.888889,0.888889,30604,51200
83648,36,4.27,29009,1482,4.4,0.0,54.7,2018,2,0,...,3.93,0.034056,2,0,0,0,0.972222,0.861111,31506,62335
40806,7,3.95,9996,952,2.2,0.0,47.8,2018,1,0,...,3.86,0.037698,2,0,0,0,0.944444,0.777778,27152,39032
94160,24,3.71,14029,2519,11.,0.0,50.5,2020,2,0,...,3.62,0.085616,2,0,0,0,0.750000,0.722222,28428,53373
51451,38,3.38,38565,110,16.,1.0,57.4,2011,2,0,...,3.80,0.100000,2,1,0,0,0.777778,0.638889,58324,58324
254,2,3.48,42666,167,2.9,1.0,54.5,2020,2,0,...,3.43,0.301887,2,0,0,1,0.527778,0.722222,63180,63180
87049,47,3.48,9498,288,0.7,0.0,60.7,2018,1,0,...,2.70,0.329146,2,1,0,0,0.472222,0.527778,19417,31008
10583,2,3.55,37399,139,1.5,1.0,56.0,2011,1,0,...,3.85,0.100000,2,0,0,0,0.833333,0.611111,59780,59780
40796,7,3.95,9996,952,2.2,0.0,47.8,2018,1,0,...,3.63,0.167006,2,0,0,0,0.722222,0.777778,27152,39032


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression

k = 1
df0 = df[df.status == 0]
df2 = df[df.status == 2].sample(n = int(k*len(df0)))
df_final = pd.concat([df0, df2])
X = df_final.drop('status', axis=1)
y = df_final['status']
X_train, X_test, y_train, y_test = train_test_split(X, y)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

mlp = MLPClassifier(hidden_layer_sizes=(8,8,8),max_iter=1000)
mlp.fit(X_train,y_train)
predictions = mlp.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
score = logisticRegr.score(X_test, y_test)
print("Logistic Regression Score: " + str(score))

  return self.partial_fit(X, y)


[[1746  429]
 [ 467 1756]]
              precision    recall  f1-score   support

           0       0.79      0.80      0.80      2175
           2       0.80      0.79      0.80      2223

   micro avg       0.80      0.80      0.80      4398
   macro avg       0.80      0.80      0.80      4398
weighted avg       0.80      0.80      0.80      4398

Logistic Regression Score: 0.7689859026830378




In [6]:
df.iloc[0]

school                   Bryn Athyn College
state                                    PA
city                             Bryn Athyn
avg_gpa                                3.30
sat_math                                512
sat_writing                             NaN
sat_reading                             533
avg_act                                  21
cost_attendance                       35214
average_freshman_aid                  11104
admission_rate                           88
faculty_total                            37
international_percent                   1.8
institution_type                          1
female_percentage                      50.6
year                                   2014
gender                                    f
hs_type                              Public
hs_state                                 NY
gpa_w                                   3.5
gpa_uw                                    3
sat_m                                   450
sat_r                           