In [4]:
import pandas as pd
import sqlite3
import re

conn = sqlite3.connect("profiles.db")

def convert_class_rank(cr_str):
    cr = cr_str.split(' ')
    return int(cr[0])/int(cr[2])

def get_type_map(df_column):
    return {t:i for i, t in enumerate(set(df_column))}

def convert_sat_to_act(sat):
    if sat >= 2380:
        return 36
    if sat < 2380 and sat >= 2290:
        return 35
    if sat < 2290 and sat >= 2220:
        return 34
    if sat < 2220 and sat >= 2140:
        return 33
    if sat < 2140 and sat >= 2080:
        return 32
    if sat < 2080 and sat >= 2020:
        return 31
    if sat < 2020 and sat >= 1980:
        return 30
    if sat < 1980 and sat >= 1920:
        return 29
    if sat < 1920 and sat >= 1860:
        return 28
    if sat < 1860 and sat >= 1800:
        return 27
    if sat < 1800 and sat >= 1740:
        return 26
    if sat < 1740 and sat >= 1680:
        return 25
    if sat < 1680 and sat >= 1620:
        return 24
    if sat < 1620 and sat >= 1560:
        return 23
    if sat < 1560 and sat >= 1510:
        return 22
    if sat < 1510 and sat >= 1450:
        return 21
    if sat < 1450 and sat >= 1390:
        return 20
    if sat < 1390 and sat >= 1330:
        return 19
    if sat < 1330 and sat >= 1270:
        return 18
    if sat < 1270 and sat >= 1210:
        return 17
    if sat < 1210 and sat >= 1140:
        return 16
    if sat < 1140 and sat >= 1060:
        return 15
    if sat < 1060 and sat >= 990:
        return 14
    if sat < 990:
        return 13

df = pd.read_sql_query("SELECT * FROM profiles", conn)
# sums individual scores and drops them
df['sat_c'] = df['sat_m']+df['sat_r']+df['sat_w']
for col in ['sat_m', 'sat_r', 'sat_w']:
    df = df.drop(col, axis=1)
# converts all sat to act if act is not present
df['act'] = df.apply(lambda x: convert_sat_to_act(x['sat_c'] if pd.isnull(x['act']) else x['act']), axis=1)
# drop nan
df = df.dropna()
# drop class ranks that are not in "1 of 200" format
df = df[df.class_rank.str.match('\d* (of) \d*', na=False)]
# converts class rank to decimal
df['class_rank'] = df['class_rank'].apply(lambda x: convert_class_rank(x))
# convert string label to int
replace_columns = ['hs_type', 'gender', 'status', 'hs_state', 'school']
df['status'].replace({'Deferred': 'Denied', 'Wait-Listed': 'Denied', 'Will Attend': 'Accepted'}, inplace=True)
for col in replace_columns:
    df[col].replace(get_type_map(df[col]), inplace=True)


In [5]:
from sklearn.model_selection import train_test_split

X = df.drop('year', axis=1)
X = X.drop('status', axis=1)
X = X.drop('hs_type', axis=1)
X = X.drop('hs_state', axis=1)
X = X.drop('athlete', axis=1)
X = X.drop('sat_c', axis=1)
print(X)
y = df['status']
X_train, X_test, y_train, y_test = train_test_split(X, y)


       school  gender  gpa_w  gpa_uw   act  class_rank  eaed  legacy
0          49       1   3.50    3.00  20.0    0.210753     0       0
3         109       1   4.50    3.90  13.0    0.017544     0       0
7         109       1   4.00    3.80  25.0    0.075231     0       0
9         109       1   3.90    3.60  31.0    0.149626     0       0
11        109       1   3.80    3.60  25.0    0.097059     0       0
15        109       1   4.00    4.00  32.0    0.009259     0       0
17        109       1   3.89    3.47  24.0    0.145251     1       0
19        109       1   4.00    3.80  13.0    0.087912     0       0
21        109       1   3.90    3.90  26.0    0.055556     1       0
22        109       0   4.00    3.70  13.0    0.079295     0       0
23        109       1   3.06    2.65  24.0    0.093023     1       0
24        109       0   3.30    3.20  26.0    0.383966     1       0
26        109       0   3.70    3.60  26.0    0.164948     0       0
30        109       0   3.85    3.

In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """


In [7]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(8,8,8),max_iter=1000)


In [8]:
mlp.fit(X_train,y_train)


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(8, 8, 8), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [9]:
predictions = mlp.predict(X_test)


In [10]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))


[[747  29]
 [215  35]]
              precision    recall  f1-score   support

           0       0.78      0.96      0.86       776
           1       0.55      0.14      0.22       250

   micro avg       0.76      0.76      0.76      1026
   macro avg       0.66      0.55      0.54      1026
weighted avg       0.72      0.76      0.70      1026



In [11]:
from sklearn.linear_model import LogisticRegression
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
score = logisticRegr.score(X_test, y_test)
print(score)

0.7563352826510721


