In [217]:
import pandas as pd
import sqlite3
import re
X_train, X_test, y_train, y_test = train_test_split(X, y)

conn = sqlite3.connect("profiles.db")

def convert_class_rank(cr_str):
    cr = cr_str.split(' ')
    return int(cr[0])/int(cr[2])

def get_type_map(df_column):
    return {t:i for i, t in enumerate(set(df_column))}

def convert_sat_to_act(sat):
    if sat >= 2380:
        return 36
    if sat < 2380 and sat >= 2290:
        return 35
    if sat < 2290 and sat >= 2220:
        return 34
    if sat < 2220 and sat >= 2140:
        return 33
    if sat < 2140 and sat >= 2080:
        return 32
    if sat < 2080 and sat >= 2020:
        return 31
    if sat < 2020 and sat >= 1980:
        return 30
    if sat < 1980 and sat >= 1920:
        return 29
    if sat < 1920 and sat >= 1860:
        return 28
    if sat < 1860 and sat >= 1800:
        return 27
    if sat < 1800 and sat >= 1740:
        return 26
    if sat < 1740 and sat >= 1680:
        return 25
    if sat < 1680 and sat >= 1620:
        return 24
    if sat < 1620 and sat >= 1560:
        return 23
    if sat < 1560 and sat >= 1510:
        return 22
    if sat < 1510 and sat >= 1450:
        return 21
    if sat < 1450 and sat >= 1390:
        return 20
    if sat < 1390 and sat >= 1330:
        return 19
    if sat < 1330 and sat >= 1270:
        return 18
    if sat < 1270 and sat >= 1210:
        return 17
    if sat < 1210 and sat >= 1140:
        return 16
    if sat < 1140 and sat >= 1060:
        return 15
    if sat < 1060 and sat >= 990:
        return 14
    if sat < 990:
        return 13

df = pd.read_sql_query("SELECT * FROM profiles", conn)
df['sat_c'] = df['sat_m']+df['sat_r']+df['sat_w']
for col in ['sat_m', 'sat_r', 'sat_w']:
    df = df.drop(col, axis=1)
df['act'] = df.apply(lambda x: convert_sat_to_act(x['sat_c'] if pd.isnull(x['act']) else x['act']), axis=1)
df = df.dropna()
df = df[df.class_rank.str.match('\d* (of) \d*', na=False)]
df['class_rank'] = df['class_rank'].apply(lambda x: convert_class_rank(x))
replace_columns = ['hs_type', 'gender', 'status', 'hs_state', 'school']
df['status'].replace({'Deferred': 'Denied', 'Wait-Listed': 'Denied', 'Will Attend': 'Accepted'}, inplace=True)
print(get_type_map(df['status']))
for col in replace_columns:
    df[col].replace(get_type_map(df[col]), inplace=True)


{'Denied': 0, 'Accepted': 1}


In [224]:
from sklearn.model_selection import train_test_split
X = df.drop('year', axis=1)
X = X.drop('status', axis=1)
X = X.drop('hs_type', axis=1)
X = X.drop('hs_state', axis=1)
X = X.drop('athlete', axis=1)
X = X.drop('sat_c', axis=1)
y = df['status']


In [225]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [226]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(11,11,11),max_iter=1000)


In [227]:
mlp.fit(X_train,y_train)


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(11, 11, 11), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [228]:
predictions = mlp.predict(X_test)


In [229]:
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))


[[119 138]
 [108 661]]
              precision    recall  f1-score   support

           0       0.52      0.46      0.49       257
           1       0.83      0.86      0.84       769

   micro avg       0.76      0.76      0.76      1026
   macro avg       0.68      0.66      0.67      1026
weighted avg       0.75      0.76      0.76      1026

