In [1]:
import pandas as pd
import numpy as np
import sqlite3
import re
from sklearn.cluster import KMeans

conn = sqlite3.connect("profiles.db")
cursor = conn.cursor()
cursor.execute("ATTACH DATABASE 'colleges.db' AS colleges")

<sqlite3.Cursor at 0x7f6c3df0bf10>

In [2]:
df = pd.read_sql_query("SELECT * FROM colleges.university_profiles AS c JOIN profiles AS p ON c.school = p.school",conn)
max_index = len(df.index)

In [3]:
def get_type_map(df_column):
    return {t:i for i, t in enumerate(set(df_column))}

def convert_sat_to_act(sat):
    if np.isnan(sat):
        return np.nan
    sat_scores = np.array([990, 1060, 1140, 1210, 1270, 1330, 1390, 1450, 1510, 1560, 1620, 1680, 1740, 1800, 1860, 1920, 1980, 2020, 2080, 2140, 2220, 2290, 2380, 2410])
    return np.where(sat < sat_scores)[0][0] + 13
    
def combine_test_scores(x):
    if not np.isnan(x['act']):
        return x['act']/36.0
    sat = np.array([x['sat_m'], x['sat_r'], x['sat_w']])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def convert_string(x):
    try:
        return int(x)
    except:
        return np.nan

def combine_avg_test_scores(x):
    if not np.isnan(convert_string(x['avg_act'])):
        return convert_string(x['avg_act'])/36.0
    sat = np.array([convert_string(x['sat_math']), convert_string(x['sat_reading']), convert_string(x['sat_writing'])])
    return convert_sat_to_act(3*sat[~np.isnan(sat)].mean())/36.0 if len(sat[np.isnan(sat)]) != 3 else np.nan

def map_statuses(x):
    status_map = {"Denied": 0, "Deferred": np.nan, "Wait-Listed": np.nan, "Accepted": 2, "Will Attend": 2}
    return status_map[x['status']]

def convert_class_rank(x):
    s = str(x['class_rank'])
    if " of " in s:
        return float(s[:s.index(" of ")])/float(s[s.index(" of ") + 4:])
    if "Top " in s:
        return float(s[4:s.index("%")])/100.0
    if "Bottom " in s:
        return float(s[7:s.index("%")])/100.0
    return np.nan

def convert_instate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        instate = x['cost_attendance'].split("<br>")[0]
        return instate[instate.index("$")+1:]
    else:
        return x['cost_attendance']

def convert_outstate_tuition(x):
    if "<br>" in str(x['cost_attendance']):
        outstate = x['cost_attendance'].split("<br>")[1]
        return outstate[outstate.index("$")+1:]
    else:
        return x['cost_attendance']

def get_tuition(x):
    return x['in_state_tuition'] if x['state'] == x['hs_state'] else x['out_state_tuition']

def map_hs_types(x):
    hs_type_map = {"Public": 0, "Private": 1, "Parochial": 2, "Home": 3}
    return hs_type_map[x['hs_type']]

def map_ins_types(x):
    ins_type_map = {"Public": 0, "Private": 1, "Private for-profit": 1, "&nbsp;": np.nan}
    return ins_type_map[x['institution_type']]

In [4]:
df = df.loc[:,~df.columns.duplicated()]
df.fillna(value=pd.np.nan, inplace=True)
df['avg_act'] = df['avg_act'].map(lambda x: str(x).strip('-'))
df['test_score'] = df.apply(combine_test_scores, axis=1)
df['avg_test_score'] = df.apply(combine_avg_test_scores, axis=1)
df['class_rank'] = df.apply(convert_class_rank, axis=1)
df['institution_type'] = df.apply(map_ins_types, axis=1)
df['in_state_tuition'] = df.apply(convert_instate_tuition, axis=1)
df['out_state_tuition'] = df.apply(convert_outstate_tuition, axis=1)
#df['tuition'] = df.apply(get_tuition, axis=1)
df['status'] = df.apply(map_statuses, axis=1)

for col in ['cost_attendance', 'city', 'sat_m', 'sat_r', 'sat_w', 'act', 'sat_math', 'sat_reading', 'sat_writing', 'avg_act', 'gpa_w']:
    df = df.drop(col, axis=1)
df = df.dropna()#subset=['test_score','avg_test_score','gpa_uw','class_rank','hs_state','hs_type','institution_type'])

df['hs_type'] = df.apply(map_hs_types, axis=1)
replace_columns = ['hs_state', 'state', 'gender', 'school']
for col in replace_columns:
    df[col].replace(get_type_map(df[col]), inplace=True)
    
dfc = df.copy()
df_plain = df.copy()
for col in ['avg_gpa','hs_state','school','hs_type','avg_test_score','state','average_freshman_aid','admission_rate','faculty_total','international_percent','institution_type','female_percentage','year','gender','status','eaed','legacy','athlete','in_state_tuition','out_state_tuition']:
    dfc = dfc.drop(col, axis=1)

def getNextTableIndex(index,j):
    isValid = False
    while(not isValid):
        try:
            dfc['class_rank'][index]
            return index, j+1
        except:
            index += 1

print("Setting up points")
points = []
i = 0
j=0
while i < 97686:
    i, j = getNextTableIndex(i,j)
    point = []
    for col in dfc.columns.values:
        point.append(dfc[col].get(i))
    points.append(point)
    i += 1
print("Done")
        
cluster = KMeans(n_clusters=1500, max_iter=500).fit(points)



Setting up points
Done


In [5]:
print("Assigning clusters")
i = 0
j = 0
while i < 97686:
    label = cluster.labels_[j]
    i, j = getNextTableIndex(i,j)
    dfc.set_value(i, "cluster",label)
    i += 1
    
for col in ['gpa_uw', 'class_rank', 'test_score']:
    df = df.drop(col, axis=1)

df = df.join(dfc, how='outer')
df.dropna()

Assigning clusters


  import sys


Unnamed: 0,school,state,avg_gpa,average_freshman_aid,admission_rate,faculty_total,international_percent,institution_type,female_percentage,year,...,eaed,legacy,athlete,avg_test_score,in_state_tuition,out_state_tuition,gpa_uw,class_rank,test_score,cluster
0,411,1,3.30,11104,88,37,1.8,1.0,50.6,2014,...,0,0,0,0.583333,35214,35214,3.00,0.210753,0.555556,625.0
2,845,33,3.52,21500,73,335,4.0,1.0,68.4,2010,...,0,0,0,0.694444,56738,56738,2.80,0.500000,0.583333,816.0
3,845,33,3.52,21500,73,335,4.0,1.0,68.4,2010,...,0,0,0,0.694444,56738,56738,2.50,0.500000,0.583333,1400.0
4,845,33,3.52,21500,73,335,4.0,1.0,68.4,2010,...,0,0,0,0.694444,56738,56738,3.90,0.017544,0.805556,410.0
5,845,33,3.52,21500,73,335,4.0,1.0,68.4,2011,...,0,0,0,0.694444,56738,56738,3.20,0.250000,0.777778,351.0
6,845,33,3.52,21500,73,335,4.0,1.0,68.4,2011,...,0,0,0,0.694444,56738,56738,4.00,0.023891,0.777778,81.0
7,845,33,3.52,21500,73,335,4.0,1.0,68.4,2012,...,0,0,0,0.694444,56738,56738,3.60,0.149626,0.861111,1076.0
10,845,33,3.52,21500,73,335,4.0,1.0,68.4,2012,...,0,0,0,0.694444,56738,56738,3.60,0.097059,0.694444,411.0
12,845,33,3.52,21500,73,335,4.0,1.0,68.4,2012,...,0,0,0,0.694444,56738,56738,3.80,0.075231,0.694444,828.0
13,845,33,3.52,21500,73,335,4.0,1.0,68.4,2012,...,0,0,0,0.694444,56738,56738,3.90,0.100000,0.638889,856.0


In [6]:
generated_df = pd.DataFrame(columns=df.columns.values)
for label in set(cluster.labels_):
    for index, row in df[df.cluster == label].iterrows():
        temp = df[df.cluster == label].copy()
        temp['school'] = row['school']
        generated_df = generated_df.append(temp)
    print("Cluster {} done".format(label))
generated_df


Cluster 0 done
Cluster 1 done
Cluster 2 done
Cluster 3 done
Cluster 4 done
Cluster 5 done
Cluster 6 done
Cluster 7 done
Cluster 8 done
Cluster 9 done
Cluster 10 done
Cluster 11 done
Cluster 12 done
Cluster 13 done
Cluster 14 done
Cluster 15 done
Cluster 16 done
Cluster 17 done
Cluster 18 done
Cluster 19 done
Cluster 20 done
Cluster 21 done
Cluster 22 done
Cluster 23 done
Cluster 24 done
Cluster 25 done
Cluster 26 done
Cluster 27 done
Cluster 28 done
Cluster 29 done
Cluster 30 done
Cluster 31 done
Cluster 32 done
Cluster 33 done
Cluster 34 done
Cluster 35 done
Cluster 36 done
Cluster 37 done
Cluster 38 done
Cluster 39 done
Cluster 40 done
Cluster 41 done
Cluster 42 done
Cluster 43 done
Cluster 44 done
Cluster 45 done
Cluster 46 done
Cluster 47 done
Cluster 48 done
Cluster 49 done
Cluster 50 done
Cluster 51 done
Cluster 52 done
Cluster 53 done
Cluster 54 done
Cluster 55 done
Cluster 56 done
Cluster 57 done
Cluster 58 done
Cluster 59 done
Cluster 60 done
Cluster 61 done
Cluster 62 done
Cl

KeyboardInterrupt: 

In [7]:
generated_df

Unnamed: 0,school,state,avg_gpa,average_freshman_aid,admission_rate,faculty_total,international_percent,institution_type,female_percentage,year,...,eaed,legacy,athlete,avg_test_score,in_state_tuition,out_state_tuition,gpa_uw,class_rank,test_score,cluster
284,559,1,3.33,22520,74,108,0.4,1.0,73.9,2018,...,0,0,0,0.583333,49530,49530,3.53,0.155477,0.694444,0.0
805,559,47,3.71,47010,68,26,21.,1.0,75.1,2013,...,0,0,0,0.777778,54969,54969,3.53,0.134276,0.694444,0.0
1764,559,15,3.36,29388,72,127,2.0,1.0,66.8,2015,...,1,0,0,0.638889,57746,57746,3.50,0.129630,0.694444,0.0
2074,559,28,3.45,15530,79,459,2.6,1.0,58.8,2016,...,0,0,0,0.694444,59062,59062,3.50,0.145251,0.694444,0.0
2249,559,33,3.78,40705,43,467,13.,1.0,32.1,2013,...,0,0,0,0.833333,71998,71998,3.50,0.133333,0.694444,0.0
9469,559,15,3.43,32640,52,298,1.9,1.0,55.4,2015,...,1,0,0,0.777778,65124,65124,3.50,0.129630,0.694444,0.0
24777,559,41,3.57,30669,69,163,1.9,1.0,61.1,2011,...,1,0,0,0.666667,50314,50314,3.50,0.125000,0.694444,0.0
25058,559,19,3.29,18778,68,164,1.1,1.0,55.7,2013,...,0,0,0,0.583333,59640,59640,3.50,0.124424,0.694444,0.0
26025,559,41,3.61,8688,78,2212,4.3,0.0,49.5,2015,...,0,0,0,0.722222,28156,43490,3.50,0.147619,0.694444,0.0
26284,559,37,3.46,30926,71,78,4.6,1.0,49.2,2013,...,1,0,0,0.638889,56924,56924,3.53,0.134276,0.694444,0.0


In [10]:
from sklearn.model_selection import train_test_split, KFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression

k = 1

df0 = df[df.status == 0]
df2 = df[df.status == 2].sample(n = int(k*len(df0)))

dfg0 = generated_df[generated_df.status == 0]
dfg2 = generated_df[generated_df.status == 2].sample(n = int(k*len(dfg0)))

df_final = pd.concat([df0, df2])
dfg_final = pd.concat([dfg0, dfg2])

X = df_final.drop('status', axis=1)
Xg = dfg_final.drop('status', axis=1)

y = df_final['status']
yg = dfg_final['status']

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

scaler = StandardScaler()
scaler.fit(Xg)
Xg = scaler.transform(Xg)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y)

X_train1 = np.append(X_train1,Xg)
y_train1 = np.append(y_train1,yg)

mlp = MLPClassifier(hidden_layer_sizes=(8,8,8),max_iter=1000)



# y_pred = cross_val_predict(mlp, X, y, cv = 4)

# print(classification_report(y,y_pred))

# logisticRegr = LogisticRegression()

# y_pred = cross_val_predict(logisticRegr, X, y, cv = 4)

# print(classification_report(y,y_pred))


mlp.fit(X_train1,y_train1)
predictions = mlp.predict(X_test1)
print(confusion_matrix(y_test1,predictions))
print(classification_report(y_test1,predictions))

# logisticRegr = LogisticRegression()
# logisticRegr.fit(X_train1, y_train1)
# score = logisticRegr.score(X_test1, y_test1)
# print("Logistic Regression Score: " + str(score))

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


ValueError: Expected 2D array, got 1D array instead:
array=[ 0.1471315  -0.26070201 -0.03181542 ... -0.56743683  0.53141328
  0.17949092].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
df

# To-do: Jake and Marco
- 