In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA

In [2]:
def dummy_variable(columns, dataframe):
    """Replace the given columns of ``dataframe`` with one-hot indicator columns.

    For every name in ``columns`` the distinct values of that column are
    sorted, one 0.0/1.0 indicator column is built per value and labelled
    ``"<value>_<column name>"``, and the indicator of the last (largest)
    value is dropped so that it acts as the reference level.

    Parameters
    ----------
    columns : iterable of column labels
        Columns to encode. Each must be present in ``dataframe``.
    dataframe : pandas.DataFrame
        Source frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The columns that were not encoded (original order, index reset to
        0..n-1) followed by the indicator columns, grouped in the order the
        names appear in ``columns``. If ``columns`` is empty, ``dataframe``
        itself is returned unchanged.
    """
    columns = list(columns)
    if not columns:
        return dataframe

    # Start from a new frame so the caller's DataFrame keeps all its columns.
    untouched = dataframe.drop(columns, axis=1).reset_index(drop=True)
    pieces = [untouched]

    for colname in columns:
        # get_dummies orders the indicator columns by sorted distinct value;
        # the cast keeps the 0.0/1.0 float representation.
        indicators = pd.get_dummies(dataframe[colname]).astype(float)
        indicators.columns = [str(level) + "_" + colname for level in indicators.columns]
        # Drop the last level and realign on a positional index.
        pieces.append(indicators.iloc[:, :-1].reset_index(drop=True))

    # Single concat instead of growing the frame once per encoded column.
    return pd.concat(pieces, axis=1)

def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or an array-like."""
    if hasattr(data, "iloc"):
        return data.iloc[positions]
    return np.asarray(data)[positions]


def cross_val(estimator, x, y, k=10, reg=True, random_state=None):
    """Return the mean k-fold cross-validation score of ``estimator``.

    Parameters
    ----------
    estimator : object with ``fit(X, y)`` and ``predict(X)``
        Refitted in place on the training part of every fold.
    x : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas.Series or array-like
        Targets, aligned with ``x`` by position.
    k : int, default 10
        Number of folds.
    reg : bool, default True
        If True each fold is scored with ``mean_squared_error`` (lower is
        better), otherwise with ``accuracy_score`` (higher is better).
    random_state : int or None, default None
        Seed forwarded to the shuffling ``KFold``; pass an int to make the
        fold assignment, and therefore the score, repeatable.

    Returns
    -------
    float
        Mean of the per-fold scores.
    """
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import accuracy_score

    metric = mean_squared_error if reg else accuracy_score
    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

    scores = []
    # Split the argument that was passed in, not a same-named notebook global.
    for train_index, test_index in kf.split(x):
        # KFold yields integer positions, so select by position; label-based
        # lookup would fail (or pick wrong rows) on a non-default index.
        X_train, Y_train = _take_rows(x, train_index), _take_rows(y, train_index)
        X_test, Y_test = _take_rows(x, test_index), _take_rows(y, test_index)

        estimator.fit(X_train, Y_train)
        scores.append(metric(Y_test, estimator.predict(X_test)))
    return np.mean(scores)


In [3]:
# Load the two course files and tag every row with the course it came from.
port = pd.read_csv('student-por.csv', sep=";").assign(**{'class': "portuguese"})
math = pd.read_csv('student-mat.csv', sep=";").assign(**{'class': 'math'})

# Stack both files, keep only the first row for each combination of the
# listed attributes, and renumber the rows 0..n-1.
# NOTE(review): presumably this removes students that appear in both files - confirm.
data = (
    pd.concat([port, math], axis=0)
    .drop_duplicates(["school", "sex", "age", "address", "famsize", "Pstatus",
                      "Medu", "Fedu", "Mjob", "Fjob", "reason", "nursery",
                      "internet"])
    .reset_index(drop=True)
)

In [4]:
# Display the column labels of `data`.
data.columns

Index([u'school', u'sex', u'age', u'address', u'famsize', u'Pstatus', u'Medu',
       u'Fedu', u'Mjob', u'Fjob', u'reason', u'guardian', u'traveltime',
       u'studytime', u'failures', u'schoolsup', u'famsup', u'paid',
       u'activities', u'nursery', u'higher', u'internet', u'romantic',
       u'famrel', u'freetime', u'goout', u'Dalc', u'Walc', u'health',
       u'absences', u'G1', u'G2', u'G3', u'class'],
      dtype='object')

In [5]:
# Target: the 'Walc' column. The commented lines are alternative targets
# (a weighted Walc/Dalc blend, optionally thresholded at 3).
Y = data['Walc']
#Y = (data['Walc'] * 2 + data['Dalc'] * 5) / 7.0
#Y = Y >= 3

# Columns that are one-hot encoded by dummy_variable.
columns = [u'school', u'sex', u'famsize', u'Pstatus', u'Medu',
       u'Fedu', u'Mjob', u'Fjob', u'reason', u'guardian', u'traveltime',
       u'studytime', u'failures', u'schoolsup', u'famsup', u'paid',
       u'activities', u'nursery', u'higher', u'internet', u'romantic',
       u'famrel', u'freetime', u'goout','health', 'G1', 'G2', 'G3', 'class','Dalc']
X = data.loc[:,columns]
X = dummy_variable(columns, X)

# Columns appended as raw numeric values.
# NOTE(review): 'failures' and 'G1' are also in `columns`, so they end up in X
# both as indicator columns and as raw values - confirm this is intended.
extra_col = ['failures', 'G1', 'absences', 'age']
X = pd.concat([X, data.loc[:,extra_col]], axis=1)

In [11]:
# (rows, columns) of `data`.
data.shape

(662, 34)

In [13]:
# 'Walc' values of the rows whose 'G2' equals 18.
data.loc[data['G2']==18, 'Walc']

193    3
237    3
328    2
333    2
340    2
345    2
370    3
373    5
409    2
538    3
583    2
585    2
595    1
625    4
Name: Walc, dtype: int64

In [10]:
# How many rows carry each distinct 'G2' value.
data['G2'].value_counts()

11    106
12     87
10     84
13     83
9      75
14     55
8      40
15     37
16     25
17     20
7      16
18     14
0       8
6       7
5       3
4       1
19      1
Name: G2, dtype: int64

In [6]:
# Preview the first rows of the feature matrix X.
X.head()

Unnamed: 0,GP_school,F_sex,GT3_famsize,A_Pstatus,0_Medu,1_Medu,2_Medu,3_Medu,0_Fedu,1_Fedu,...,18_G3,math_class,1_Dalc,2_Dalc,3_Dalc,4_Dalc,failures,G1,absences,age
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,0,4,18
1,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,9,2,17
2,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0,12,6,15
3,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,14,0,15
4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,11,0,16


In [10]:
# Number of rows per 'Walc' value. Grouping X by the empty-string key raises a
# KeyError; the recorded output is the per-'Walc' row count taken from `data`.
data.groupby('Walc')['Walc'].count()

Walc
1    254
2    147
3    124
4     91
5     46
Name: Walc, dtype: int64

In [29]:
from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix: fit the scaler on X, then transform X with it.
# The result is wrapped in a new DataFrame with default integer column labels.
scaler = StandardScaler()
X_scaled_d = pd.DataFrame(scaler.fit(X).transform(X))

In [11]:
# Mean cross-validated accuracy (reg=False) of linear discriminant analysis.
cross_val(LinearDiscriminantAnalysis(), X, Y, reg=False)



0.44084124830393484

In [12]:
# Mean cross-validated accuracy (reg=False) of a Gini decision tree capped at depth 15.
cross_val(
    estimator=DecisionTreeClassifier(max_depth=15, criterion='gini'),
    x=X,
    y=Y,
    reg=False,
)

0.39441429217548618

In [90]:
from sklearn.ensemble import RandomForestClassifier
estimators=50
# Mean cross-validated accuracy (reg=False) of a class-weighted random forest.
# 'balanced' is the supported spelling of the former 'auto' class-weight preset,
# and the seed is written as the integer 0 rather than the boolean False.
cross_val(estimator=RandomForestClassifier(n_estimators=estimators, random_state=0, class_weight='balanced'), x=X, y=Y, reg=False)

0.48948439620081408

In [16]:
# Mean cross-validated accuracy (reg=False) of AdaBoost with a 0.5 learning rate.
rate = .5
# Seed written as the integer 0 rather than the boolean False (same value, clearer intent).
ada = AdaBoostClassifier(learning_rate=rate, random_state=0)
cross_val(estimator=ada, x=X, y=Y, reg=False)

0.48018995929443686

In [111]:
from sklearn.linear_model import LogisticRegression

# Mean cross-validated accuracy (reg=False) of a default logistic regression.
log = LogisticRegression()
cross_val(log, X, Y, reg=False)

0.44724106739032105

In [119]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers (40 and 3 units), tanh activations, L-BFGS solver, fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(40,3),
    activation='tanh',
    solver='lbfgs',
    random_state=0,
)

# Mean cross-validated accuracy (reg=False) of the network.
cross_val(mlp, X, Y, reg=False)

0.39265038444142925

In [67]:
# from sklearn.metrics import confusion_matrix
# from sklearn.model_selection import train_test_split

# train_x, test_x, train_y, test_y = train_test_split(X,Y, train_size=.1)
# log.fit(train_x, train_y)
# pred_y = log.predict(test_x)
# c = confusion_matrix(test_y, pred_y)
# print c[1,1] / float(c[1,1] + c[1,0])
# c

In [91]:
# (rows, columns) of the feature matrix X.
X.shape

(662, 115)