In [541]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn as sk
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA

In [542]:
print('The scikit-learn version is {}.'.format(sk.__version__))

The scikit-learn version is 0.17.1.


In [543]:
def dummy_variable(columns, dataframe):
    enc = OneHotEncoder()
    labler = LabelEncoder()
    
    for colname in columns:
        labels = labler.fit_transform(dataframe[colname])
        labels = labels.reshape(-1,1)
        categor_vars = enc.fit_transform(labels)
        categor_vars = categor_vars.toarray()
        df = pd.DataFrame(categor_vars)
        columns = df.columns
        column_labels = labler.inverse_transform(columns)
        df.columns = [str(col) + "_" + colname for col in column_labels]

        del dataframe[colname]
        dataframe = pd.concat([dataframe.reset_index(drop=True), df.reset_index(drop=True)], axis=1)
    
    return dataframe 

def cross_val(estimator, x, y, k=10, reg=True):
    from sklearn.cross_validation import KFold
    from sklearn.metrics import mean_squared_error
    from sklearn.metrics import accuracy_score
    import numpy as np
    
    
    kf = KFold(len(y), n_folds=k, shuffle=True)
    score = []
    for train_index, test_index in kf:
        X_train, Y_train = x.iloc[train_index,:], y[train_index] 
        X_test, Y_test = x.iloc[test_index,:], y[test_index]
        
        estimator.fit(X_train, Y_train)
        y_predict = estimator.predict(X_test)
        if reg:
            score.append(mean_squared_error(Y_test, y_predict))
        else:
            score.append(accuracy_score(Y_test, y_predict))
    return np.mean(score)


In [544]:
port = pd.read_csv('student-por.csv', sep=";")
port['class'] = "portuguese"
math = pd.read_csv('student-mat.csv', sep=";")
math['class'] = 'math'
data = pd.concat([port, math], axis=0)
data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,class
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,4,0,11,11,portuguese
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,2,9,11,11,portuguese
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,6,12,13,12,portuguese
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,0,14,14,14,portuguese
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,0,11,13,13,portuguese
5,GP,M,16,U,LE3,T,4,3,services,other,...,4,2,1,2,5,6,12,12,13,portuguese
6,GP,M,16,U,LE3,T,2,2,other,other,...,4,4,1,1,3,0,13,12,13,portuguese
7,GP,F,17,U,GT3,A,4,4,other,teacher,...,1,4,1,1,1,2,10,13,13,portuguese
8,GP,M,15,U,LE3,A,3,2,services,other,...,2,2,1,1,1,0,15,16,17,portuguese
9,GP,M,15,U,GT3,T,3,4,other,other,...,5,1,1,1,5,0,12,12,13,portuguese


In [545]:
columns = ['class','Fjob','Mjob','school','sex','address','famsize','Pstatus','Medu','Fedu','reason','guardian'
           ,'schoolsup','famsup','paid','activities','nursery','higher', 'internet', 'romantic']
data = dummy_variable(columns , data)


In [550]:
data['Alc'] = (2.0*data['Walc'] + 5.0*data['Dalc'])/7.0
# Y = data['Alc']
# X = data.drop(['Walc', 'Dalc', 'Alc'], axis=1)
Y = data['Walc']
X = pd.DataFrame(data['Dalc'])


In [551]:
X.columns

Index([u'Dalc'], dtype='object')

In [552]:
cross_val(estimator=LinearRegression(), x=X, y=Y)

1.005485503963836

In [553]:
cross_val(estimator=KNeighborsRegressor(n_neighbors=11), x=X, y=Y)

1.064730194048376

In [554]:
cross_val(estimator=DecisionTreeRegressor(max_depth=3), x=X, y=Y)

0.95001904419072325

In [555]:
Y = data['Alc'].astype(str)

In [556]:
Y.dtypes

dtype('O')

In [557]:
cross_val(estimator=KNeighborsClassifier(n_neighbors=2), x=X, y=Y, reg=False)

0.42044871794871791

In [558]:
cross_val(estimator=LinearDiscriminantAnalysis(), x=X, y=Y, reg=False)

ValueError: On entry to DGESDD parameter number 10 had an illegal value

In [559]:
cross_val(estimator=DecisionTreeClassifier(criterion='gini', max_depth=15), x=X, y=Y, reg=False)

0.50286630036630042

In [560]:
from sklearn.ensemble import RandomForestClassifier
estimators=25
cross_val(estimator=RandomForestClassifier(n_estimators = estimators, random_state=False), x=X, y=Y, reg=False)

0.50387362637362643

In [561]:
rate = .1
ada = AdaBoostClassifier(learning_rate=rate, random_state=False)
cross_val(estimator=ada, x=X, y=Y, reg=False)


0.5037728937728938

In [577]:
from sklearn.ensemble import BaggingClassifier
estimators = 40
bagger = BaggingClassifier(n_estimators=estimators, random_state=False)
cross_val(estimator=bagger, x=X, y=Y, reg=False)

0.50184981684981689

In [578]:
X.columns

Index([u'Dalc'], dtype='object')

In [534]:
Z = X.drop(['math_class', 'portuguese_class'], axis=1)

In [540]:
Z.duplicated().sum()

0