## Import and Load Dataset

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# make matplotlib render plots inline in the notebook
%matplotlib inline

# Load the student-performance table from CSV, keeping the first 33 columns.
# Exactly one of the two read_csv lines below is meant to be active at a time.
dataframe = pd.read_csv('student-por.csv', usecols = range(0,33)) # select Portuguese course dataset
# dataframe = pd.read_csv('student-mat.csv', usecols = range(0,33)) # select Math course dataset

# NumPy array holding the table's values; later cells read the column count from it.
dataset = dataframe.values
# Last expression of the cell, so the notebook displays (n_rows, n_cols).
dataset.shape


(649, 33)

## Preprocessing

* Convert nominal attributes with Integer + One-Hot Encoding
* Convert G3 for different supervised approaches (binary, five-level, or the raw grade for regression)
* NOTE: if we want we can also split further into A,B,C (A= all cols, B=same as A without G2, C=same as B without G1)
    * But leaving this out for now since we know A gives best accuracy

In [57]:
# helper functions for preprocessing
def convertToBinary(df, num_cols):
    """Return the final grade ``G3`` as a binary label array.

    Grades below 10 map to 0; grades of 10 or more map to 1.

    The labels are derived from the ``G3`` column without writing back to
    ``df``. Overwriting ``df.G3`` in place made the conversion
    non-repeatable: a second call on the same frame (for example after
    re-running the notebook cell, or after switching to
    ``convertToFiveLevel``) would see 0/1 labels instead of raw grades and
    collapse every row to class 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a numeric ``G3`` column. It is not modified.
    num_cols : int
        Column count of the raw table. Kept so existing calls keep working;
        the label column is looked up by name, so the value is not used.

    Returns
    -------
    numpy.ndarray
        1-D integer array of 0/1 labels, one entry per row of ``df``.
        A missing grade compares as False and therefore lands in class 0.
    """
    grades = df['G3']
    # The boolean mask "grade >= 10" cast to int is exactly the 0/1 label.
    return (grades >= 10).astype(int).values

def convertToFiveLevel(df, num_cols):
    """Return the final grade ``G3`` binned onto a five-level scale.

    Level mapping (upper edges inclusive):

    * 0: ``G3 <= 9``
    * 1: ``9 < G3 <= 11``
    * 2: ``11 < G3 <= 13``
    * 3: ``13 < G3 <= 16``
    * 4: ``G3 > 16``

    The levels are computed from the ``G3`` values without writing back to
    ``df``. Overwriting ``df.G3`` in place made the conversion
    non-repeatable: once the column holds levels 0-4, a second call (for
    example after re-running the notebook cell) would map every row to
    level 0 because all values are then ``<= 9``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a numeric ``G3`` column. It is not modified.
    num_cols : int
        Column count of the raw table. Kept so existing calls keep working;
        the label column is looked up by name, so the value is not used.

    Returns
    -------
    numpy.ndarray
        1-D integer array of levels 0-4, one entry per row of ``df``.
    """
    upper_edges = [9, 11, 13, 16]
    # right=True closes each bin on its upper edge (edges[i-1] < x <= edges[i]),
    # matching the "<= 9", "<= 11", ... thresholds listed above.
    return np.digitize(df['G3'].values, upper_edges, right=True)

def oneHotEncode(df, num_cols):
    """One-hot encode the nominal attributes and return the feature matrix.

    The column located at position ``num_cols - 1`` of ``df`` (the target,
    ``G3`` in the raw table) is excluded from the result.

    ``pd.get_dummies`` drops every encoded column and appends the indicator
    columns at the end, so column positions in the encoded frame differ from
    those in ``df``. Deleting index ``num_cols - 1`` from the encoded values
    therefore removed an indicator column and left the target inside the
    feature matrix. The target is now resolved by name before encoding and
    dropped by name afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing every column named in ``cols_to_transform``
        plus the target column. It is not modified.
    num_cols : int
        Number of columns in the raw table; ``num_cols - 1`` is the position
        of the target column within ``df``.

    Returns
    -------
    numpy.ndarray
        2-D array with the untouched columns of ``df`` (minus the target)
        followed by the indicator columns.
    """
    cols_to_transform = [
                        'school',
                        'sex',
                        'address',
                        'famsize',
                        'Pstatus',
                        'Mjob',
                        'Fjob',
                        'reason',
                        'guardian',
                        'famsup',
                        'schoolsup',
                        'paid',
                        'activities',
                        'nursery',
                        'higher',
                        'internet',
                        'romantic',
                        ]
    # Look the target up by name while positions still refer to the raw table.
    target_col = df.columns[num_cols - 1]

    hot_encoded_df = pd.get_dummies(df, columns = cols_to_transform)

    # Drop by label: the target's position changed once the dummies were added.
    attributes = hot_encoded_df.drop(target_col, axis=1).values
    return attributes


# shuffle dataset (disabled; note that `dataset_por` is not defined in the cells shown here)
# np.random.shuffle(dataset_por)

# number of columns in the raw table (attributes plus G3)
num_cols = dataset.shape[1]

# build the attribute matrix X from the one-hot encoded frame;
# the commented line is the earlier variant that used the raw values without encoding
# X = np.delete(dataset, obj=num_cols-1, axis=1)
X = oneHotEncode(dataframe, num_cols)

# select the supervised approach for the target Y: keep exactly one line active
# NOTE(review): the converters receive the same `dataframe` object used for X above;
# if a converter modifies it in place, re-running this cell would re-bin labels — confirm.
#Y = convertToBinary(dataframe, num_cols) # sets G3 to binary
Y = convertToFiveLevel(dataframe, num_cols) # set G3 to five-level scale
# Y = dataset[:,num_cols-1] # set G3 to current state for regression



In [59]:
# Y
# X

Unnamed: 0,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,...,Walc,health,absences,G1,G2,G3,school_GP,school_MS,sex_F,sex_M
0,18,U,GT3,A,4,4,at_home,teacher,course,mother,...,1,3,4,0,11,1,1,0,1,0
1,17,U,GT3,T,1,1,at_home,other,course,father,...,1,3,2,9,11,1,1,0,1,0
2,15,U,LE3,T,1,1,at_home,other,other,mother,...,3,3,6,12,13,2,1,0,1,0
3,15,U,GT3,T,4,2,health,services,home,mother,...,1,5,0,14,14,3,1,0,1,0
4,16,U,GT3,T,3,3,other,other,home,father,...,2,5,0,11,13,2,1,0,1,0
5,16,U,LE3,T,4,3,services,other,reputation,mother,...,2,5,6,12,12,2,1,0,0,1
6,16,U,LE3,T,2,2,other,other,home,mother,...,1,3,0,13,12,2,1,0,0,1
7,17,U,GT3,A,4,4,other,teacher,home,mother,...,1,1,2,10,13,2,1,0,1,0
8,15,U,LE3,A,3,2,services,other,home,mother,...,1,1,0,15,16,4,1,0,0,1
9,15,U,GT3,T,3,4,other,other,home,mother,...,1,5,0,12,12,2,1,0,0,1


## Validation

* Normalize data set -> scaler = sklearn.preprocessing.StandardScaler().fit(X_train)

In [None]:
# Notebook-level switch; it is not read by any of the functions in this section.
# NOTE(review): presumably enables log-loss reporting in later cells — confirm.
ENABLE_LOG_LOSS = True

# method to calculate precision, accuracy, confusion matrix
def calcMetric(actual, predicted):
    """Score a set of predictions against the true labels.

    Currently returns ``accuracy_score`` with ``normalize=True``, i.e. the
    fraction of entries in ``predicted`` that match ``actual``.

    Parameters
    ----------
    actual : array-like
        True labels.
    predicted : array-like
        Labels produced by a classifier, aligned with ``actual``.
    """
    # Alternative metrics, kept for quick switching:
    #   return precision_score(actual, predicted)
    #   return confusion_matrix(actual, predicted, labels=[2, 1])
    score = accuracy_score(actual, predicted, normalize=True)
    return score


# method to test train/test split by percentage
def splitByPercentageValidation(param, train_size=0.7):
    """Hold-out validation: fit on one split, score on both splits.

    Splits the notebook-level ``X`` and ``Y`` with ``train_test_split``
    (fixed ``random_state=2017``), standardises both partitions with a
    ``StandardScaler`` fitted on the training rows only, obtains a
    classifier from ``buildClf`` and scores its predictions with
    ``calcMetric``.

    Parameters
    ----------
    param
        Forwarded unchanged to ``buildClf`` (defined elsewhere in the
        notebook).
    train_size : float, default 0.7
        Passed to ``train_test_split`` as ``train_size``.

    Returns
    -------
    tuple
        ``(train_accuracy, test_accuracy)`` as returned by ``calcMetric``.
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, Y, train_size=train_size, random_state=2017
    )

    # Fit the scaler on the training rows only, then apply it to both splits.
    standardizer = sklearn.preprocessing.StandardScaler().fit(features_train)
    features_train = standardizer.transform(features_train)
    features_test = standardizer.transform(features_test)

    # buildClf's result is used as an estimator exposing predict().
    model = buildClf(features_train, labels_train, param)

    # Score the held-out rows first, then the rows the model was built on.
    test_accuracy = calcMetric(labels_test, model.predict(features_test))
    train_accuracy = calcMetric(labels_train, model.predict(features_train))

    return train_accuracy, test_accuracy


## Classifier Selection