# SYDE 522 Project Code
Chang Li, Maathusan Rajendram, Anastasia Santasheva, Evan Yeung

## Import Packages


In [1]:
# standard useful packages 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# validation & normalization methods
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut

# accuracy, MSE, log loss & timer methods
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from time import time

# dim reduction & classification methods 
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

# make matplotlib to show plots inline
%matplotlib inline


## Set Configuration
* Select options for method validation
* Set parameters

In [2]:
# 1. set dataset
# The loading cell checks ENABLE_POR_DATA first, so it wins if both are True.
ENABLE_POR_DATA = True     # use the Portuguese course dataset
ENABLE_MAT_DATA = False    # use the Math course dataset

# 2. set supervised approach for G3
# Checked in the order binary -> five-level -> regression; the first True flag wins.
ENABLE_BINARY_TARGET = True       # convert G3 to binary (0 if G3 < 10, else 1)
ENABLE_5LEVEL_TARGET = False        # convert G3 to a five-level scale (0-4)
ENABLE_REGRESSION_TARGET = False   # keep G3 as the raw grade for regression

# 3. set dimensionality reduction method - set both to False for none
ENABLE_PCN = False  # NOTE(review): presumably the PCA switch ("PCN" looks like a typo) - confirm
ENABLE_LDA = False

# 4. set validation type
ENABLE_KFOLD = True
ENABLE_LOO = False


## Load Dataset
* Select a dataset (Portuguese course or Math course)

In [3]:
# Import the train/test CSVs of the selected course (first 33 columns only).
# The Portuguese flag is checked first, so it wins when both flags are enabled.
if ENABLE_POR_DATA:
    dataframe = pd.read_csv('student-por-train.csv', usecols=range(0, 33))
    dataframe_test = pd.read_csv('student-por-test.csv', usecols=range(0, 33))
elif ENABLE_MAT_DATA:
    dataframe = pd.read_csv('student-mat-train.csv', usecols=range(0, 33))
    dataframe_test = pd.read_csv('student-mat-test.csv', usecols=range(0, 33))
else:
    # Fail with a clear message instead of a NameError on `dataframe` below.
    raise ValueError(
        'No dataset selected: set ENABLE_POR_DATA or ENABLE_MAT_DATA to True'
    )

# Raw value arrays of both frames; the last line displays their shapes.
dataset = dataframe.values
dataset_test = dataframe_test.values
dataset.shape, dataset_test.shape

((519, 33), (130, 33))

## Preprocessing

* Convert nominal attributes with Integer + One-Hot Encoding
* Select the supervised approach for G3
* NOTE: if we want, we can also split the data further into setups A, B and C (A = all columns, B = same as A without G2, C = same as B without G1)
    * This is left out for now since we know A gives the best accuracy

In [4]:
# helper functions for preprocessing
def convertToBinary(df, num_cols):
    """Turn the G3 column of *df* into a binary target, modifying *df* in place.

    Grades below 10 are replaced by 0 and grades of 10 or more by 1.

    Args:
        df: DataFrame holding a numeric ``G3`` column.
        num_cols: column count of *df*; the column at index
            ``num_cols - 1`` is returned (expected to be G3).

    Returns:
        The values of column ``num_cols - 1`` after the conversion.
    """
    grades = df['G3']
    # Both masks are taken from the original grades; the replacement values
    # 0 and 1 never move a row across the threshold, so order does not matter.
    below_ten = grades < 10
    ten_or_more = grades >= 10

    df.loc[below_ten, 'G3'] = 0
    df.loc[ten_or_more, 'G3'] = 1

    return df.values[:, num_cols - 1]

def convertToFiveLevel(df, num_cols):
    """Bucket the G3 column of *df* into five levels, modifying *df* in place.

    Mapping: G3 <= 9 -> 0, 10-11 -> 1, 12-13 -> 2, 14-16 -> 3, > 16 -> 4.

    Args:
        df: DataFrame holding a numeric ``G3`` column.
        num_cols: column count of *df*; the column at index
            ``num_cols - 1`` is returned (expected to be G3).

    Returns:
        The values of column ``num_cols - 1`` after the conversion.
    """
    grades = df['G3']
    # One mask per level, all taken from the original grades. Position in the
    # list is the level assigned. The assigned levels (0-4) are all <= 9, so
    # they can never match one of the later bands.
    level_masks = [
        grades <= 9,
        (grades > 9) & (grades <= 11),
        (grades > 11) & (grades <= 13),
        (grades > 13) & (grades <= 16),
        grades > 16,
    ]
    for level, mask in enumerate(level_masks):
        df.loc[mask, 'G3'] = level

    return df.values[:, num_cols - 1]

def oneHotEncode(df, num_cols):
    """Return the attribute matrix of *df* with nominal columns one-hot encoded.

    The ``G3`` column is dropped from a copy of the frame (the caller's *df*
    is left untouched) and the nominal columns listed below are expanded
    with ``pd.get_dummies``.

    Args:
        df: DataFrame containing ``G3`` and all nominal columns listed below.
        num_cols: unused; kept so existing callers keep working.

    Returns:
        The encoded attributes as an array (``.values`` of the encoded frame).
    """
    nominal_columns = [
        'school', 'sex', 'address', 'famsize', 'Pstatus',
        'Mjob', 'Fjob', 'reason', 'guardian',
        'famsup', 'schoolsup', 'paid', 'activities',
        'nursery', 'higher', 'internet', 'romantic',
    ]
    attributes_only = df.drop('G3', axis=1)
    return pd.get_dummies(attributes_only, columns=nominal_columns).values

def normalizeData(train_data, val_data, test_data):
    """Standardize all three splits with statistics taken from the training split.

    A ``StandardScaler`` is fitted on *train_data* only and then applied to
    the training, validation and test data.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of the scaled splits.
    """
    scaler = StandardScaler()
    scaler.fit(train_data)
    return tuple(
        scaler.transform(split) for split in (train_data, val_data, test_data)
    )

# shuffle dataset
# NOTE(review): this only reorders the `dataset` array. X and Y below are built
# from `dataframe`; presumably `dataframe.values` is a copy for this
# mixed-dtype frame, so their row order is not affected - confirm.
np.random.shuffle(dataset)

# find col length
num_cols = dataset.shape[1]

# split one-hot encoded attributes (X) and G3 (Y)
X = oneHotEncode(dataframe, num_cols)
X_tst = oneHotEncode(dataframe_test, num_cols)

# select the supervised approach for G3 (first enabled flag wins)
if ENABLE_BINARY_TARGET:
    Y = convertToBinary(dataframe, num_cols).astype('int')
    Y_tst = convertToBinary(dataframe_test, num_cols).astype('int')
elif ENABLE_5LEVEL_TARGET:
    Y = convertToFiveLevel(dataframe, num_cols).astype('int')
    Y_tst = convertToFiveLevel(dataframe_test, num_cols).astype('int')
elif ENABLE_REGRESSION_TARGET:
    # Read the raw grades from the same frames X was built from. Taking them
    # from the shuffled `dataset` array would pair each row of X with the
    # grade of a different student.
    Y = dataframe['G3'].values.astype('int')
    Y_tst = dataframe_test['G3'].values.astype('int')
else:
    # Fail with a clear message instead of a NameError on `Y` later on.
    raise ValueError(
        'No target selected: set ENABLE_BINARY_TARGET, ENABLE_5LEVEL_TARGET '
        'or ENABLE_REGRESSION_TARGET to True'
    )
    

In [5]:
# Y
X.shape, Y.shape, X_tst.shape, Y_tst.shape

((519, 58), (519,), (130, 58), (130,))

## Dimensionality Reduction
* PCA & LDA reduction methods

In [113]:
def pcaReduction(train_data, val_data, test_data, n_comp):
    """Project all three splits onto *n_comp* principal components.

    The PCA is fitted on *train_data* only; the validation and test data are
    transformed with that fitted projection.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of the reduced splits.
    """
    reducer = PCA(n_components=n_comp)
    reduced_train = reducer.fit_transform(train_data)
    reduced_val, reduced_test = (
        reducer.transform(split) for split in (val_data, test_data)
    )
    return reduced_train, reduced_val, reduced_test

def ldaReduction(train_data, train_target, val_data, test_data, n_comp):
    """Reduce all three splits to *n_comp* LDA components.

    The LDA is fitted on *train_data* with labels *train_target*; the
    validation and test data are transformed with that fitted model.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of the reduced splits.
    """
    reducer = LinearDiscriminantAnalysis(n_components=n_comp)
    reduced_train = reducer.fit_transform(train_data, train_target)
    reduced_val, reduced_test = (
        reducer.transform(split) for split in (val_data, test_data)
    )
    return reduced_train, reduced_val, reduced_test


## Decision Tree
* Single decision tree

In [38]:
# Build a single decision tree with default settings and fit it on the
# full training set (fit() returns the fitted estimator itself).
clf = DecisionTreeClassifier().fit(X, Y)

## K-Fold Validation
* 10 fold

In [39]:
def get_prediction(n_folds=3):
    """Cross-validate the module-level ``clf`` on ``X``/``Y`` with K-fold splits.

    ``clf`` is refitted on the training part of every fold, so after the call
    it holds the model trained on the last fold.

    Args:
        n_folds: number of folds passed to ``KFold``.

    Returns:
        List with one accuracy score per fold, in split order.
    """
    splitter = KFold(n_splits=n_folds)

    fold_scores = []
    for train_idx, val_idx in splitter.split(X):
        clf.fit(X[train_idx], Y[train_idx])
        fold_predictions = clf.predict(X[val_idx])
        fold_scores.append(accuracy_score(Y[val_idx], fold_predictions))
    return fold_scores

# Run 10-fold cross-validation and report the worst and best fold accuracy.
k_fold_results = get_prediction(n_folds=10)
print(min(k_fold_results), max(k_fold_results))

0.846153846154 0.923076923077


## Test Set
* Test set accuracy

In [61]:
# Refit on the full training set (the K-fold run leaves clf fitted on its
# last fold), then score the held-out test set.
clf = clf.fit(X, Y)
predicted = clf.predict(X_tst)
print(accuracy_score(y_true=Y_tst, y_pred=predicted))

0.915384615385
