# SYDE 522 Project Code
Chang Li, Maathusan Rajendram, Anastasia Santasheva, Evan Yeung

## Import and Load  Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# make matplotlib to show plots inline
%matplotlib inline

# Load the first 33 columns of the selected course's CSV.
# Swap the active call with the commented one to switch courses.
dataframe = pd.read_csv(
    'student-por.csv',  # Portuguese course dataset
    usecols=range(33),
)
# dataframe = pd.read_csv('student-mat.csv', usecols=range(33))  # Math course dataset

# Raw array form of the table; the trailing expression displays its shape in the notebook
dataset = dataframe.values
dataset.shape


(649, 33)

## Preprocessing

* Convert nominal attributes with Integer + One-Hot Encoding
* Convert G3 for different supervised approaches (binary, five-level, or the raw score for regression)
* NOTE: we could also split the inputs further into setups A, B and C (A = all columns, B = A without G2, C = B without G1)
    * Left out for now, since we know A gives the best accuracy

In [2]:
# helper functions for preprocessing
def convertToBinary(df, num_cols):
    """Relabel the G3 column of *df* in place as a binary target.

    Grades below 10 become 0 and grades of 10 or more become 1.

    Args:
        df: DataFrame containing a ``G3`` column; it is modified in place.
        num_cols: column count of *df*; column ``num_cols - 1`` of
            ``df.values`` is what gets returned.

    Returns:
        Column ``num_cols - 1`` of ``df.values`` after relabelling.
    """
    grades = df['G3']
    # Both masks are taken from the untouched grades; the labels written
    # below (0 and 1) could never satisfy ``>= 10`` anyway.
    failed = grades < 10
    passed = grades >= 10

    df.loc[failed, 'G3'] = 0
    df.loc[passed, 'G3'] = 1

    return df.values[:, num_cols - 1]

def convertToFiveLevel(df, num_cols):
    """Relabel the G3 column of *df* in place on a five-level scale.

    Bands (inclusive upper bounds): ``<= 9`` -> 0, ``(9, 11]`` -> 1,
    ``(11, 13]`` -> 2, ``(13, 16]`` -> 3, ``> 16`` -> 4.

    Args:
        df: DataFrame containing a ``G3`` column; it is modified in place.
        num_cols: column count of *df*; column ``num_cols - 1`` of
            ``df.values`` is what gets returned.

    Returns:
        Column ``num_cols - 1`` of ``df.values`` after relabelling.
    """
    grades = df['G3']
    # Every mask is evaluated before any write. The labels 0-4 all fall in
    # the first band, which is applied first, so this matches assigning
    # the bands one after another on the mutated column.
    level_masks = [
        (0, grades <= 9),
        (1, (grades > 9) & (grades <= 11)),
        (2, (grades > 11) & (grades <= 13)),
        (3, (grades > 13) & (grades <= 16)),
        (4, grades > 16),
    ]
    for level, mask in level_masks:
        df.loc[mask, 'G3'] = level

    return df.values[:, num_cols - 1]

def oneHotEncode(df, num_cols):
    """One-hot encode the nominal attributes and drop the target column.

    Args:
        df: DataFrame holding the nominal columns listed below plus the
            target column at position ``num_cols - 1``.
        num_cols: column count of *df*; the column at index
            ``num_cols - 1`` of *df* is treated as the target and removed.

    Returns:
        2-D array of the encoded attributes without the target column.

    Raises:
        KeyError: if a nominal column is missing from *df*, or if the
            target column is itself one of the encoded columns.
    """
    cols_to_transform = [
        'school',
        'sex',
        'address',
        'famsize',
        'Pstatus',
        'Mjob',
        'Fjob',
        'reason',
        'guardian',
        'famsup',
        'schoolsup',
        'paid',
        'activities',
        'nursery',
        'higher',
        'internet',
        'romantic',
    ]

    # Resolve the target by name BEFORE encoding. get_dummies keeps the
    # untouched columns first and appends the dummy columns after them, so
    # position num_cols-1 of the encoded frame is no longer the target;
    # deleting by that position would leave the target inside the features.
    target_col = df.columns[num_cols - 1]

    hot_encoded_df = pd.get_dummies(df, columns=cols_to_transform)

    attributes = hot_encoded_df.drop(target_col, axis=1).values
    return attributes


# shuffle dataset
# Shuffle the DataFrame itself and re-derive the array from it: X is built
# from `dataframe` and Y from `dataset`, so both must share one row order.
# (Shuffling only the extracted array would leave `dataframe`, and hence X,
# in the original order and misalign features with labels.)
dataframe = dataframe.sample(frac=1).reset_index(drop=True)
dataset = dataframe.values

# find col length
num_cols = dataset.shape[1]

# split one-hot encoded attributes (X) and G3 (Y)
X = oneHotEncode(dataframe, num_cols)

# select supervised approach for G3
# Y = convertToBinary(dataframe, num_cols) # sets G3 to binary
# Y = convertToFiveLevel(dataframe, num_cols) # set G3 to five-level scale
Y = dataset[:,num_cols-1] # set G3 to current state for regression


In [5]:
# Y
# X.shape

(649, 58)

## Validation


In [None]:
# NOTE(review): not read anywhere in this section; presumably toggles
# log-loss reporting in later cells -- confirm before relying on it.
ENABLE_LOG_LOSS = True

# method to calculate precision, accuracy, confusion matrix
def calcMetric(actual, predicted):
    """Score *predicted* against *actual* using accuracy_score.

    Args:
        actual: ground-truth labels.
        predicted: labels produced by the model.

    Returns:
        The result of ``accuracy_score(..., normalize=True)``.
    """
    # Alternative metrics kept for quick swapping:
    #   precision_score(actual, predicted)
    #   confusion_matrix(actual, predicted, labels=[2, 1])
    score = accuracy_score(actual, predicted, normalize=True)
    return score

# method for k-fold cross validation
def kFoldValidation(param, n_splits=10):
    """Run k-fold cross-validation over the module-level X and Y.

    For every fold the features are standardized with a scaler fitted on
    the training split, a classifier is built with ``buildClf``, and
    ``calcMetric`` is evaluated on both the test and the training split.

    Args:
        param: passed through unchanged to ``buildClf``.
        n_splits: number of folds handed to ``KFold``.

    Returns:
        Tuple ``(mean_train_metric, mean_test_metric)`` averaged over folds.
    """
    kFold = KFold(n_splits=n_splits)

    test_results = []
    train_results = []

    for train_index, test_index in kFold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # normalize data: the scaler is fitted on the training split only
        # and then applied to both splits
        scaler = sklearn.preprocessing.StandardScaler().fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # build classifier for each set
        clf = buildClf(X_train, Y_train, param)

        test_results.append(calcMetric(Y_test, clf.predict(X_test)))
        train_results.append(calcMetric(Y_train, clf.predict(X_train)))

    return np.mean(train_results, axis=0), np.mean(test_results, axis=0)

## Classifier Selection