# Assignment 1 - mkecera3@gatech.edu

## Import libraries

In [None]:
import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt
from time import time
from torchvision import datasets, transforms
from torch import nn, optim
import pandas as pd
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn import metrics
import torch.nn.functional as F
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns
from sklearn import preprocessing
from skorch import NeuralNetClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from imblearn.over_sampling import RandomOverSampler

## Load and save the datasets

Start of citation - the following code was adapted from https://towardsdatascience.com/handwritten-digit-mnist-pytorch-977b5338e627

In [None]:
# Convert images to tensors, then normalise the single grayscale channel with
# mean 0.5 and standard deviation 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# download=False: the MNIST files are expected to exist under ./data already.
trainset = datasets.MNIST('./data', download=False, train=True, transform=transform)
testset = datasets.MNIST('./data', download=False, train=False, transform=transform)

# Hold out 10,000 of the 60,000 training images for validation. The seeded
# generator makes the split reproducible across notebook runs (37 matches the
# random_state used for the credit default dataset further down).
trainset, valset = torch.utils.data.random_split(
    trainset, [50000, 10000], generator=torch.Generator().manual_seed(37)
)

# Only the training loader needs shuffling; accuracy on the evaluation sets
# does not depend on the order in which samples are visited.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=False)
valloader = torch.utils.data.DataLoader(valset, batch_size=64, shuffle=False)

## Neural network

### MNIST dataset

In [None]:
def nnPredict(model, valloader, criterion):
    """Evaluate ``model`` on every batch yielded by ``valloader``.

    Each image batch is flattened to ``(batch_size, -1)`` before it is fed to
    the model, so any input size is accepted. The model is switched to
    evaluation mode for the duration of the call (disabling layers such as
    dropout) and its previous training/evaluation mode is restored afterwards.

    Args:
        model: network mapping a batch of flattened images to per-class
            scores (log-probabilities for the models in this notebook).
        valloader: iterable of ``(images, labels)`` batches supporting ``len()``.
        criterion: loss callable invoked as ``criterion(output, labels)``.

    Returns:
        Tuple ``(mean_batch_loss, correct_count, all_count)``: the loss
        averaged over batches, the number of correctly classified samples and
        the total number of samples seen.
    """
    correct_count, all_count = 0, 0
    running_loss = 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, labels in valloader:
                images = images.view(images.shape[0], -1)
                # One forward pass per batch serves both the loss and the
                # predictions; no need to re-run the model sample by sample.
                output = model(images)
                running_loss += criterion(output, labels).item()

                # exp() is monotonic, so the arg-max of the log-probabilities
                # is also the arg-max of the probabilities.
                predictions = output.argmax(dim=1)
                correct_count += (predictions == labels).sum().item()
                all_count += len(labels)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return running_loss/len(valloader), correct_count, all_count

In [None]:
# Baseline MNIST classifier: 784 inputs -> 128 -> 64 -> 10 log-probabilities.
input_size, output_size = 784, 10
hidden_sizes = [128, 64]
drop_out = 0.2  # kept for reference; this baseline contains no Dropout layers

model = nn.Sequential(
    # One Linear + ReLU pair per hidden layer, created in input-to-output order.
    *[
        module
        for fan_in, fan_out in zip([input_size] + hidden_sizes, hidden_sizes)
        for module in (nn.Linear(fan_in, fan_out), nn.ReLU())
    ],
    nn.Linear(hidden_sizes[-1], output_size),
    nn.LogSoftmax(dim=1),
)

In [None]:
# Push one training batch through the untrained baseline network and compute
# its loss (presumably a smoke test; `loss` is not used afterwards).
# `criterion` is also the loss that trainNN and nnPredict are called with later.
criterion = nn.NLLLoss()
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], -1)  # flatten every image to one row

logps = model(images) #log probabilities
loss = criterion(logps, labels) #calculate the NLL loss

In [None]:
def trainNN(model, epochs=30):
    """Train ``model`` with SGD on ``trainloader`` and track validation metrics.

    Relies on the notebook-level globals ``trainloader``, ``valloader``,
    ``criterion`` and ``nnPredict``.

    Args:
        model: network to optimise in place; it receives image batches
            flattened to ``(batch_size, -1)``.
        epochs: number of passes over ``trainloader`` (default 30).

    Returns:
        Tuple ``(lossData, accData)``. ``lossData`` holds
        ``[epoch, loss, stage]`` rows, one 'Training Error' and one
        'Testing Error' row per epoch (the latter is measured on
        ``valloader``). ``accData`` holds ``[epoch, accuracy]`` rows measured
        on ``valloader``.
    """
    optimizer = optim.SGD(model.parameters(), lr=0.003, momentum=0.9)
    time0 = time()
    lossData = []
    accData = []
    for e in range(epochs):
        # Make sure layers such as dropout are in training mode for the updates.
        model.train()
        running_loss = 0
        for images, labels in trainloader:
            # Flatten MNIST images into a 784 long vector
            images = images.view(images.shape[0], -1)

            # Training pass
            optimizer.zero_grad()

            output = model(images)
            loss = criterion(output, labels)

            # This is where the model learns by backpropagating
            loss.backward()

            # And optimizes its weights here
            optimizer.step()

            running_loss += loss.item()

        trainingLoss = running_loss/len(trainloader)
        print("Epoch {} - Training loss: {}".format(e, trainingLoss))

        testingLoss, correctCount, allCount = nnPredict(model, valloader, criterion)
        print("Testing Loss  =", (testingLoss))

        lossData.append([e, trainingLoss, 'Training Error'])
        lossData.append([e, testingLoss, 'Testing Error'])
        accData.append([e, correctCount / allCount])

    # Report the elapsed time before returning; this statement used to sit
    # after the return and was never executed.
    print("\nTraining Time (in minutes) =",(time()-time0)/60)
    return lossData, accData

In [None]:
lossDataBaseline, accDataBaseline = trainNN(model)

In [None]:
accDataBaseline = [[row[0], row[1], 'Baseline Accuracy'] for row in accDataBaseline]

In [None]:
lossDataBaseline = pd.DataFrame.from_records(lossDataBaseline, columns=["Epoch", "Loss", "Stage"])
accDataBaseline = pd.DataFrame.from_records(accDataBaseline, columns=["Epoch", "Accuracy", "Stage"])

In [None]:
# Visualize the per-epoch training loss next to the loss measured on the
# validation loader (labelled 'Testing Error' in the data).
sns.set(style="darkgrid")

sns.lineplot(data=lossDataBaseline, x="Epoch", y="Loss", hue="Stage").set_title('Baseline NN')

In [None]:
# Per-epoch validation accuracy of the baseline network. The title used to
# read '3 hidden NN', which describes the larger model defined further down.
sns.set(style="darkgrid")

sns.lineplot(
    x="Epoch", y="Accuracy",
    hue="Stage",
    data=accDataBaseline
    ).set_title('Baseline NN')

In [None]:
testingLoss, correctCount, allCount = nnPredict(model, testloader, criterion)

In [None]:
correctCount / allCount

In [None]:
# Larger MNIST classifier: 784 inputs -> 256 -> 128 -> 64 -> 10 log-probabilities.
input_size, output_size = 784, 10
hidden_sizes = [256, 128, 64]
drop_out = 0.2  # kept for reference; this model contains no Dropout layers

model_complex = nn.Sequential(
    # One Linear + ReLU pair per hidden layer, created in input-to-output order.
    *[
        module
        for fan_in, fan_out in zip([input_size] + hidden_sizes, hidden_sizes)
        for module in (nn.Linear(fan_in, fan_out), nn.ReLU())
    ],
    nn.Linear(hidden_sizes[-1], output_size),
    nn.LogSoftmax(dim=1),
)

In [None]:
# Push one training batch through the untrained larger network and compute
# its loss (presumably a smoke test; `loss` is not used afterwards).
criterion = nn.NLLLoss()
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], -1)  # flatten every image to one row

logps = model_complex(images) #log probabilities
loss = criterion(logps, labels) #calculate the NLL loss

In [None]:
lossDataComplex, accDataComplex = trainNN(model_complex)

In [None]:
accDataComplex = [[row[0], row[1], 'More nodes and layers Accuracy'] for row in accDataComplex]

In [None]:
lossDataComplex = pd.DataFrame.from_records(lossDataComplex, columns=["Epoch", "Loss", "Stage"])
accDataComplex = pd.DataFrame.from_records(accDataComplex, columns=["Epoch", "Accuracy", "Stage"])

In [None]:
# Visualize the per-epoch training loss next to the loss measured on the
# validation loader for the three-hidden-layer network.
sns.set(style="darkgrid")

sns.lineplot(data=lossDataComplex, x="Epoch", y="Loss", hue="Stage").set_title('3 hidden NN')

In [None]:
# Per-epoch validation accuracy of the three-hidden-layer network.
sns.set(style="darkgrid")

sns.lineplot(data=accDataComplex, x="Epoch", y="Accuracy", hue="Stage").set_title('3 hidden NN')

End of citation - the preceding code was adapted from https://towardsdatascience.com/handwritten-digit-mnist-pytorch-977b5338e627





In [None]:
testingLoss, correctCount, allCount = nnPredict(model_complex, testloader, criterion)

In [None]:
correctCount / allCount

In [None]:
# Same layout as the three-hidden-layer network, with dropout applied after
# the first hidden layer only.
input_size, output_size = 784, 10
hidden_sizes = [256, 128, 64]
drop_out = 0.2

model_dropout = nn.Sequential(
    # Hidden layer 1: the only one followed by dropout.
    nn.Linear(input_size, hidden_sizes[0]),
    nn.ReLU(),
    nn.Dropout(p=drop_out, inplace=True),
    # Hidden layers 2 and 3: plain Linear + ReLU pairs.
    *[
        module
        for fan_in, fan_out in zip(hidden_sizes, hidden_sizes[1:])
        for module in (nn.Linear(fan_in, fan_out), nn.ReLU())
    ],
    nn.Linear(hidden_sizes[-1], output_size),
    nn.LogSoftmax(dim=1),
)

In [None]:
# Push one training batch through the untrained dropout network and compute
# its loss (presumably a smoke test; `loss` is not used afterwards).
criterion = nn.NLLLoss()
images, labels = next(iter(trainloader))
images = images.view(images.shape[0], -1)  # flatten every image to one row

logps = model_dropout(images) #log probabilities
loss = criterion(logps, labels) #calculate the NLL loss

In [None]:
lossDataDropout, accDataDropout = trainNN(model_dropout)

In [None]:
accDataDropout = [[row[0], row[1], 'Dropout Accuracy'] for row in accDataDropout]

In [None]:
lossDataDropout = pd.DataFrame.from_records(lossDataDropout, columns=["Epoch", "Loss", "Stage"])
accDataDropout = pd.DataFrame.from_records(accDataDropout, columns=["Epoch", "Accuracy", "Stage"])

In [None]:
# Visualize the per-epoch training loss next to the loss measured on the
# validation loader for the dropout network.
sns.set(style="darkgrid")

sns.lineplot(data=lossDataDropout, x="Epoch", y="Loss", hue="Stage").set_title('Dropout NN')

In [None]:
# Per-epoch validation accuracy of the dropout network.
sns.set(style="darkgrid")

sns.lineplot(data=accDataDropout, x="Epoch", y="Accuracy", hue="Stage").set_title('Dropout NN')

In [None]:
# Stack the accuracy histories of the three MNIST networks into one long-form
# frame. DataFrame.append was removed in pandas 2.0, so use pd.concat;
# ignore_index gives the combined frame a unique row index.
accData = pd.concat(
    [accDataBaseline, accDataComplex, accDataDropout],
    ignore_index=True,
)

In [None]:
# Compare the validation accuracy of all MNIST network variants in one plot.
sns.set(style="darkgrid")

sns.lineplot(data=accData, x="Epoch", y="Accuracy", hue="Stage").set_title('Accuracy of NN versions')

In [None]:
# Evaluate the dropout network on the held-out test set. This cell used to
# re-evaluate model_complex, which is already scored on the test set earlier.
testingLoss, correctCount, allCount = nnPredict(model_dropout, testloader, criterion)

In [None]:
correctCount / allCount

### Credit default dataset

In [None]:
# load dataset
creditDataDf = pd.read_excel('./data/default of credit card clients.xls', header=1)
creditDataDf = creditDataDf.drop(columns=['ID'])

In [None]:
print(sum(creditDataDf['default payment next month']))
print(len(creditDataDf))
print(sum(creditDataDf['default payment next month'])/len(creditDataDf))


In [None]:
# create dummy variables from categorical
creditDataDf = pd.get_dummies(creditDataDf, prefix=['SEX', 'EDUCATION', 'MARRIAGE'], columns=['SEX', 'EDUCATION', 'MARRIAGE'])

In [None]:
# Drop the last dummy variable of each categorical feature, leaving one
# indicator column fewer than there are levels (presumably to avoid
# redundant, perfectly collinear columns).
creditDataDf = creditDataDf.drop(columns=['SEX_2', 'EDUCATION_6', 'MARRIAGE_3'])

In [None]:
y = creditDataDf['default payment next month']
creditDataDf = creditDataDf.drop(columns=['default payment next month'])

In [None]:
# balance the dataset
# RandomOverSampler is applied to the full dataset with a fixed seed.
# NOTE(review): this runs before the train/validation/test split below, so
# resampled copies of a row can presumably end up in more than one split.
# Confirm whether oversampling should be restricted to the training rows.
ros = RandomOverSampler(random_state=37)
creditDataDf, y = ros.fit_resample(creditDataDf, y)

In [None]:
print(sum(y)/len(y))

In [None]:
# normalize the data
# NOTE: despite its name, `min_max_scaler` holds a StandardScaler, not a
# MinMaxScaler.
# NOTE(review): the scaler is fit on all rows before the data is split, so
# test rows contribute to the scaling statistics. Confirm whether fitting on
# the training rows only is intended.
x = creditDataDf.values #returns a numpy array
min_max_scaler = preprocessing.StandardScaler()
x_scaled = min_max_scaler.fit_transform(x)
# Rebuild the frame so the original column names are kept.
creditDataDf = pd.DataFrame(x_scaled, columns=creditDataDf.columns)

In [None]:
creditDataDf

In [None]:
# Two successive 80/20 splits with a fixed seed: first carve off 20% as the
# test set, then 20% of the remainder as the validation set
# (roughly 64% train / 16% validation / 20% test).
X_train, X_test, y_train, y_test = train_test_split(creditDataDf, y, test_size=0.2, random_state=37)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=37)

In [None]:
print(X_train.columns)
print(len(X_train.columns))

In [None]:
sum(y_train)/len(y_train)

In [None]:
# convert data to tensors
# Features are stored as 32-bit floats.
X_train_tensor = torch.tensor(X_train.values.astype(np.float32))
X_test_tensor = torch.tensor(X_test.values.astype(np.float32))
X_val_tensor = torch.tensor(X_val.values.astype(np.float32))

# Class labels are stored as 64-bit integers, the dtype nn.NLLLoss expects for
# class-index targets. The `np.int` alias was removed in NumPy 1.24, so the
# width is spelled out explicitly.
y_train_tensor = torch.tensor(y_train.values.astype(np.int64))
y_test_tensor = torch.tensor(y_test.values.astype(np.int64))
y_val_tensor = torch.tensor(y_val.values.astype(np.int64))

In [None]:
trainTorchDataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
valTorchDataset = torch.utils.data.TensorDataset(X_val_tensor, y_val_tensor)
testTorchDataset = torch.utils.data.TensorDataset(X_test_tensor, y_test_tensor)

In [None]:
print(len(valTorchDataset), len(testTorchDataset), len(trainTorchDataset))

In [None]:
trainloader2 = torch.utils.data.DataLoader(trainTorchDataset, batch_size=64, shuffle=True)
valloader2 = torch.utils.data.DataLoader(valTorchDataset, batch_size=len(valTorchDataset), shuffle=True)
testloader2 = torch.utils.data.DataLoader(testTorchDataset, batch_size=len(testTorchDataset), shuffle=True)

In [None]:
def nnPredictTabular(model2, valloader2, criterion):
    """Evaluate ``model2`` on every batch of tabular rows from ``valloader2``.

    The model is switched to evaluation mode for the duration of the call
    (disabling layers such as dropout) and its previous training/evaluation
    mode is restored afterwards. Any number of input features is accepted.

    Args:
        model2: network mapping a batch of rows to per-class log-probabilities.
        valloader2: iterable of ``(rows, labels)`` batches supporting ``len()``.
        criterion: loss callable invoked as ``criterion(output, labels)``.

    Returns:
        Tuple ``(mean_batch_loss, correct_count, all_count, probList)``.
        ``probList`` has one entry per sample: the list of class probabilities
        obtained by exponentiating the model output.
    """
    correct_count, all_count = 0, 0
    running_loss = 0
    probList = []
    was_training = model2.training
    model2.eval()
    try:
        with torch.no_grad():
            for rows, labels in valloader2:
                # One forward pass per batch serves the loss, the
                # probabilities and the predictions.
                output = model2(rows)
                running_loss += criterion(output, labels).item()

                ps = torch.exp(output)
                probList.extend(list(row) for row in ps.numpy())

                # exp() is monotonic, so the arg-max of the log-probabilities
                # is also the arg-max of the probabilities.
                predictions = output.argmax(dim=1)
                correct_count += (predictions == labels).sum().item()
                all_count += len(labels)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model2.train(was_training)

    return running_loss/len(valloader2), correct_count, all_count, probList

In [None]:
# Baseline credit-default classifier: 30 features -> 25 -> 15 -> 2 log-probabilities.
input_size, output_size = 30, 2
hidden_sizes = [25, 15]

modelTabBaseline = nn.Sequential(
    # One Linear + ReLU pair per hidden layer, created in input-to-output order.
    *[
        module
        for fan_in, fan_out in zip([input_size] + hidden_sizes, hidden_sizes)
        for module in (nn.Linear(fan_in, fan_out), nn.ReLU())
    ],
    nn.Linear(hidden_sizes[-1], output_size),
    nn.LogSoftmax(dim=1),
)
# Negative log-likelihood loss, matching the LogSoftmax output layer.
criterion = nn.NLLLoss()

In [None]:
def trainTabNN(model, epochs=100):
    """Train ``model`` with SGD on ``trainloader2`` and track validation metrics.

    Relies on the notebook-level globals ``trainloader2``, ``valloader2``,
    ``criterion`` and ``nnPredictTabular``.

    Args:
        model: network to optimise in place.
        epochs: number of passes over ``trainloader2`` (default 100).

    Returns:
        Tuple ``(lossData, accData)``. ``lossData`` holds
        ``[epoch, loss, stage]`` rows, one 'Training Loss' and one
        'Testing Loss' row per epoch (the latter is measured on
        ``valloader2``). ``accData`` holds ``[epoch, accuracy]`` rows measured
        on ``valloader2``.
    """
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    time0 = time()
    lossData = []
    accData = []
    for e in range(epochs):
        # Make sure layers such as dropout are in training mode for the updates.
        model.train()
        running_loss = 0
        for rows, labels in trainloader2:
            # Training pass
            optimizer.zero_grad()

            output = model(rows)
            loss = criterion(output, labels)

            # This is where the model learns by backpropagating
            loss.backward()

            # And optimizes its weights here
            optimizer.step()

            running_loss += loss.item()

        trainingLoss = running_loss/len(trainloader2)
        print("Epoch {} - Training loss: {}".format(e, trainingLoss))

        testingLoss, correctCount, allCount, proba = nnPredictTabular(model, valloader2, criterion)
        print("Testing Loss  =", (testingLoss))
        print("Testing Accuracy  =", (correctCount / allCount))

        lossData.append([e, trainingLoss, 'Training Loss'])
        lossData.append([e, testingLoss, 'Testing Loss'])
        accData.append([e, correctCount / allCount])

    # Report the elapsed time before returning; this statement used to sit
    # after the return and was never executed.
    print("\nTraining Time (in minutes) =",(time()-time0)/60)
    return lossData, accData

In [None]:
lossDataBaseline, accDataBaseline = trainTabNN(modelTabBaseline)

In [None]:
testingLoss, correctCount, allCount, proba = nnPredictTabular(modelTabBaseline, testloader2, criterion)

In [None]:
correctCount/allCount

In [None]:
accDataBaseline = [[row[0], row[1], 'Baseline Accuracy'] for row in accDataBaseline]
lossDataBaseline = pd.DataFrame.from_records(lossDataBaseline, columns=["Epoch", "Loss", "Stage"])
accDataBaseline = pd.DataFrame.from_records(accDataBaseline, columns=["Epoch", "Accuracy", "Stage"])

In [None]:
# Training loss vs. validation-loader loss per epoch for the baseline
# credit-default network.
sns.set(style="darkgrid")

sns.lineplot(data=lossDataBaseline, x="Epoch", y="Loss", hue="Stage").set_title('Baseline NN')

In [None]:
# Per-epoch validation accuracy of the baseline credit-default network.
sns.set(style="darkgrid")

sns.lineplot(data=accDataBaseline, x="Epoch", y="Accuracy", hue="Stage").set_title('Baseline NN')

In [None]:
# Deeper credit-default classifier with six hidden layers:
# 30 -> 25 -> 20 -> 20 -> 20 -> 15 -> 15 -> 2 log-probabilities.
input_size, output_size = 30, 2
hidden_sizes = [25, 20, 20, 20, 15, 15]

modelTabLarge = nn.Sequential(
    # One Linear + ReLU pair per hidden layer, created in input-to-output order.
    *[
        module
        for fan_in, fan_out in zip([input_size] + hidden_sizes, hidden_sizes)
        for module in (nn.Linear(fan_in, fan_out), nn.ReLU())
    ],
    nn.Linear(hidden_sizes[-1], output_size),
    nn.LogSoftmax(dim=1),
)
# Negative log-likelihood loss, matching the LogSoftmax output layer.
criterion = nn.NLLLoss()

In [None]:
lossDataLarge, accDataLarge = trainTabNN(modelTabLarge)

In [None]:
testingLoss, correctCount, allCount, proba = nnPredictTabular(modelTabLarge, testloader2, criterion)

In [None]:
correctCount/allCount

In [None]:
# Tag the accuracy rows for plotting and convert both histories to DataFrames.
# The network trained here has six hidden layers (hidden_sizes has six
# entries), so the legend label says 6 rather than 5.
accDataLarge = [[row[0], row[1], '6 hidden Accuracy'] for row in accDataLarge]
lossDataLarge = pd.DataFrame.from_records(lossDataLarge, columns=["Epoch", "Loss", "Stage"])
accDataLarge = pd.DataFrame.from_records(accDataLarge, columns=["Epoch", "Accuracy", "Stage"])

In [None]:
# Training loss vs. validation-loader loss per epoch for the six-hidden-layer
# credit-default network. The title used to read 'Baseline NN', copied from
# the baseline plot.
sns.set(style="darkgrid")

sns.lineplot(
    x="Epoch", y="Loss",
    hue="Stage",
    data=lossDataLarge
    ).set_title('6 hidden NN')

In [None]:
# Per-epoch validation accuracy of the six-hidden-layer credit-default
# network. The title used to read 'Baseline NN', copied from the baseline plot.
sns.set(style="darkgrid")

sns.lineplot(
    x="Epoch", y="Accuracy",
    hue="Stage",
    data=accDataLarge
    ).set_title('6 hidden NN')

In [None]:
# Six-hidden-layer credit-default classifier with dropout after each of the
# first three hidden layers.
input_size, output_size = 30, 2
hidden_sizes = [25, 20, 20, 20, 15, 15]
drop_out = 0.2

modelTabDropout = nn.Sequential(
    # Hidden layers 1-3: Linear + ReLU + Dropout.
    *[
        module
        for fan_in, fan_out in zip([input_size] + hidden_sizes[:2], hidden_sizes[:3])
        for module in (
            nn.Linear(fan_in, fan_out),
            nn.ReLU(),
            nn.Dropout(p=drop_out, inplace=True),
        )
    ],
    # Hidden layers 4-6: Linear + ReLU only.
    *[
        module
        for fan_in, fan_out in zip(hidden_sizes[2:], hidden_sizes[3:])
        for module in (nn.Linear(fan_in, fan_out), nn.ReLU())
    ],
    nn.Linear(hidden_sizes[-1], output_size),
    nn.LogSoftmax(dim=1),
)
# Negative log-likelihood loss, matching the LogSoftmax output layer.
criterion = nn.NLLLoss()

In [None]:
lossDataLarge, accDataLarge = trainTabNN(modelTabDropout)

In [None]:
testingLoss, correctCount, allCount, proba = nnPredictTabular(modelTabDropout, testloader2, criterion)

In [None]:
correctCount/allCount

In [None]:
accDataLarge = [[row[0], row[1], 'Dropout Accuracy'] for row in accDataLarge]
lossDataLarge = pd.DataFrame.from_records(lossDataLarge, columns=["Epoch", "Loss", "Stage"])
accDataLarge = pd.DataFrame.from_records(accDataLarge, columns=["Epoch", "Accuracy", "Stage"])

In [None]:
# Training loss vs. validation-loader loss per epoch for the dropout
# credit-default network (its history is stored in lossDataLarge by the cells
# above). The title used to read 'Baseline NN', copied from the baseline plot.
sns.set(style="darkgrid")

sns.lineplot(
    x="Epoch", y="Loss",
    hue="Stage",
    data=lossDataLarge
    ).set_title('Dropout NN')

In [None]:
# Per-epoch validation accuracy of the dropout credit-default network (its
# history is stored in accDataLarge by the cells above). The title used to
# read 'Baseline NN', copied from the baseline plot.
sns.set(style="darkgrid")

sns.lineplot(
    x="Epoch", y="Accuracy",
    hue="Stage",
    data=accDataLarge
    ).set_title('Dropout NN')

## Decision tree

### MNIST dataset

In [None]:
trainset = datasets.MNIST('./data', download=False, train=True, transform=transform)
testset = datasets.MNIST('./data', download=False, train=False, transform=transform)

In [None]:
X_train_MNIST = trainset.data.numpy()
X_test_MNIST = testset.data.numpy()

In [None]:
y_train_MNIST = trainset.targets.numpy()
y_test_MNIST = testset.targets.numpy()

In [None]:
np.bincount(y_train_MNIST) + np.bincount(y_test_MNIST)

In [None]:
print(X_train_MNIST.shape)
print(y_train_MNIST.shape)

In [None]:
# Flatten every image into a single feature row. The sample count is taken
# from the array itself and -1 lets NumPy infer the row length, so no dataset
# sizes are hard-coded.
X_train_MNIST_reshaped = X_train_MNIST.reshape((len(X_train_MNIST), -1))
X_test_MNIST_reshaped = X_test_MNIST.reshape((len(X_test_MNIST), -1))

In [None]:
# Grid search over tree depth and minimum leaf size for a decision tree on the
# flattened MNIST images, scored by accuracy and run on all available cores.
param_grid = {
    'max_depth': np.arange(3, 30, 3),
    'min_samples_leaf': np.arange(5, 50, 5),
}
gridSearchTree = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
gridSearchTree.fit(X_train_MNIST_reshaped, y_train_MNIST)

In [None]:
gridSearchTree.best_params_

In [None]:
preds = gridSearchTree.predict(X_test_MNIST_reshaped)
print(accuracy_score(preds, y_test_MNIST))

In [None]:
# Heatmap of the mean cross-validated accuracy (rounded to 2 decimals) for
# every max_depth / min_samples_leaf combination.
cvResultsScore = (
    pd.DataFrame(gridSearchTree.cv_results_)
    [['param_max_depth', 'param_min_samples_leaf', 'mean_test_score']]
    .assign(mean_test_score=lambda frame: frame['mean_test_score'].round(2))
    .pivot(index='param_max_depth', columns='param_min_samples_leaf', values='mean_test_score')
)
sns.set_context("paper")
sns.heatmap(cvResultsScore, cmap='viridis', annot=True, fmt="g").set_title('Accuracy')

In [None]:
# Heatmap of the mean fit time (rounded to 2 decimals) for every
# max_depth / min_samples_leaf combination.
cvResultsTimeTrain = (
    pd.DataFrame(gridSearchTree.cv_results_)
    [['param_max_depth', 'param_min_samples_leaf', 'mean_fit_time']]
    .assign(mean_fit_time=lambda frame: frame['mean_fit_time'].round(2))
    .pivot(index='param_max_depth', columns='param_min_samples_leaf', values='mean_fit_time')
)
sns.set_context("paper")
sns.heatmap(cvResultsTimeTrain, cmap='viridis', annot=True, fmt="g").set_title('Training time')

In [None]:
# Heatmap of the mean scoring time (rounded to 3 decimals) for every
# max_depth / min_samples_leaf combination.
cvResultsTimeScore = (
    pd.DataFrame(gridSearchTree.cv_results_)
    [['param_max_depth', 'param_min_samples_leaf', 'mean_score_time']]
    .assign(mean_score_time=lambda frame: frame['mean_score_time'].round(3))
    .pivot(index='param_max_depth', columns='param_min_samples_leaf', values='mean_score_time')
)
sns.set_context("paper")
sns.heatmap(cvResultsTimeScore, cmap='viridis', annot=True, fmt="g").set_title('Score time')

In [None]:
# Learning curve for the best decision tree found by the MNIST grid search:
# 5-fold cross-validated accuracy at five training-set sizes (10% to 100%).
bestModel = tree.DecisionTreeClassifier(**gridSearchTree.best_params_)
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=bestModel,
    X=X_train_MNIST_reshaped,
    y=y_train_MNIST,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring='accuracy',
    cv=5,
)

# Average the scores over the folds and reshape to long form for seaborn.
lcurvePlotData = pd.DataFrame({'Train': np.mean(train_scores, axis=1), 'Validation': np.mean(valid_scores, axis=1), 'Train size': train_sizes})
lcurvePlotData = lcurvePlotData.melt(id_vars=['Train size'], value_vars=['Train', 'Validation'])
lcurvePlotData.rename(columns={'value': 'Accuracy'}, inplace=True)

sns.set(style="darkgrid")

# The title used to say 'credit default', copied from the credit-default cell;
# this curve is computed on the MNIST data.
sns.lineplot(
    x="Train size", y="Accuracy",
    hue="variable",
    data=lcurvePlotData
    ).set_title('Learning curve - decision tree MNIST')

### Credit default dataset

In [None]:
np.bincount(y_train) + np.bincount(y_test) + np.bincount(y_val)

In [None]:
# concatenate dfs
# Fold the validation rows back into the training data for the grid searches
# below (presumably because GridSearchCV cross-validates on its own).
# NOTE: X_train / y_train are rebound here, so re-running this cell appends
# the validation rows a second time.
X_train = pd.concat([X_train,X_val], axis=0)
y_train = pd.concat([y_train,y_val], axis=0)
print(X_train.shape)
print(y_train.shape)

In [None]:
# Grid search over tree depth and minimum leaf size for a decision tree on the
# credit default data, scored by accuracy and run on all available cores.
param_grid = {
    'max_depth': np.arange(3, 30, 3),
    'min_samples_leaf': np.arange(5, 50, 5),
}
gridSearchTree = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
)
gridSearchTree.fit(X_train, y_train)

In [None]:
preds = gridSearchTree.predict(X_test)
probs = gridSearchTree.predict_proba(X_test)
print(accuracy_score(preds, y_test))

In [None]:
# Heatmap of the mean cross-validated accuracy (rounded to 2 decimals) for
# every max_depth / min_samples_leaf combination.
cvResultsScore = (
    pd.DataFrame(gridSearchTree.cv_results_)
    [['param_max_depth', 'param_min_samples_leaf', 'mean_test_score']]
    .assign(mean_test_score=lambda frame: frame['mean_test_score'].round(2))
    .pivot(index='param_max_depth', columns='param_min_samples_leaf', values='mean_test_score')
)
sns.set_context("paper")
sns.heatmap(cvResultsScore, cmap='viridis', annot=True, fmt="g").set_title('Accuracy')

In [None]:
# Heatmap of the mean fit time (rounded to 2 decimals) for every
# max_depth / min_samples_leaf combination.
cvResultsTimeTrain = (
    pd.DataFrame(gridSearchTree.cv_results_)
    [['param_max_depth', 'param_min_samples_leaf', 'mean_fit_time']]
    .assign(mean_fit_time=lambda frame: frame['mean_fit_time'].round(2))
    .pivot(index='param_max_depth', columns='param_min_samples_leaf', values='mean_fit_time')
)
sns.set_context("paper")
sns.heatmap(cvResultsTimeTrain, cmap='viridis', annot=True, fmt="g").set_title('Training time')

In [None]:
# Heatmap of the mean scoring time (rounded to 3 decimals) for every
# max_depth / min_samples_leaf combination.
cvResultsTimeScore = (
    pd.DataFrame(gridSearchTree.cv_results_)
    [['param_max_depth', 'param_min_samples_leaf', 'mean_score_time']]
    .assign(mean_score_time=lambda frame: frame['mean_score_time'].round(3))
    .pivot(index='param_max_depth', columns='param_min_samples_leaf', values='mean_score_time')
)
sns.set_context("paper")
sns.heatmap(cvResultsTimeScore, cmap='viridis', annot=True, fmt="g").set_title('Score time')

In [None]:
# Learning curve for the best decision tree found by the credit-default grid
# search: 5-fold cross-validated accuracy at five training-set sizes.
bestModel = tree.DecisionTreeClassifier(**gridSearchTree.best_params_)
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=bestModel,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring='accuracy',
    cv=5,
)

# Average the scores over the folds and reshape to long form for seaborn.
lcurvePlotData = (
    pd.DataFrame({
        'Train': np.mean(train_scores, axis=1),
        'Validation': np.mean(valid_scores, axis=1),
        'Train size': train_sizes,
    })
    .melt(id_vars=['Train size'], value_vars=['Train', 'Validation'])
    .rename(columns={'value': 'Accuracy'})
)

sns.set(style="darkgrid")

sns.lineplot(
    data=lcurvePlotData, x="Train size", y="Accuracy", hue="variable"
).set_title('Learning curve - decision tree credit default')

## Boosting

### MNIST

In [None]:
# Grid search over tree depth and number of estimators for gradient boosting
# on the flattened MNIST images, scored by accuracy on all available cores.
param_grid = {
    'max_depth': np.arange(3, 30, 3),
    'n_estimators': np.arange(10, 100, 10),
}
gridSearchBoosting = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
gridSearchBoosting.fit(X_train_MNIST_reshaped, y_train_MNIST)

In [None]:
gridSearchBoosting.best_params_

In [None]:
preds = gridSearchBoosting.predict(X_test_MNIST_reshaped)
print(accuracy_score(preds, y_test_MNIST))

In [None]:
# Heatmap of the mean cross-validated accuracy (rounded to 2 decimals) for
# every max_depth / n_estimators combination of the boosting grid search.
# The column selection used to read from an undefined name `cvResults`.
cvResultsScore = (
    pd.DataFrame(gridSearchBoosting.cv_results_)
    [['param_max_depth', 'param_n_estimators', 'mean_test_score']]
    .assign(mean_test_score=lambda frame: frame['mean_test_score'].round(2))
    .pivot(index='param_max_depth', columns='param_n_estimators', values='mean_test_score')
)
sns.set_context("paper")
sns.heatmap(cvResultsScore, annot=True, fmt="g", cmap='viridis', linewidths=.5).set_title('Accuracy')

In [None]:
# Heatmap of the mean fit time (rounded to 2 decimals) for every
# max_depth / n_estimators combination of the boosting grid search.
# The pivot used to ask for 'param_min_samples_leaf', a column that is not
# part of this grid or of the selection on the line above.
cvResultsTimeTrain = (
    pd.DataFrame(gridSearchBoosting.cv_results_)
    [['param_max_depth', 'param_n_estimators', 'mean_fit_time']]
    .assign(mean_fit_time=lambda frame: frame['mean_fit_time'].round(2))
    .pivot(index='param_max_depth', columns='param_n_estimators', values='mean_fit_time')
)
sns.set_context("paper")
sns.heatmap(cvResultsTimeTrain, annot=True, fmt="g", cmap='viridis').set_title('Training time')

In [None]:
# Heatmap of the mean scoring time (rounded to 3 decimals) for every
# max_depth / n_estimators combination of the boosting grid search.
# The pivot used to ask for 'param_min_samples_leaf', a column that is not
# part of this grid or of the selection on the line above.
cvResultsTimeScore = (
    pd.DataFrame(gridSearchBoosting.cv_results_)
    [['param_max_depth', 'param_n_estimators', 'mean_score_time']]
    .assign(mean_score_time=lambda frame: frame['mean_score_time'].round(3))
    .pivot(index='param_max_depth', columns='param_n_estimators', values='mean_score_time')
)
sns.set_context("paper")
sns.heatmap(cvResultsTimeScore, annot=True, fmt="g", cmap='viridis').set_title('Score time')

In [None]:
# Learning curve for the best gradient-boosting model found by the MNIST grid
# search: 5-fold cross-validated accuracy at five training-set sizes.
bestModel = GradientBoostingClassifier(**gridSearchBoosting.best_params_)
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=bestModel,
    X=X_train_MNIST_reshaped,
    y=y_train_MNIST,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring='accuracy',
    cv=5,
)

# Average the scores over the folds and reshape to long form for seaborn.
lcurvePlotData = pd.DataFrame({'Train': np.mean(train_scores, axis=1), 'Validation': np.mean(valid_scores, axis=1), 'Train size': train_sizes})
lcurvePlotData = lcurvePlotData.melt(id_vars=['Train size'], value_vars=['Train', 'Validation'])
lcurvePlotData.rename(columns={'value': 'Accuracy'}, inplace=True)

sns.set(style="darkgrid")

# The title used to read 'decision tree credit default', copied from the
# decision-tree section; this curve is for boosting on MNIST.
sns.lineplot(
    x="Train size", y="Accuracy",
    hue="variable",
    data=lcurvePlotData
    ).set_title('Learning curve - boosting MNIST')

### Credit default

In [None]:
# Grid search over tree depth and number of estimators for gradient boosting
# on the credit default data, scored by accuracy on all available cores.
param_grid = {
    'max_depth': np.arange(3, 30, 3),
    'n_estimators': np.arange(10, 100, 10),
}
gridSearchBoosting = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
gridSearchBoosting.fit(X_train, y_train)


In [None]:
preds = gridSearchBoosting.predict(X_test)
probs = gridSearchBoosting.predict_proba(X_test)
print(accuracy_score(preds, y_test))
probs = [row[1] for row in probs]
print(gridSearchBoosting.best_params_)

In [None]:
# Heatmap of the mean cross-validated accuracy (rounded to 2 decimals) for
# every max_depth / n_estimators combination of the boosting grid search.
# The column selection used to read from an undefined name `cvResults`.
cvResultsScore = (
    pd.DataFrame(gridSearchBoosting.cv_results_)
    [['param_max_depth', 'param_n_estimators', 'mean_test_score']]
    .assign(mean_test_score=lambda frame: frame['mean_test_score'].round(2))
    .pivot(index='param_max_depth', columns='param_n_estimators', values='mean_test_score')
)
sns.set_context("paper")
sns.heatmap(cvResultsScore, annot=True, fmt="g", cmap='viridis', linewidths=.5).set_title('Accuracy')

In [None]:
# Heatmap of the mean fit time (rounded to 2 decimals) for every
# max_depth / n_estimators combination of the boosting grid search.
# The pivot used to ask for 'param_min_samples_leaf', a column that is not
# part of this grid or of the selection on the line above.
cvResultsTimeTrain = (
    pd.DataFrame(gridSearchBoosting.cv_results_)
    [['param_max_depth', 'param_n_estimators', 'mean_fit_time']]
    .assign(mean_fit_time=lambda frame: frame['mean_fit_time'].round(2))
    .pivot(index='param_max_depth', columns='param_n_estimators', values='mean_fit_time')
)
sns.set_context("paper")
sns.heatmap(cvResultsTimeTrain, annot=True, fmt="g", cmap='viridis').set_title('Training time')

In [None]:
# Heatmap of the mean scoring time (rounded to 3 decimals) for every
# max_depth / n_estimators combination of the boosting grid search.
# The pivot used to ask for 'param_min_samples_leaf', a column that is not
# part of this grid or of the selection on the line above.
cvResultsTimeScore = (
    pd.DataFrame(gridSearchBoosting.cv_results_)
    [['param_max_depth', 'param_n_estimators', 'mean_score_time']]
    .assign(mean_score_time=lambda frame: frame['mean_score_time'].round(3))
    .pivot(index='param_max_depth', columns='param_n_estimators', values='mean_score_time')
)
sns.set_context("paper")
sns.heatmap(cvResultsTimeScore, annot=True, fmt="g", cmap='viridis').set_title('Score time')

In [None]:
# Learning curve for the best gradient-boosting model found by the
# credit-default grid search: 5-fold cross-validated accuracy at five
# training-set sizes.
bestModel = GradientBoostingClassifier(**gridSearchBoosting.best_params_)
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=bestModel,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),
    scoring='accuracy',
    cv=5,
)

# Average the scores over the folds and reshape to long form for seaborn.
lcurvePlotData = pd.DataFrame({'Train': np.mean(train_scores, axis=1), 'Validation': np.mean(valid_scores, axis=1), 'Train size': train_sizes})
lcurvePlotData = lcurvePlotData.melt(id_vars=['Train size'], value_vars=['Train', 'Validation'])
lcurvePlotData.rename(columns={'value': 'Accuracy'}, inplace=True)

sns.set(style="darkgrid")

# The title used to say 'decision tree', copied from the decision-tree
# section; the estimator here is gradient boosting.
sns.lineplot(
    x="Train size", y="Accuracy",
    hue="variable",
    data=lcurvePlotData
    ).set_title('Learning curve - boosting credit default')

## KNN

### MNIST

In [None]:
# Grid search over neighbourhood size and vote weighting for k-nearest
# neighbours on the flattened MNIST images, scored by accuracy.
param_grid = {
    'n_neighbors': np.arange(3, 24, 3),
    'weights': ['uniform', 'distance'],
}
kNN = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
kNN.fit(X_train_MNIST_reshaped, y_train_MNIST)

# Accuracy of the refit best estimator on the MNIST test images.
preds = kNN.predict(X_test_MNIST_reshaped)
print(accuracy_score(preds, y_test_MNIST))

In [None]:
# Mean cross-validated accuracy of the KNN grid search as an
# n_neighbors (rows) x weights (columns) heatmap, rounded to 2 decimals.
cvResultsScore = (
    pd.DataFrame(kNN.cv_results_)
    .loc[:, ['param_n_neighbors', 'param_weights', 'mean_test_score']]
    .assign(mean_test_score=lambda df: df['mean_test_score'].round(2))
    .pivot(index='param_n_neighbors', columns='param_weights', values='mean_test_score')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsScore,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Accuracy')

In [None]:
# Mean fit time of the KNN grid search as an
# n_neighbors (rows) x weights (columns) heatmap, rounded to 2 decimals.
cvResultsTimeTrain = (
    pd.DataFrame(kNN.cv_results_)
    .loc[:, ['param_n_neighbors', 'param_weights', 'mean_fit_time']]
    .assign(mean_fit_time=lambda df: df['mean_fit_time'].round(2))
    .pivot(index='param_n_neighbors', columns='param_weights', values='mean_fit_time')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsTimeTrain,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Training Time')

In [None]:
# Mean scoring time of the KNN grid search as an
# n_neighbors (rows) x weights (columns) heatmap, rounded to 2 decimals.
cvResultsTimeScore = (
    pd.DataFrame(kNN.cv_results_)
    .loc[:, ['param_n_neighbors', 'param_weights', 'mean_score_time']]
    .assign(mean_score_time=lambda df: df['mean_score_time'].round(2))
    .pivot(index='param_n_neighbors', columns='param_weights', values='mean_score_time')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsTimeScore,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Score Time')

In [None]:
# Learning curve for the best KNN configuration found by the grid search,
# refit on growing subsets of the MNIST training data.
bestModel = KNeighborsClassifier(**kNN.best_params_)
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=bestModel,
    X=X_train_MNIST_reshaped,
    y=y_train_MNIST,
    train_sizes=np.linspace(0.1, 1.0, 5),  # five fractions from 10% to 100%
    scoring='accuracy',
    cv=5,
)

# Average the per-fold scores (axis 1) and reshape to long form so that
# seaborn can colour the train/validation curves via `hue`.
lcurvePlotData = pd.DataFrame({
    'Train': np.mean(train_scores, axis=1),
    'Validation': np.mean(valid_scores, axis=1),
    'Train size': train_sizes,
})
lcurvePlotData = lcurvePlotData.melt(id_vars=['Train size'], value_vars=['Train', 'Validation'])
lcurvePlotData.rename(columns={'value': 'Accuracy'}, inplace=True)

sns.set(style="darkgrid")

# Title reflects the model and dataset actually plotted (KNN on MNIST); it
# previously carried a leftover "decision tree credit default" label.
sns.lineplot(
    x="Train size", y="Accuracy",
    hue="variable",
    data=lcurvePlotData,
).set_title('Learning curve - KNN MNIST')

### Credit default

In [None]:
# Grid search for KNN on the credit-default data: neighbourhood sizes
# 3, 6, ..., 21 combined with uniform and distance-weighted voting.
param_grid = {
    'n_neighbors': np.arange(3, 24, 3),
    'weights': ['uniform', 'distance'],
}
kNN = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
)
# Kept as the cell's trailing expression so the fitted search is still echoed.
kNN.fit(X_train, y_train)

In [None]:
# Hard labels and class probabilities from the refit best KNN estimator,
# followed by the test-set accuracy of the hard labels.
preds, probs = kNN.predict(X_test), kNN.predict_proba(X_test)
print(
    accuracy_score(preds, y_test)
)

In [None]:
# Mean cross-validated accuracy of the KNN grid search as an
# n_neighbors (rows) x weights (columns) heatmap, rounded to 2 decimals.
cvResultsScore = (
    pd.DataFrame(kNN.cv_results_)
    .loc[:, ['param_n_neighbors', 'param_weights', 'mean_test_score']]
    .assign(mean_test_score=lambda df: df['mean_test_score'].round(2))
    .pivot(index='param_n_neighbors', columns='param_weights', values='mean_test_score')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsScore,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Accuracy')

In [None]:
# Mean fit time of the KNN grid search as an
# n_neighbors (rows) x weights (columns) heatmap, rounded to 2 decimals.
cvResultsTimeTrain = (
    pd.DataFrame(kNN.cv_results_)
    .loc[:, ['param_n_neighbors', 'param_weights', 'mean_fit_time']]
    .assign(mean_fit_time=lambda df: df['mean_fit_time'].round(2))
    .pivot(index='param_n_neighbors', columns='param_weights', values='mean_fit_time')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsTimeTrain,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Training Time')

In [None]:
# Mean scoring time of the KNN grid search as an
# n_neighbors (rows) x weights (columns) heatmap, rounded to 2 decimals.
cvResultsTimeScore = (
    pd.DataFrame(kNN.cv_results_)
    .loc[:, ['param_n_neighbors', 'param_weights', 'mean_score_time']]
    .assign(mean_score_time=lambda df: df['mean_score_time'].round(2))
    .pivot(index='param_n_neighbors', columns='param_weights', values='mean_score_time')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsTimeScore,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Score Time')

In [None]:
# Learning curve for the best KNN configuration found by the grid search,
# refit on growing subsets of the credit-default training data.
bestModel = KNeighborsClassifier(**kNN.best_params_)
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=bestModel,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),  # five fractions from 10% to 100%
    scoring='accuracy',
    cv=5,
)

# Average the per-fold scores (axis 1) and reshape to long form so that
# seaborn can colour the train/validation curves via `hue`.
lcurvePlotData = pd.DataFrame({
    'Train': np.mean(train_scores, axis=1),
    'Validation': np.mean(valid_scores, axis=1),
    'Train size': train_sizes,
})
lcurvePlotData = lcurvePlotData.melt(id_vars=['Train size'], value_vars=['Train', 'Validation'])
lcurvePlotData.rename(columns={'value': 'Accuracy'}, inplace=True)

sns.set(style="darkgrid")

# Title names the model actually plotted (KNN); it previously carried a
# leftover "decision tree" label.
sns.lineplot(
    x="Train size", y="Accuracy",
    hue="variable",
    data=lcurvePlotData,
).set_title('Learning curve - KNN credit default')

## SVM

### MNIST

In [None]:
# Grid search for an SVM on MNIST over three kernels and iteration caps
# of 10, 20, ..., 90, scored by accuracy.
# NOTE(review): such low max_iter caps presumably stop the solver before
# convergence - confirm this is the intended "iterations" experiment.
param_grid = {
    'kernel': ['poly', 'rbf', 'linear'],
    'max_iter': np.arange(10, 100, 10),
}
SVM = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
).fit(X_train_MNIST_reshaped, y_train_MNIST)

# Accuracy of the refit best estimator on the held-out MNIST test set.
preds = SVM.predict(X_test_MNIST_reshaped)
print(accuracy_score(preds, y_test_MNIST))

In [None]:
# Mean cross-validated accuracy of the SVM grid search as a
# kernel (rows) x max_iter (columns) heatmap, rounded to 2 decimals.
cvResultsScore = (
    pd.DataFrame(SVM.cv_results_)
    .loc[:, ['param_kernel', 'param_max_iter', 'mean_test_score']]
    .assign(mean_test_score=lambda df: df['mean_test_score'].round(2))
    .pivot(index='param_kernel', columns='param_max_iter', values='mean_test_score')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsScore,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Accuracy')

In [None]:
# Mean fit time of the SVM grid search as a
# kernel (rows) x max_iter (columns) heatmap, rounded to 2 decimals.
cvResultsTimeTrain = pd.DataFrame(SVM.cv_results_)
cvResultsTimeTrain = cvResultsTimeTrain[['param_kernel', 'param_max_iter', 'mean_fit_time']]
# Round the fit-time column itself. 'mean_test_score' is not part of the
# selection above (KeyError) and would put accuracies into a timing plot.
cvResultsTimeTrain['mean_fit_time'] = round(cvResultsTimeTrain['mean_fit_time'], 2)
cvResultsTimeTrain = cvResultsTimeTrain.pivot(index='param_kernel', columns='param_max_iter', values='mean_fit_time')
sns.set_context("paper")
sns.heatmap(cvResultsTimeTrain, annot=True, fmt="g", cmap='viridis', linewidths=.5).set_title('Training Time')

In [None]:
# Mean scoring time of the SVM grid search as a
# kernel (rows) x max_iter (columns) heatmap, rounded to 2 decimals.
cvResultsTimeScore = pd.DataFrame(SVM.cv_results_)
# Select, round and pivot the same 'mean_score_time' column: the plot is
# titled 'Score Time', and mixing in 'mean_fit_time' / 'mean_test_score'
# makes the rounding and the pivot reference columns that were dropped.
cvResultsTimeScore = cvResultsTimeScore[['param_kernel', 'param_max_iter', 'mean_score_time']]
cvResultsTimeScore['mean_score_time'] = round(cvResultsTimeScore['mean_score_time'], 2)
cvResultsTimeScore = cvResultsTimeScore.pivot(index='param_kernel', columns='param_max_iter', values='mean_score_time')
sns.set_context("paper")
sns.heatmap(cvResultsTimeScore, annot=True, fmt="g", cmap='viridis', linewidths=.5).set_title('Score Time')

In [None]:
# Learning curve for the best SVM configuration found by the grid search,
# refit on growing subsets of the MNIST training data.
bestModel = svm.SVC(**SVM.best_params_)
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=bestModel,
    X=X_train_MNIST_reshaped,
    y=y_train_MNIST,
    train_sizes=np.linspace(0.1, 1.0, 5),  # five fractions from 10% to 100%
    scoring='accuracy',
    cv=5,
)

# Average the per-fold scores (axis 1) and reshape to long form so that
# seaborn can colour the train/validation curves via `hue`.
lcurvePlotData = pd.DataFrame({
    'Train': np.mean(train_scores, axis=1),
    'Validation': np.mean(valid_scores, axis=1),
    'Train size': train_sizes,
})
lcurvePlotData = lcurvePlotData.melt(id_vars=['Train size'], value_vars=['Train', 'Validation'])
lcurvePlotData.rename(columns={'value': 'Accuracy'}, inplace=True)

sns.set(style="darkgrid")

# Title reflects the model and dataset actually plotted (SVM on MNIST); it
# previously carried a leftover "decision tree credit default" label.
sns.lineplot(
    x="Train size", y="Accuracy",
    hue="variable",
    data=lcurvePlotData,
).set_title('Learning curve - SVM MNIST')

### Credit default

In [None]:
# Grid search for an SVM on the credit-default data over three kernels and
# iteration caps of 10, 20, ..., 90, scored by accuracy.
param_grid = {
    'kernel': ['poly', 'rbf', 'linear'],
    'max_iter': np.arange(10, 100, 10),
}
SVM = GridSearchCV(
    estimator=svm.SVC(),
    param_grid=param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
).fit(X_train, y_train)

# Accuracy of the refit best estimator on the held-out test split.
preds = SVM.predict(X_test)
print(accuracy_score(preds, y_test))

In [None]:
# Mean cross-validated accuracy of the SVM grid search as a
# kernel (rows) x max_iter (columns) heatmap, rounded to 2 decimals.
cvResultsScore = (
    pd.DataFrame(SVM.cv_results_)
    .loc[:, ['param_kernel', 'param_max_iter', 'mean_test_score']]
    .assign(mean_test_score=lambda df: df['mean_test_score'].round(2))
    .pivot(index='param_kernel', columns='param_max_iter', values='mean_test_score')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsScore,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Accuracy')

In [None]:
# Mean fit time of the SVM grid search as a
# kernel (rows) x max_iter (columns) heatmap, rounded to 2 decimals.
cvResultsTimeTrain = (
    pd.DataFrame(SVM.cv_results_)
    .loc[:, ['param_kernel', 'param_max_iter', 'mean_fit_time']]
    .assign(mean_fit_time=lambda df: df['mean_fit_time'].round(2))
    .pivot(index='param_kernel', columns='param_max_iter', values='mean_fit_time')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsTimeTrain,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Training Time')

In [None]:
# Mean scoring time of the SVM grid search as a
# kernel (rows) x max_iter (columns) heatmap, rounded to 2 decimals.
cvResultsTimeScore = (
    pd.DataFrame(SVM.cv_results_)
    .loc[:, ['param_kernel', 'param_max_iter', 'mean_score_time']]
    .assign(mean_score_time=lambda df: df['mean_score_time'].round(2))
    .pivot(index='param_kernel', columns='param_max_iter', values='mean_score_time')
)
sns.set_context("paper")
sns.heatmap(
    cvResultsTimeScore,
    annot=True,
    fmt="g",
    cmap='viridis',
    linewidths=.5,
).set_title('Score Time')

In [None]:
# Learning curve for the best SVM configuration found by the grid search,
# refit on growing subsets of the credit-default training data.
bestModel = svm.SVC(**SVM.best_params_)
train_sizes, train_scores, valid_scores = learning_curve(
    estimator=bestModel,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 5),  # five fractions from 10% to 100%
    scoring='accuracy',
    cv=5,
)

# Average the per-fold scores (axis 1) and reshape to long form so that
# seaborn can colour the train/validation curves via `hue`.
lcurvePlotData = pd.DataFrame({
    'Train': np.mean(train_scores, axis=1),
    'Validation': np.mean(valid_scores, axis=1),
    'Train size': train_sizes,
})
lcurvePlotData = lcurvePlotData.melt(id_vars=['Train size'], value_vars=['Train', 'Validation'])
lcurvePlotData.rename(columns={'value': 'Accuracy'}, inplace=True)

sns.set(style="darkgrid")

# Title names the model actually plotted (SVM); it previously carried a
# leftover "decision tree" label.
sns.lineplot(
    x="Train size", y="Accuracy",
    hue="variable",
    data=lcurvePlotData,
).set_title('Learning curve - SVM credit default')

# END