In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Make the project's local `scripts` directory importable so that the
# `preprocessing` and `methods` modules imported below can be found.
# NOTE(review): this is an absolute, machine-specific path; it must be edited
# before the notebook can run on another machine.
import sys
sys.path.append('C:/Users/bowen/Documents/Data Science from Scratch/scripts/')

In [3]:
import os
import numpy as np
import pandas as pd
from preprocessing import Preprocessor
from methods import Method, Validator

In [4]:
# Load the spaceship-titanic train/test CSVs and separate the features from the label.
# NOTE(review): DATA_DIR is an absolute, machine-specific path; edit it to run elsewhere.
DATA_DIR = 'C:/Users/bowen/Documents/Data Science from Scratch/data/spaceship-titanic'
LABEL_COL = 'Transported'

trainData = pd.read_csv(DATA_DIR + '/train.csv')
testData = pd.read_csv(DATA_DIR + '/test.csv')

# The feature frames are explicit copies: later cells add engineered columns to
# them, and writing into a column subset of trainData/testData can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
# Every column except the label is kept, in its original order.
trainXdata = trainData.loc[:, trainData.columns != LABEL_COL].copy()
testXdata = testData.loc[:, testData.columns != LABEL_COL].copy()

# Training target cast to float.
trainYData = trainData[LABEL_COL].astype(float)

In [5]:
# Engineered features for the training frame.
# The new columns are attached with a single `.assign`, which returns a fresh
# frame instead of writing into `trainXdata` in place. That avoids
# chained-assignment warnings if `trainXdata` was derived from another frame,
# and makes the cell safe to re-run.
trainCabinParts = trainXdata['Cabin'].str.split('/', expand=True)
trainIdParts = trainXdata['PassengerId'].str.split('_', expand=True)
# Fail fast (as the original multi-column assignment did) if a split is malformed.
assert trainCabinParts.shape[1] == 3, "expected 'Cabin' to split into exactly 3 parts on '/'"
assert trainIdParts.shape[1] == 2, "expected 'PassengerId' to split into exactly 2 parts on '_'"

trainSpendCols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

trainXdata = trainXdata.assign(
    # 'Cabin' is split on '/' into three parts.
    Deck=trainCabinParts[0],
    RoomNo=trainCabinParts[1],
    ShipSide=trainCabinParts[2],
    # 'PassengerId' is split on '_' into two parts.
    # NOTE(review): 'GroupSize' is simply the second part of the id (still a string);
    # confirm that it really represents a group size rather than an index within the group.
    GroupNo=trainIdParts[0],
    GroupSize=trainIdParts[1],
    # Age flags; a missing Age compares False and therefore maps to 0.
    Child=(trainXdata['Age'] <= 18).astype(int),
    Infant=(trainXdata['Age'] <= 5).astype(int),
    # Row-wise total of the spending columns (missing values are skipped by pandas' sum).
    TotalSpent=trainXdata[trainSpendCols].sum(axis=1),
)

In [6]:
# Engineered features for the test frame (same derivations as for the training frame).
# The new columns are attached with a single `.assign`, which returns a fresh
# frame instead of writing into `testXdata` in place. That avoids
# chained-assignment warnings if `testXdata` was derived from another frame,
# and makes the cell safe to re-run.
testCabinParts = testXdata['Cabin'].str.split('/', expand=True)
testIdParts = testXdata['PassengerId'].str.split('_', expand=True)
# Fail fast (as the original multi-column assignment did) if a split is malformed.
assert testCabinParts.shape[1] == 3, "expected 'Cabin' to split into exactly 3 parts on '/'"
assert testIdParts.shape[1] == 2, "expected 'PassengerId' to split into exactly 2 parts on '_'"

testSpendCols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

testXdata = testXdata.assign(
    # 'Cabin' is split on '/' into three parts.
    Deck=testCabinParts[0],
    RoomNo=testCabinParts[1],
    ShipSide=testCabinParts[2],
    # 'PassengerId' is split on '_' into two parts.
    # NOTE(review): 'GroupSize' is simply the second part of the id (still a string);
    # confirm that it really represents a group size rather than an index within the group.
    GroupNo=testIdParts[0],
    GroupSize=testIdParts[1],
    # Age flags; a missing Age compares False and therefore maps to 0.
    Child=(testXdata['Age'] <= 18).astype(int),
    Infant=(testXdata['Age'] <= 5).astype(int),
    # Row-wise total of the spending columns (missing values are skipped by pandas' sum).
    TotalSpent=testXdata[testSpendCols].sum(axis=1),
)

In [7]:
# Preprocessing configuration, spelled out one option per line.
preprocessorConfig = dict(
    dropCols=['Name', 'Cabin', 'PassengerId', 'RoomNo', 'GroupNo', 'FoodCourt', 'ShoppingMall'],
    onehotCols=['HomePlanet', 'Destination', 'ShipSide'],
    directCols=['VIP', 'CryoSleep', 'GroupSize'],
    ordinalCols={'Deck': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']},
    splitterMethod='kfold',
    splitGroups=5,
    imputerMethod='lazy',
    normalizerMethod='zscore',
    bagging=True,
)
data = Preprocessor(**preprocessorConfig)

# Run the preprocessor on the training features and labels. The validation
# cells below read the resulting splits from `data.splitter`.
data(trainXdata, trainYData)

# Transform the full training and test frames, passing the means/stds stored on
# `data.normalizer` so both are scaled with the same statistics.
normXtrain, normXtest = [
    data.transform(frame, means=data.normalizer.means, stds=data.normalizer.stds)
    for frame in (trainXdata, testXdata)
]

In [None]:
# Logistic regression with binary cross-entropy, optimized with 'gd'
# (presumably plain gradient descent; see the `methods` module).
gdOptimizerParams = {'gradThreshold': 1e-5}
method = Method(
    modelMethod='logreg',
    lossMethod='binary_cross_entropy',
    optimizerMethod='gd',
    modelParams={},
    lossParams={},
    optimizerParams=gdOptimizerParams,
)

reportedMetrics = ('accuracy', 'precision', 'recall')
validator = Validator(method=method, diagnosticParams={'metrics': reportedMetrics})

# Validate on the train/validation splits held by the preprocessor's splitter,
# using binary predictions at a 0.5 threshold.
splitter = data.splitter
validator(
    trainXs=splitter.trainX,
    trainYs=splitter.trainY,
    validationXs=splitter.validationX,
    validationYs=splitter.validationY,
    binary=True,
    threshold=0.5,
)

In [None]:
# Refit the current `method` on the full normalized training set, then score the test set.
method(X=normXtrain, y=trainYData)
testScores = method.predict(normXtest)

# Hard 0/1 labels at a fixed 0.5 cutoff.
# (A quantile-based cutoff, np.quantile(yHat, q=1-trainYData.mean()), was tried as an alternative.)
yHat = (testScores > 0.5) * 1

# Two-column submission: passenger id next to its predicted label.
submissionRows = np.vstack((testData['PassengerId'], yHat)).T
testSubmission = pd.DataFrame(submissionRows, columns=['PassengerId', 'Transported'])
testSubmission.to_csv('gd_submission.csv', index=False)

In [None]:
# Logistic regression with binary cross-entropy, optimized with 'mgd'.
# Stopping is controlled by the loss threshold and the epoch cap; the gradient
# threshold is set to 0.
mgdOptimizerParams = {
    'lossThreshold': 2e-6,
    'maxEpochs': 10000,
    'gradThreshold': 0,
}
method = Method(
    modelMethod='logreg',
    lossMethod='binary_cross_entropy',
    optimizerMethod='mgd',
    modelParams={},
    lossParams={},
    optimizerParams=mgdOptimizerParams,
)

reportedMetrics = ('accuracy', 'precision', 'recall')
validator = Validator(method=method, diagnosticParams={'metrics': reportedMetrics})

# Validate on the train/validation splits held by the preprocessor's splitter,
# using binary predictions at a 0.5 threshold.
splitter = data.splitter
validator(
    trainXs=splitter.trainX,
    trainYs=splitter.trainY,
    validationXs=splitter.validationX,
    validationYs=splitter.validationY,
    binary=True,
    threshold=0.5,
)

In [None]:
# Refit the current `method` on the full normalized training set, then score the test set.
method(X=normXtrain, y=trainYData)
testScores = method.predict(normXtest)

# Hard 0/1 labels at a fixed 0.5 cutoff.
# (A quantile-based cutoff, np.quantile(yHat, q=1-trainYData.mean()), was tried as an alternative.)
yHat = (testScores > 0.5) * 1

# Two-column submission: passenger id next to its predicted label.
submissionRows = np.vstack((testData['PassengerId'], yHat)).T
testSubmission = pd.DataFrame(submissionRows, columns=['PassengerId', 'Transported'])
testSubmission.to_csv('mgd_submission.csv', index=False)

In [None]:
# Logistic regression with binary cross-entropy, optimized with 'sa'
# (presumably simulated annealing, given the temperature/coolingRate options).
saOptimizerParams = {
    'lossThreshold': 1e-5,
    'maxEpochs': 10000,
    'gradThreshold': 0,
    'coolingRate': 1e-6,
    'temperature': 0.05,
    'initialGuess': 'zeros',
    'stepSize': 1,
}
method = Method(
    modelMethod='logreg',
    lossMethod='binary_cross_entropy',
    optimizerMethod='sa',
    modelParams={},
    lossParams={},
    optimizerParams=saOptimizerParams,
)

reportedMetrics = ('accuracy', 'precision', 'recall')
validator = Validator(method=method, diagnosticParams={'metrics': reportedMetrics})

# Validate on the train/validation splits held by the preprocessor's splitter,
# using binary predictions at a 0.5 threshold.
splitter = data.splitter
validator(
    trainXs=splitter.trainX,
    trainYs=splitter.trainY,
    validationXs=splitter.validationX,
    validationYs=splitter.validationY,
    binary=True,
    threshold=0.5,
)

In [None]:
# Refit the current `method` on the full normalized training set, then score the test set.
method(X=normXtrain, y=trainYData)
testScores = method.predict(normXtest)

# Hard 0/1 labels at a fixed 0.5 cutoff.
# (A quantile-based cutoff, np.quantile(yHat, q=1-trainYData.mean()), was tried as an alternative.)
yHat = (testScores > 0.5) * 1

# Two-column submission: passenger id next to its predicted label.
submissionRows = np.vstack((testData['PassengerId'], yHat)).T
testSubmission = pd.DataFrame(submissionRows, columns=['PassengerId', 'Transported'])
testSubmission.to_csv('sa_submission.csv', index=False)

In [64]:
# 'rf' optimizer (presumably a random forest, given the nTrees/nCols/nRows options)
# configured with MSE loss and classification disabled. The validator thresholds
# its output at 0.5 for the binary metrics.
rfOptimizerParams = {
    'maxTreeDepth': 10,
    'minGroupSize': 5,
    'nCols': 0.33,
    'nRows': 0.67,
    'nTrees': 5,
    'splitMethod': 'histogram',
    'classification': False,
    'lossMethod': 'mse',
}
method = Method(
    modelMethod='logreg',
    lossMethod='mse',
    optimizerMethod='rf',
    modelParams={},
    lossParams={},
    optimizerParams=rfOptimizerParams,
)

reportedMetrics = ('accuracy', 'precision', 'recall')
validator = Validator(method=method, diagnosticParams={'metrics': reportedMetrics})

# Validate on the train/validation splits held by the preprocessor's splitter.
splitter = data.splitter
validator(
    trainXs=splitter.trainX,
    trainYs=splitter.trainY,
    validationXs=splitter.validationX,
    validationYs=splitter.validationY,
    binary=True,
    threshold=0.5,
)

Runtime: 2.66, Accuracy: 69.793, Precision: 68.008, Recall: 76.557
Runtime: 3.42, Accuracy: 73.763, Precision: 78.755, Recall: 66.061
Runtime: 1.91, Accuracy: 73.59, Precision: 76.119, Recall: 69.625
Runtime: 1.83, Accuracy: 73.13, Precision: 73.904, Recall: 74.64
Runtime: 1.88, Accuracy: 68.7, Precision: 66.235, Recall: 72.663
Average Runtime: 2.34, Runtime Variance: 0.38508, Average Accuracy: 71.795, Accuracy Variance: 4.49286, Average Precision: 72.604, Precision Variance: 22.71348, Average Recall: 71.909, Recall Variance: 13.80931


In [65]:
# Refit the current `method` on the full normalized training set, then score the test set.
method(X=normXtrain, y=trainYData)
testScores = method.predict(normXtest)

# Threshold at 0.5. Note that the labels stay boolean here; they are not multiplied by 1.
# (A quantile-based cutoff, np.quantile(yHat, q=1-trainYData.mean()), was tried as an alternative.)
yHat = testScores > 0.5

# Two-column submission frame; writing it to disk is currently disabled.
submissionRows = np.vstack((testData['PassengerId'], yHat)).T
testSubmission = pd.DataFrame(submissionRows, columns=['PassengerId', 'Transported'])
# testSubmission.to_csv('rf_submission.csv', index=False)

In [None]:
# 'gb' optimizer (presumably gradient boosting, given the maxBoosterDepth/lr options)
# configured with MSE loss and classification disabled. The validator thresholds
# its output at 0.5 for the binary metrics.
gbOptimizerParams = {
    'maxTreeDepth': 2,
    'minGroupSize': 5,
    'maxBoosterDepth': 10,
    'splitMethod': 'histogram',
    'classification': False,
    'lossMethod': 'mse',
    'lr': 0.01,
}
method = Method(
    modelMethod='logreg',
    lossMethod='mse',
    optimizerMethod='gb',
    modelParams={},
    lossParams={},
    optimizerParams=gbOptimizerParams,
)

reportedMetrics = ('accuracy', 'precision', 'recall')
validator = Validator(method=method, diagnosticParams={'metrics': reportedMetrics})

# Validate on the train/validation splits held by the preprocessor's splitter.
splitter = data.splitter
validator(
    trainXs=splitter.trainX,
    trainYs=splitter.trainY,
    validationXs=splitter.validationX,
    validationYs=splitter.validationY,
    binary=True,
    threshold=0.5,
)

Runtime: 3.32, Accuracy: 55.121, Precision: 53.368, Recall: 92.412
Runtime: 3.12, Accuracy: 51.956, Precision: 54.212, Recall: 33.598
Runtime: 3.41, Accuracy: 54.488, Precision: 57.774, Recall: 37.201


In [None]:
# Majority-vote ensemble over every per-model '*_submission.csv' found in the
# working directory (a previously written ensemble file is excluded).
# The list is sorted so that the row order of `yPreds` is reproducible;
# os.listdir() returns entries in arbitrary order.
submissionFiles = sorted(
    name for name in os.listdir()
    if '_submission.csv' in name and 'ensemble' not in name
)
if not submissionFiles:
    # Without this guard the mean over zero files is NaN and an all-False
    # "ensemble" would be written silently.
    raise FileNotFoundError(
        "No '*_submission.csv' files found in the working directory; "
        "run the model submission cells first."
    )

# Size the vote matrix from the test set itself rather than from a `yHat` left
# over from whichever model cell ran last.
nTest = len(testData)
yPreds = np.zeros((len(submissionFiles), nTest))  # one row per submission file
for i, fileName in enumerate(submissionFiles):
    yPreds[i] = pd.read_csv(fileName)['Transported']

# Vote: the mean of the labels, rounded, then compared with 0. np.round rounds
# halves to the nearest even value, so an exact 50/50 split resolves to False.
ensembleVote = np.round(np.mean(yPreds, axis=0)) > 0
testSubmission = pd.DataFrame(np.vstack((testData['PassengerId'], ensembleVote)).T,
                              columns=['PassengerId', 'Transported'])
testSubmission.to_csv('ensemble_submission.csv', index=False)

# A passenger counts as "disagreed" when any file deviates from the mean vote.
disagreed = np.sum(np.abs(yPreds - np.mean(yPreds, axis=0)), axis=0) > 0
agree = np.round((1 - np.sum(disagreed) / nTest) * 100, 2)

print(f'Percent Perfect Agreement: {agree}%')
print(f'Total Variance Between Methods: {np.round(np.mean(np.var(yPreds, axis=0)), 4)}')

In [None]:
# Total absolute deviation of each submission file's labels from the mean vote
# (one value per row of `yPreds`); larger means that file disagrees more often.
np.abs(yPreds - yPreds.mean(axis=0)).sum(axis=1)