In [2]:
import numpy as np
import pandas as pd
import random
import os
import math
from collections import Counter
from sklearn.model_selection import train_test_split

In [119]:
class Data:
    def __init__(self, knownFile, resultCol, validFile):
        self.resultCol = resultCol
        self.knownCols, self.knownIn, self.knownOut = self.extract(knownFile)
        self.validCols, self.validIn, __ = self.extract(validFile)
        self.check = self.dataCheck()
    
    def extract(self, file):
        data = pd.read_csv(file)
        result = data[self.resultCol] if self.resultCol in data.columns else None
        cols = [x for x in data.columns if x != self.resultCol]
        inputs = data[cols]
        return cols, inputs, result
    
    def dataCheck(self):
        if self.knownCols != self.validCols:
            print('Input columns do not match!')
            return False
        
        return True
    
class Branch:
    def __init__(self, col=None, val=None, left=None, right=None, result = None):
        self.col, self.val, self.left, self.right, self.result = col, val, left, right, result
    
class DecisionTree:
    def __init__(self, dataIn, dataOut, cutoffs, maxDepth, nTest):
        self.x, self.y, self.cutoffs = dataIn, dataOut, cutoffs
        self.maxDepth, self.nTest = maxDepth, nTest
        self.tree = [[] for i in range(maxDepth)]
        self.random_state = random.randint(0,1000)
        
        self._split()
        self.tree = self._create_tree(self.x, self.y)
        self.weight = self._accuracy()
        
    def _split(self):
        locants = [i for i in range(len(self.x))]
        random.shuffle(locants)
        
        self.trainIn, self.testIn, self.trainOut, self.testOut = train_test_split(self.x, self.y, test_size=self.nTest, \
                                                                                  random_state=self.random_state)
    
    def _create_tree(self, subX, subY, depth = 0):
        if depth >= self.maxDepth:
            result = self._find_result(subY)
            return Branch(result=result)
        
        bestEntropy = -1
        bulkEntropy = self._entropy(np.mean(subY))
        
        for col in subX.columns:
            for val in self.cutoffs[col]:
                mask = subX[col] > val
                    
                maskRat = np.sum(mask) / len(subY)
                posEntropy = self._entropy(np.mean(subY[mask]))
                negEntropy = self._entropy(np.mean(subY[~mask]))

                totEntropy = bulkEntropy - (maskRat * posEntropy + (1 - maskRat) * negEntropy)
                
                if totEntropy > bestEntropy:
                    bestEntropy, bestCol, bestVal, bestMask = totEntropy, col, val, mask
                
        try:
            left = self._create_tree(subX[bestMask], subY[bestMask], depth + 1)
            right = self._create_tree(subX[~bestMask], subY[~bestMask], depth + 1)
            return Branch(bestCol, bestVal, left, right)
        except:
            result = self._find_result(subY)
            return Branch(result=result)
            
    def _entropy(self, p):
        return -(p * math.log2(max(p,1e-7)) + (1-p) * math.log2(max(1-p,1e-7)))
    
    def _find_result(self, y):
        counter = Counter(y)
        return counter.most_common(1)[0][0]
    
    def _accuracy(self):
        xPred = np.array([self._climb_tree(self.testIn.iloc[i], self.tree) for i in range(len(self.testIn))])
        return np.sum(np.array(xPred) == np.array(self.testOut)) / len(xPred)
    
    def _predict(self, x):
        return np.array([self._climb_tree(x.iloc[i], self.tree) for i in range(len(x))])
        
    def _climb_tree(self, x, branch):
        if branch.result != None:
            return branch.result
        
        if x[branch.col] > branch.val:
            return self._climb_tree(x, branch.left)
        else:
            return self._climb_tree(x, branch.right)
    
class RandomForest:
    def __init__(self, data, nTrees = 200, maxDepth = 2, nCols = 0.2, nTest = 0.2, ignore = []):
        self.nTrees, self.maxDepth, self.nCols, self.nTest, self.ignore = nTrees, maxDepth, nCols, nTest, ignore
        self.inCols, self.resultCol = data.knownIn.columns, data.resultCol
        self.knownIn, self.knownOut, self.validIn = data.knownIn, data.knownOut, data.validIn
        self.trees = []
        self._create_tree_vector = np.vectorize(self._create_tree)
        
        self._find_cutoffs()
        self._create_forest()
#         self._predict(self.validIn)
        
    def _find_cutoffs(self):
        self.cutoffs = {}
        for col in self.inCols:
            if col in self.ignore:
                continue
            colVals = sorted(list(set(self.knownIn[col])))
            colVals = [x if x != np.nan else -np.inf for x in colVals]
            self.cutoffs[col] = np.diff(colVals) / 2 + np.array(colVals[:-1])
                
    def _select_cols(self):
        return random.sample(list(self.cutoffs.keys()), int(self.nCols * len(self.cutoffs)))
    
    def _create_tree(self, idx=None):
        cols = self._select_cols()
        return DecisionTree(self.knownIn[cols], self.knownOut, self.cutoffs, self.maxDepth, self.nTest)
    
    def _create_forest(self):
        self.trees = self._create_tree_vector([i for i in range(self.nTrees)])
            
#         for i in range(self.nTrees):
#             self.trees.append()
            
    def _predict(self, x):
        prediction, totWeight = np.zeros(len(x)), 0
        for i in range(len(self.trees)):
            prediction += (self.trees[i]._predict(x)) * self.trees[i].weight
            totWeight += self.trees[i].weight
            
        self.prediction = prediction / totWeight
        self.roundedPredictions = np.round(self.prediction)
        
def adjust(data):
    expenses = ['FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService']
    
#     for col in expenses:
#         data.loc[(data[col].isna()) & (data['CryoSleep'] != True), col] = data[col][data['CryoSleep'] != True].mean()
#         data[col] = data[col].where(data[col].notna(), 0)
        
    data['Group'], data['GroupNo'] = [[int(x.split('_')[i]) for x in data['PassengerId']] for i in range(2)]
    data['Deck'], data['DeckNo'], data['Side'] = [[x.split('/')[i] if '/' in str(x) else np.nan \
                                                   for x in data['Cabin']] for i in range(3)]
    data['DeckNo'] = np.round(data['DeckNo'].astype(float), -1)
    data = pd.merge(data, data['Group'].value_counts().to_frame(name='GroupSize'), how='inner', on='Group')
    data['RoomService'], data['FoodCourt'], data['ShoppingMall'], data['Spa'], data['VRDeck'], data['Group'] = \
            np.round(data['RoomService'], -2), np.round(data['FoodCourt'], -2), np.round(data['ShoppingMall'], -2), \
            np.round(data['Spa'], -2), np.round(data['VRDeck'], -2), np.round(data['Group'], -2)
    data['TotalSpending'] = data['FoodCourt'] + data['ShoppingMall'] + data['Spa'] + data['VRDeck'] + \
                            data['RoomService']
    
#     data['CryoSleep'] = data['CryoSleep'].where(data['CryoSleep'].notna(), data['TotalSpending'] == 0)
#     data.loc[(data['HomePlanet'].isna()) & (data['TotalSpending'] > 3000), 'HomePlanet'] = 'Europa'
    
#     data['Destination'] = data['Destination'].where(data['Destination'].notna(), \
#                                                     data['Destination'].value_counts().idxmax())
#     data['Age'] = data['Age'].where(data['Age'].notna(), data['Age'].median())
#     data['DeckNo'] = data['DeckNo'].where(data['DeckNo'].notna(), data['DeckNo'].median())
    
#     data.loc[(data['HomePlanet'].isna()) & (data['TotalSpending'] > 3000), 'HomePlanet'] = 'Europa'
#     data.loc[(data['HomePlanet'].isna()) & (data['Deck'] == 'F'), 'HomePlanet'] = 'Mars'
#     data['HomePlanet'] = data['HomePlanet'].where(data['HomePlanet'].notna(), 'Earth')
    
#     data.loc[(data['VIP'].isna()) & (data['TotalSpending'] > 4000), 'VIP'] = True
#     data.loc[(data['VIP'].isna()) & (data['HomePlanet'] == 'Europa'), 'VIP'] = True
#     data['VIP'] = data['VIP'].where(data['VIP'].notna(), False)
    
#     data['HomePlanet'] = data['HomePlanet'].where(data['HomePlanet'].notna(), data['HomePlanet'].value_counts().idxmax())
#     data.loc[(data['Deck'].isna()) & (data['VIP'] == True), 'Deck'] = 'A'
#     data.loc[(data['Deck'].isna()) & (data['HomePlanet'] == 'Europa') & (data['TotalSpending'] < 3500), 'Deck'] = 'B' # All Europa
#     data.loc[(data['Deck'].isna()) & (data['HomePlanet'] == 'Europa'), 'Deck'] = 'C' # All Europa, bigger spenders than B
#     data.loc[(data['Deck'].isna()) & (data['HomePlanet'] == 'Mars'), 'Deck'] = 'D'
#     data.loc[(data['Deck'].isna()) & (data['CryoSleep'] == False) & (data['HomePlanet'] == 'Europa'), 'Deck'] = 'E'
#     data.loc[(data['Deck'].isna()) & (data['CryoSleep'] == True) & (data['TotalSpending'] < 1000), 'Deck'] = 'G'
#     data.loc[(data['Deck'].isna()) & (data['GroupSize'] == 1), 'Deck'] = 'T'
#     data['Deck'] = data['Deck'].where(data['Deck'].notna(), 'F')
#     data.loc[(data['Deck'].isna()) & (data['CryoSleep'] == True), 'Deck'] = 'G'
#     data['Deck'] = data['Deck'].where(data['Deck'].notna(), 'F')
    
#     data = pd.get_dummies(data, columns=['HomePlanet', 'Destination', 'Deck', 'Side'])
    return data

In [120]:
data = Data('train.csv', 'Transported', 'test.csv')
data.knownIn = adjust(data.knownIn)
data.validIn = adjust(data.validIn)

In [None]:
# print(data.knownIn['Deck'].value_counts())
# print(data.knownIn.groupby(['Deck'])['TotalSpending'].mean())
# print(data.knownIn.groupby(['Deck'])['CryoSleep'].value_counts())
# data.knownIn.groupby(['Deck'])['HomePlanet'].value_counts()

In [122]:
ignore = ['PassengerId', 'Name', 'Cabin', 'Side', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'RoomService', \
             'Group', 'GroupNo', 'DeckNo']

for col in data.knownIn.columns:
    if col not in ignore:
        nans = round(np.mean(data.knownIn[col].isna())*100, 2)
        zeros = round(np.mean(data.knownIn[col] == 0)*100, 2)
        corr = 0 #round(data.knownIn[col].corr(data.knownOut)*100, 2)
        print(f'{col},\n   NaN:  {nans}%\n   Zero: {zeros}%\n   Corr: {corr}%')

HomePlanet,
   NaN:  2.31%
   Zero: 0.0%
   Corr: 0%
CryoSleep,
   NaN:  2.5%
   Zero: 62.57%
   Corr: 0%
Destination,
   NaN:  2.09%
   Zero: 0.0%
   Corr: 0%
Age,
   NaN:  2.06%
   Zero: 2.05%
   Corr: 0%
VIP,
   NaN:  2.34%
   Zero: 95.38%
   Corr: 0%
RoomService,
   NaN:  2.08%
   Zero: 73.69%
   Corr: 0%
FoodCourt,
   NaN:  2.11%
   Zero: 72.25%
   Corr: 0%
ShoppingMall,
   NaN:  2.39%
   Zero: 75.59%
   Corr: 0%
Spa,
   NaN:  2.11%
   Zero: 72.7%
   Corr: 0%
VRDeck,
   NaN:  2.16%
   Zero: 73.67%
   Corr: 0%
Deck,
   NaN:  2.29%
   Zero: 0.0%
   Corr: 0%
GroupSize,
   NaN:  0.0%
   Zero: 0.0%
   Corr: 0%
TotalSpending,
   NaN:  10.45%
   Zero: 37.35%
   Corr: 0%


In [105]:
corrs, col1, col2 = [], [], []

for i in range(len(data.knownIn.columns)):
    j = i + 1
    while True:
        if j >= len(data.knownIn.columns) - 1:
            break
        
        if data.knownIn.columns[i] not in ignore and data.knownIn.columns[j] not in ignore:
            try:
                corr = data.knownIn[data.knownIn.columns[i]].corr(data.knownIn[data.knownIn.columns[j]])
                corrs.append(abs(corr)), col1.append(data.knownIn.columns[i]), col2.append(data.knownIn.columns[j])
                print(data.knownIn.columns[i], data.knownIn.columns[j], round(corr * 100, 2))
            except Exception as e:
                pass
            
        j += 1

CryoSleep Age -7.5
CryoSleep VIP -8.65
CryoSleep GroupSize 8.62
CryoSleep TotalSpending -39.15
CryoSleep HomePlanet_Earth -10.65
CryoSleep HomePlanet_Europa 9.18
CryoSleep HomePlanet_Mars 3.33
CryoSleep Destination_55 Cancri e 6.46
CryoSleep Destination_PSO J318.5-22 8.72
CryoSleep Destination_TRAPPIST-1e -11.21
CryoSleep Deck_A -3.29
CryoSleep Deck_B 13.45
CryoSleep Deck_C 2.33
CryoSleep Deck_D -6.45
CryoSleep Deck_E -11.13
CryoSleep Deck_F -22.75
CryoSleep Deck_G 25.9
CryoSleep Deck_T -5.22
CryoSleep Side_P -2.18
Age VIP 9.4
Age GroupSize -17.7
Age TotalSpending 18.43
Age HomePlanet_Earth -20.19
Age HomePlanet_Europa 21.81
Age HomePlanet_Mars 1.58
Age Destination_55 Cancri e 1.68
Age Destination_PSO J318.5-22 -2.81
Age Destination_TRAPPIST-1e 0.28
Age Deck_A 7.46
Age Deck_B 10.55
Age Deck_C 13.04
Age Deck_D 7.72
Age Deck_E 2.82
Age Deck_F -2.12
Age Deck_G -21.32
Age Deck_T 0.48
Age Side_P -1.16
VIP GroupSize 0.72
VIP TotalSpending 19.04
VIP HomePlanet_Earth -16.84
VIP HomePlanet_Euro

In [106]:
topCorrs = sorted(range(len(corrs)), key=lambda i: corrs[i])[-10:]

for idx in topCorrs:
    print(round(corrs[idx], 3), col1[idx], col2[idx])

0.396 HomePlanet_Europa Deck_F
0.414 TotalSpending HomePlanet_Europa
0.452 Deck_F Deck_G
0.487 Destination_PSO J318.5-22 Destination_TRAPPIST-1e
0.536 HomePlanet_Europa Deck_C
0.552 HomePlanet_Europa Deck_B
0.557 HomePlanet_Earth HomePlanet_Mars
0.59 HomePlanet_Earth Deck_G
0.634 HomePlanet_Earth HomePlanet_Europa
0.783 Destination_55 Cancri e Destination_TRAPPIST-1e


In [111]:
forest = RandomForest(data, nTrees = 200, maxDepth = 2, ignore = ignore)

In [110]:
np.mean([forest.trees[i].weight for i in range(len(forest.trees))])

0.6215641173087981

In [None]:
df = pd.DataFrame({'PassengerId': data.validIn['PassengerId'], 'Transported': forest.prediction > 0.5})
df.to_csv('random_forest_submission.csv', index=False)