# Titanic spaceship - ML models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import statsmodels.api as sm

In [2]:
data = pd.read_csv('spaceship_rf.csv', index_col = 'PassengerId')
data.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group_count,Single,Deck,Side,Expenses,Age_cut,Expenses_cut
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0001_01,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,1,0,0.0,3,0
0002_01,0,0,2,24.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1,1,5,1,736.0,2,1
0003_01,1,0,2,58.0,1,1.0,1.0,0.0,1.0,1.0,0.0,2,0,0,1,10383.0,4,4
0003_02,1,0,2,33.0,0,0.0,1.0,1.0,1.0,1.0,0.0,2,0,0,1,5176.0,3,4
0004_01,0,0,2,16.0,0,1.0,1.0,1.0,1.0,1.0,1.0,1,1,5,1,1091.0,2,2


In [3]:
train_df = data.loc[data['Transported'].isna() == False]
test_df = data.loc[data['Transported'].isna() == True]
print(train_df.shape)
print(test_df.shape)

(8693, 18)
(4277, 18)


In [4]:
features = train_df.columns.tolist()
target = 'Transported'
features.remove('Transported')

In [5]:
features.remove('Age_cut')
features.remove('Expenses_cut')

In [6]:
train_df = train_df.astype(int)

Random Forests Wrapper

In [7]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

def CVTestRF(nFolds = 5, randomState=2137, debug=False,  features=features, saveModels = False, *args, **kwargs):
    
    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)

    testAUCResults = []
    trainAUCResults = []
    testACCResults = []
    trainACCResults = []
    predictions = []
    indices = []
    models = []
    
    for train, test in kf.split(train_df.index.values):
        # Preparing the estimator
        clf = RandomForestClassifier(*args, **kwargs, random_state=randomState, n_jobs=-1)
        # Show fuction execution
        if debug:
            print(clf)
        
        # Training the model
        clf.fit(train_df.iloc[train][features], train_df.iloc[train][target])

        # Prediction for train and test data
        # Sklearn returns two columns of probs for both classes!
        predsTrain = clf.predict_proba(train_df[features].iloc[train])[:,1]
        predsTest = clf.predict_proba(train_df[features].iloc[test])[:,1]
        predsTrain_binary = clf.predict(train_df[features].iloc[train])
        predsTest_binary = clf.predict(train_df[features].iloc[test])
        
        # Storing test predictions for each fold
        predictions.append(predsTest.tolist().copy())
        
        # With indices
        indices.append(train_df.iloc[test].index.tolist().copy())
        
        # Metrics for each fold
        trainAUCScore = roc_auc_score((train_df[target].iloc[train]), predsTrain)
        testAUCScore = roc_auc_score((train_df[target].iloc[test]), predsTest)
        trainACCScore = accuracy_score(train_df[target].iloc[train], predsTrain_binary)
        testACCScore = accuracy_score(train_df[target].iloc[test], predsTest_binary)
        
        # Storing results
        trainAUCResults.append(trainAUCScore)
        testAUCResults.append(testAUCScore)
        trainACCResults.append(trainACCScore)
        testACCResults.append(testACCScore)
        
        # Optional show results
        if debug:
            print("Train AUC:", trainScore,
                  "Valid AUC:", testScore)
            
        # Optional save models
        if saveModels:
            models.append(clf)
        
    return trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models

In [8]:
trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF()
print('Mean train AUC:', np.mean(trainAUCResults).round(3), '| Train AUC std:', np.std(trainAUCResults).round(5))
print('Mean test AUC:', np.mean(testAUCResults).round(3), '| Test AUC std:', np.std(testAUCResults).round(5))
print('Mean train ACC:', np.mean(trainACCResults).round(3), '| Train ACC std:', np.std(trainACCResults).round(5))
print('Mean test ACC:', np.mean(testACCResults).round(3), '| Test ACC std:', np.std(testACCResults).round(5))

Mean train AUC: 0.994 | Train AUC std: 0.00039
Mean test AUC: 0.79 | Test AUC std: 0.00833
Mean train ACC: 0.966 | Train ACC std: 0.00095
Mean test ACC: 0.728 | Test ACC std: 0.01342


In [9]:
results = [['RF raw', np.mean(testAUCResults).round(3), np.mean(testACCResults).round(3)]]

It seems that model is clearly overfitted but we have not performed any feature selection yet. Let's check feature importance.

# Feature selection

In [10]:
trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF(saveModels = True)

imps = list(zip(models[0].feature_importances_, features))
imps.sort(reverse=True)
imps

[(0.25203794785468847, 'Age'),
 (0.23602670221127642, 'Expenses'),
 (0.08890748768087212, 'CryoSleep'),
 (0.08191664941167012, 'Deck'),
 (0.05354262096519694, 'Group_count'),
 (0.04382021652983118, 'Spa'),
 (0.04227700461154378, 'RoomService'),
 (0.04054111065946563, 'HomePlanet'),
 (0.0343246344481739, 'Destination'),
 (0.033129803182800587, 'VRDeck'),
 (0.026594882312942637, 'Side'),
 (0.026319430052935546, 'ShoppingMall'),
 (0.022589916001626756, 'FoodCourt'),
 (0.012897494008608956, 'Single'),
 (0.005074100068367045, 'VIP')]

Mini wrapper for feature selection

In [11]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def CVTest(nFolds = 5, randomState=2137, features=[]):
    
    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)

    testResults = []
    for train, test in kf.split(train_df.index.values):
        clf = RandomForestClassifier(random_state=randomState, n_jobs=-1)
        clf.fit(train_df.iloc[train][features], train_df.iloc[train][target])
        predsTrain = clf.predict_proba(train_df[features].iloc[train])[:,1]
        predsTest = clf.predict_proba(train_df[features].iloc[test])[:,1]
        testResults.append(roc_auc_score((train_df[target].iloc[test]), predsTest))
        
    return np.mean(testResults)

Let's check the AUC score for each feature (using it as the only feature in model).

In [12]:
imps2 = []
for var in features:
    res = CVTest(features=[var])
    imps2.append((res, var))

imps2.sort(reverse=True)
imps2

[(0.7225097528579976, 'Expenses'),
 (0.7186527712888691, 'CryoSleep'),
 (0.665826549546413, 'Spa'),
 (0.6645033825132827, 'RoomService'),
 (0.6580304815333666, 'VRDeck'),
 (0.6242630362931096, 'ShoppingMall'),
 (0.6111961739325478, 'FoodCourt'),
 (0.6059645266175268, 'Deck'),
 (0.5973598605323456, 'HomePlanet'),
 (0.5662141116832655, 'Group_count'),
 (0.5567414584177909, 'Single'),
 (0.5507408654828813, 'Side'),
 (0.5481184933262733, 'Destination'),
 (0.5471591767828248, 'Age'),
 (0.5056361768860744, 'VIP')]

Now let's try different approach. Let's remove each feature one at a time and compare model performance. 

In [13]:
imps3 = []
for var in features:
    res = CVTest(features=features)
    features2 = features.copy()
    features2.remove(var)
    res2 = CVTest(features=features2)
    imps3.append((res2 - res, var))

imps3.sort(reverse=True)
imps3

[(0.006614674857372771, 'Age'),
 (0.00321197697074882, 'Destination'),
 (0.0023501289744272125, 'Group_count'),
 (0.002191177705739955, 'Single'),
 (0.002167244214535291, 'VIP'),
 (-0.0005399954047931033, 'ShoppingMall'),
 (-0.0023132523564362018, 'CryoSleep'),
 (-0.0030793450243183917, 'RoomService'),
 (-0.0038204255699408884, 'HomePlanet'),
 (-0.004718025930257164, 'FoodCourt'),
 (-0.005949633758577821, 'Side'),
 (-0.007278244567115455, 'Spa'),
 (-0.007473872167478746, 'VRDeck'),
 (-0.013948730872628046, 'Deck'),
 (-0.015720449375329792, 'Expenses')]

In [14]:
features.remove('Age')

imps3 = []
for var in features:
    res = CVTest(features=features)
    features2 = features.copy()
    features2.remove(var)
    res2 = CVTest(features=features2)
    imps3.append((res2 - res, var))

imps3.sort(reverse=True)
imps3

[(0.005024875227978454, 'Destination'),
 (0.004619508872342637, 'Group_count'),
 (0.0033299033486683394, 'CryoSleep'),
 (0.0028686539202797956, 'Single'),
 (0.0021966619753566174, 'VIP'),
 (0.002051340324468298, 'ShoppingMall'),
 (-0.002202570682006355, 'HomePlanet'),
 (-0.003769763146116656, 'Side'),
 (-0.003924665947023009, 'FoodCourt'),
 (-0.004904138401199298, 'RoomService'),
 (-0.006272079721103108, 'VRDeck'),
 (-0.008283345849735335, 'Expenses'),
 (-0.009982341454551458, 'Spa'),
 (-0.013583767856148454, 'Deck')]

In [15]:
features.remove('Destination')

imps3 = []
for var in features:
    res = CVTest(features=features)
    features2 = features.copy()
    features2.remove(var)
    res2 = CVTest(features=features2)
    imps3.append((res2 - res, var))

imps3.sort(reverse=True)
imps3

[(0.0004088393076546737, 'VIP'),
 (0.0002084977065252014, 'Single'),
 (-0.0009873482950548684, 'ShoppingMall'),
 (-0.0015394384052778154, 'CryoSleep'),
 (-0.002375567554775193, 'Group_count'),
 (-0.005197183885751189, 'RoomService'),
 (-0.005566505504466357, 'HomePlanet'),
 (-0.006690248583770431, 'FoodCourt'),
 (-0.00695626659918569, 'VRDeck'),
 (-0.009651309324652302, 'Side'),
 (-0.010142987568016792, 'Expenses'),
 (-0.010348495823396475, 'Spa'),
 (-0.019325769104388102, 'Deck')]

In [16]:
features.remove('VIP')

imps3 = []
for var in features:
    res = CVTest(features=features)
    features2 = features.copy()
    features2.remove(var)
    res2 = CVTest(features=features2)
    imps3.append((res2 - res, var))

imps3.sort(reverse=True)
imps3

[(0.0016115495655496526, 'Single'),
 (-0.00043076120467322454, 'ShoppingMall'),
 (-0.0007056009141033215, 'Group_count'),
 (-0.001622389506535149, 'CryoSleep'),
 (-0.004443710614647989, 'HomePlanet'),
 (-0.005439668523254171, 'FoodCourt'),
 (-0.005983207298190352, 'VRDeck'),
 (-0.006191101280475841, 'RoomService'),
 (-0.009227488696688213, 'Expenses'),
 (-0.009580745021808612, 'Side'),
 (-0.010021047216399626, 'Spa'),
 (-0.017824180893210584, 'Deck')]

In [17]:
features.remove('Single')

imps3 = []
for var in features:
    res = CVTest(features=features)
    features2 = features.copy()
    features2.remove(var)
    res2 = CVTest(features=features2)
    imps3.append((res2 - res, var))

imps3.sort(reverse=True)
imps3

[(-0.0009382188312696549, 'ShoppingMall'),
 (-0.002045278531674466, 'CryoSleep'),
 (-0.005068703912727512, 'HomePlanet'),
 (-0.0055200219638603665, 'FoodCourt'),
 (-0.006020338523003543, 'VRDeck'),
 (-0.006169766859807746, 'RoomService'),
 (-0.00739273777355598, 'Group_count'),
 (-0.008866671521953973, 'Spa'),
 (-0.009401068658176026, 'Side'),
 (-0.010562184459950719, 'Expenses'),
 (-0.016867348255854342, 'Deck')]

Now dropping any variable is worsening the score. Let's finish feature selection here.

In [18]:
trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF()
print('Mean train AUC:', np.mean(trainAUCResults).round(3), '| Train AUC std:', np.std(trainAUCResults).round(5))
print('Mean test AUC:', np.mean(testAUCResults).round(3), '| Test AUC std:', np.std(testAUCResults).round(5))
print('Mean train ACC:', np.mean(trainACCResults).round(3), '| Train ACC std:', np.std(trainACCResults).round(5))
print('Mean test ACC:', np.mean(testACCResults).round(3), '| Test ACC std:', np.std(testACCResults).round(5))

Mean train AUC: 0.973 | Train AUC std: 0.0006
Mean test AUC: 0.803 | Test AUC std: 0.0128
Mean train ACC: 0.919 | Train ACC std: 0.00133
Mean test ACC: 0.729 | Test ACC std: 0.0121


In [19]:
results.append(['RF after feature selection', np.mean(testAUCResults).round(3), np.mean(testACCResults).round(3)])

# Hyperparameters tuning

In [20]:
# number of trees
for k in [10, 25, 50, 100, 200, 500, 1000]:
    trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF(n_estimators=k)
    print(k, np.mean(trainAUCResults), np.mean(testAUCResults))

10 0.9693162232577202 0.7901737893994935
25 0.9718800923584728 0.7991782952612765
50 0.972946913528177 0.8023410039769159
100 0.9734893771053954 0.8034354467799487
200 0.9736479219026537 0.8043322309858201
500 0.9737839295100772 0.8048135728336394
1000 0.9738142987219355 0.8047112420269962


In [21]:
# max depth 
for k in range(2,22,2):
    trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF(n_estimators=200, max_depth=k)
    print(k, np.mean(trainAUCResults), np.mean(testAUCResults))

2 0.804017880429065 0.8010924046308757
4 0.8229331888239495 0.8134897387551453
6 0.8408914091975911 0.8189888649680078
8 0.8711882273381326 0.8238737557445139
10 0.9050095381378362 0.8238236793151081
12 0.9339972294065145 0.8207730817261396
14 0.9531953790000735 0.8163404789510273
16 0.9647849924326858 0.8109546481161768
18 0.9706376300197348 0.8070314324427305
20 0.9727897993139829 0.804756552846591


In [22]:
# min samples split
for k in [2, 4, 6, 8, 10, 12]:
    trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF(n_estimators=200, 
                                                                                                      max_depth=8,
                                                                                                      min_samples_split=k)
    print(k, np.mean(trainAUCResults), np.mean(testAUCResults))

2 0.8711882273381326 0.8238737557445139
4 0.8695943654730396 0.8238815774578363
6 0.868113946500541 0.8239241640244247
8 0.8666357521991241 0.8234289800628826
10 0.8654235482080438 0.8238004789709754
12 0.8642124527816668 0.8235860915375122


In [23]:
# min samples leaf
for k in [1,2,3,4,5]:
    trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF(n_estimators=200, 
                                                                                                      max_depth=8,
                                                                                                      min_samples_split=6,
                                                                                                      min_samples_leaf=k)
    print(k, np.mean(trainAUCResults), np.mean(testAUCResults))

1 0.868113946500541 0.8239241640244247
2 0.8658648582119277 0.8231993479848025
3 0.8641841239771513 0.8229023590928815
4 0.8616880917249758 0.8231692826768487
5 0.8601117183890603 0.8228521521873307


In [24]:
# max features
for k in range(2, len(features) + 1,2):
    trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF(n_estimators=200, 
                                                                                                      max_depth=8,
                                                                                                      min_samples_split=6,
                                                                                                      min_samples_leaf=1,
                                                                                                      max_features = k)
    print(k, np.mean(trainAUCResults), np.mean(testAUCResults))

2 0.86347586035686 0.8217572797250176
4 0.8714855024588799 0.824978549797072
6 0.875809378489099 0.8259047676112725
8 0.877795944950045 0.826654915728302
10 0.8804241012982296 0.8269284287264889


In [25]:
trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF(n_estimators=500, 
                                                                                                      max_depth=8,
                                                                                                      min_samples_split=6,
                                                                                                      min_samples_leaf=1,
                                                                                                      max_features = 11)
print('Mean train AUC:', np.mean(trainAUCResults).round(3), '| Train AUC std:', np.std(trainAUCResults).round(5))
print('Mean test AUC:', np.mean(testAUCResults).round(3), '| Test AUC std:', np.std(testAUCResults).round(5))
print('Mean train ACC:', np.mean(trainACCResults).round(3), '| Train ACC std:', np.std(trainACCResults).round(5))
print('Mean test ACC:', np.mean(testACCResults).round(3), '| Test ACC std:', np.std(testACCResults).round(5))

Mean train AUC: 0.882 | Train AUC std: 0.00247
Mean test AUC: 0.827 | Test AUC std: 0.00576
Mean train ACC: 0.786 | Train ACC std: 0.00169
Mean test ACC: 0.753 | Test ACC std: 0.00743


In [26]:
results.append(['RF after 1st hyperparameters tuning', np.mean(testAUCResults).round(3), np.mean(testACCResults).round(3)])

Now let's try grid search.

from sklearn.model_selection import GridSearchCV

params = {
    'bootstrap': [True, False],
    'n_estimators' : [100],
    'max_depth': [8,9,10,11],
    'min_samples_split': [3,4,5,6,7],
    'min_samples_leaf': [1,2],
    'max_features': [9,10,11]    
}

clf = RandomForestClassifier()
gs = GridSearchCV(estimator = clf, param_grid = params, cv = 3, n_jobs = -1, verbose = 2)

gs.fit(train_df[features], train_df[target])

gs.best_params_

In [27]:
trainAUCResults, testAUCResults, trainACCResults, testACCResults, predictions, indices, models = CVTestRF(n_estimators=500,
                                                                                                             bootstrap = True,
                                                                                                             max_depth = 10,
                                                                                                             max_features = 10,
                                                                                                             min_samples_leaf = 1,
                                                                                                             min_samples_split = 7)
print('Mean train AUC:', np.mean(trainAUCResults).round(3), '| Train AUC std:', np.std(trainAUCResults).round(5))
print('Mean test AUC:', np.mean(testAUCResults).round(3), '| Test AUC std:', np.std(testAUCResults).round(5))
print('Mean train ACC:', np.mean(trainACCResults).round(3), '| Train ACC std:', np.std(trainACCResults).round(5))
print('Mean test ACC:', np.mean(testACCResults).round(3), '| Test ACC std:', np.std(testACCResults).round(5))    

Mean train AUC: 0.91 | Train AUC std: 0.00246
Mean test AUC: 0.827 | Test AUC std: 0.00668
Mean train ACC: 0.808 | Train ACC std: 0.00122
Mean test ACC: 0.753 | Test ACC std: 0.00853


In [28]:
results.append(['RF after grid searching hyperparameters ', np.mean(testAUCResults).round(3), np.mean(testACCResults).round(3)])

In [29]:
display(pd.DataFrame(data = results, columns = ['Model name', 'AUC score', 'Accuracy']))

Unnamed: 0,Model name,AUC score,Accuracy
0,RF raw,0.79,0.728
1,RF after feature selection,0.803,0.729
2,RF after 1st hyperparameters tuning,0.827,0.753
3,RF after grid searching hyperparameters,0.827,0.753


# Submission (random forests)

In [30]:
clf = RandomForestClassifier(n_estimators=500, max_depth=10, min_samples_split=7, min_samples_leaf=1, max_features = 10, 
                            bootstrap = True)
clf.fit(train_df[features], train_df[target])
preds = clf.predict_proba(train_df[features])[:,1]
roc_auc_score(train_df[target], preds)

0.9033995427469168

In [31]:
test_df['Transported'] = clf.predict(test_df[features])
test_df['Transported'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Transported'] = clf.predict(test_df[features])


0    2348
1    1929
Name: Transported, dtype: int64

In [32]:
submission_rf = test_df['Transported'].reset_index()
submission_rf['Transported'] = submission_rf['Transported'].replace({1:True, 0:False})
submission_rf.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False


In [33]:
#submission_rf.to_csv('submission_rf.csv', index = False)