In [52]:
import pandas as pd
import numpy as np
import matplotlib as mpl
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from plotnine import *
from sklearn import metrics
from sklearn import svm
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import BaggingClassifier



# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

Counting nulls in each column

In [53]:
# Per-column count of missing values in the training frame.
train.isna().sum(axis = 0)

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [54]:
# Per-column count of missing values in the test frame.
test.isna().sum(axis = 0)

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

Creating dummy variables, creating X_train, X_test, and Y_train

In [55]:
# One-hot encode the two categorical columns in both splits with identical
# settings; every other column is passed through unchanged.
X_train, X_test = (
    pd.get_dummies(frame, columns = ['HomePlanet','Destination'])
    for frame in (train, test)
)

In [56]:
# Target vector: the 'Transported' column of the raw training frame.
Y_train = train.loc[:, 'Transported']

Removing unwanted columns

In [57]:
# Remove columns that are not used as features. 'Transported' is the target,
# so it is removed from the training features only.
X_train = X_train.drop(columns = ['Name','Cabin','Transported'])
X_test = X_test.drop(columns = ['Name','Cabin'])

In [58]:
# Display the feature column names that remain after the drop above.
X_train.columns

Index(['PassengerId', 'CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'HomePlanet_Earth',
       'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e',
       'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e'],
      dtype='object')

Using imputer to replace nulls

In [59]:
# Learn the imputation model from the training features only, then apply that
# same fitted model to the test features. Refitting on the test set (as
# fit_transform would) imputes the two splits with different models, so the
# test features would no longer be preprocessed the way the classifier's
# training data was.
imputer = IterativeImputer()
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

Converting to dataframes

In [60]:
# The imputation step replaced X_train with the imputer's output; wrap it in a
# DataFrame again and re-attach the feature names in their original order.
X_train = pd.DataFrame(
    data = X_train,
    columns = [
        'PassengerId', 'CryoSleep', 'Age', 'VIP',
        'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
        'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
        'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
    ],
)

In [61]:
# The imputation step replaced X_test with the imputer's output; wrap it in a
# DataFrame again and re-attach the feature names in their original order.
X_test = pd.DataFrame(
    data = X_test,
    columns = [
        'PassengerId', 'CryoSleep', 'Age', 'VIP',
        'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
        'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
        'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e',
    ],
)

Standardizing continuous variables

In [62]:
# Standardize the continuous features. The scaler is fitted on the training
# rows and kept in `z` so that later cells can apply the same scaling.
continuous_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
z = StandardScaler().fit(X_train[continuous_cols])
X_train[continuous_cols] = z.transform(X_train[continuous_cols])


In [63]:
# Apply the already-fitted scaler `z` to the test features; this cell only
# transforms and does not refit.
continuous_cols = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
X_test[continuous_cols] = z.transform(X_test[continuous_cols])

Creating a random forest (maximum depth of 9 chosen by trial-and-error tuning)

In [127]:
# Random forest limited to depth 9. A fixed random_state makes the bootstrap
# samples and feature sub-sampling repeatable, so re-running the notebook
# produces the same model and the same submission file.
forest = RandomForestClassifier(max_depth = 9, random_state = 42)
forest.fit(X_train, Y_train)

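Evaluating on the training set (accuracy measured on the same data the model was fitted to is an optimistic estimate of performance on unseen data)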

In [130]:
# Mean accuracy of the fitted forest on the same rows it was trained on.
forest.score(X_train, Y_train)

0.8331991257333486

Creating predictions dataframe

In [133]:
# Submission table: passenger ids from the raw test frame paired with the
# forest's predictions for the preprocessed test features.
outcomes = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': forest.predict(X_test),
})

In [134]:
# Sanity check: (rows, columns) of the submission table before writing it out.
outcomes.shape

(4277, 2)

In [135]:
# Write the submission file without the DataFrame index column.
outcomes.to_csv(path_or_buf = 'Forest_Titanic_Submissions.csv', index = False)