In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [2]:
train_path = '../../preprocess_train_dataset/normalized_train_data.csv'
test_path = '../../preprocess_test_dataset/normalized_test_data.csv'
pid_path = '../../spaceship-titanic_rawData/sample_submission.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
pid = pd.read_csv(pid_path)

train_x = train.drop('Transported', axis=1)
train_y = train['Transported']
sub_pid = pid['PassengerId']

train_x

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,...,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,False,True,False,False,False,True,False,False,False,False,...,False,False,True,0.493671,False,0.000000,0.000000,0.000000,0.000000,0.000000
1,True,False,False,False,False,False,False,False,False,True,...,False,False,True,0.303797,False,0.007608,0.000302,0.001064,0.024500,0.001823
2,False,True,False,False,True,False,False,False,False,False,...,False,False,True,0.734177,True,0.003001,0.119948,0.000000,0.299670,0.002030
3,False,True,False,False,True,False,False,False,False,False,...,False,False,True,0.417722,False,0.000000,0.043035,0.015793,0.148563,0.007997
4,True,False,False,False,False,False,False,False,False,True,...,False,False,True,0.202532,False,0.021149,0.002348,0.006428,0.025214,0.000083
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,True,False,False,True,False,False,False,False,False,...,True,False,False,0.518987,True,0.000000,0.228726,0.000000,0.073322,0.003066
8689,True,False,False,True,False,False,False,False,False,False,...,False,True,False,0.227848,False,0.000000,0.000000,0.000000,0.000000,0.000000
8690,True,False,False,False,False,False,False,False,False,False,...,False,False,True,0.329114,False,0.000000,0.000000,0.079687,0.000045,0.000000
8691,False,True,False,False,False,False,False,False,True,False,...,True,False,False,0.405063,False,0.000000,0.035186,0.000000,0.015753,0.134049


In [3]:
train_y

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool

In [4]:
kFold = KFold(n_splits=10)
rf = RandomForestClassifier()

score = cross_val_score(rf, train_x, train_y, cv = kFold)

print('Cross Validation Score: ', score)

Cross Validation Score:  [0.81034483 0.77356322 0.79195402 0.79171461 0.7986191  0.82163406
 0.82623705 0.80322209 0.82278481 0.7721519 ]


In [5]:
# find the best hyperparameters by GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth' : [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

grid_search.fit(train_x, train_y)

print(grid_search.best_score_)
print(grid_search.best_params_)

0.8023730951282797
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 12, 'min_samples_leaf': 5, 'min_samples_split': 15, 'n_estimators': 200}


In [10]:
# submission
rf = RandomForestClassifier(n_estimators=200, max_depth=12, min_samples_split=15, min_samples_leaf=5, bootstrap=True, criterion='entropy')

rf.fit(train_x, train_y)
pred = rf.predict(test)

rf_submission = pd.DataFrame({'PassengerId': sub_pid, 'Transported': pred})

rf_submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [11]:
rf_submission.to_csv('../../output_prediction/rf_submission.csv', index=False, index_label=False)