In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [2]:
train_path = '../../preprocess_train_dataset/pre05_train.csv'
test_path = '../../preprocess_test_dataset/pre05_test.csv'
pid_path = '../../spaceship-titanic_rawData/sample_submission.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
pid = pd.read_csv(pid_path)

train_x = train.drop('Transported', axis=1)
train_y = train['Transported']
sub_pid = pid['PassengerId']

train_x

Unnamed: 0,CryoSleep,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_P,Cabin_side_S,Age,spending_Categ_Low,spending_Categ_Medium,spending_Categ_High
0,False,False,True,False,False,True,False,False,False,False,False,False,True,False,39.0,True,False,False
1,False,True,False,False,False,False,False,False,False,True,False,False,False,True,24.0,False,True,False
2,False,False,True,False,True,False,False,False,False,False,False,False,False,True,58.0,False,False,True
3,False,False,True,False,True,False,False,False,False,False,False,False,False,True,33.0,False,False,True
4,False,True,False,False,False,False,False,False,False,True,False,False,False,True,16.0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,False,True,False,True,False,False,False,False,False,False,False,True,False,41.0,False,False,True
8689,True,True,False,False,False,False,False,False,False,False,True,False,False,True,18.0,True,False,False
8690,False,True,False,False,False,False,False,False,False,False,True,False,False,True,26.0,False,True,False
8691,False,False,True,False,False,False,False,False,True,False,False,False,False,True,32.0,False,False,True


In [6]:
kFold = KFold(n_splits=10)
rf = RandomForestClassifier()

score = cross_val_score(rf, train_x, train_y, cv = kFold)

print('Cross Validation Score: ', score)

Cross Validation Score:  [0.72528736 0.66666667 0.68505747 0.716916   0.67663982 0.69850403
 0.68124281 0.72036824 0.69850403 0.71001151]


In [7]:
# find the best hyperparameters by GridSearchCV

param_grid = {
    'n_estimators': [100],
    'max_depth' : [3, 5, 6, 7, 9, 11],
    'min_samples_split': [2, 5, 7],
    'min_samples_leaf': [1, 2, 5],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

grid_search.fit(train_x, train_y)

print(grid_search.best_score_)
print(grid_search.best_params_)

0.7481900699514489
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 9, 'min_samples_leaf': 5, 'min_samples_split': 7, 'n_estimators': 100}


In [10]:
# find the best hyperparameters by GridSearchCV

param_grid = {
    'n_estimators': [100, 200, 350],
    'max_depth' : [7, 9, 10, 11, 12],
    'min_samples_split': [5, 6, 7, 9],
    'min_samples_leaf': [5, 6, 7, 9],
    'bootstrap': [True],
    'criterion': ['gini']
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

grid_search.fit(train_x, train_y)

print(grid_search.best_score_)
print(grid_search.best_params_)

0.7485353605202784
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 11, 'min_samples_leaf': 9, 'min_samples_split': 9, 'n_estimators': 100}


In [11]:
# submission
# rf = RandomForestClassifier(n_estimators=200, max_depth=12, min_samples_split=15, min_samples_leaf=5, bootstrap=True, criterion='entropy')
# rf.fit(train_x, train_y)

best_model = grid_search.best_estimator_
best_model.fit(train_x, train_y)
pred = best_model.predict(test)

rf_submission = pd.DataFrame({'PassengerId': sub_pid, 'Transported': pred})

rf_submission.isna().sum()

PassengerId    0
Transported    0
dtype: int64

In [12]:
rf_submission.to_csv('../../output_prediction/rf02_pre05_submission.csv', index=False, index_label='PassengerId')