In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [6]:
train_path = '../../preprocess_train_dataset/pre06_train.csv'
test_path = '../../preprocess_test_dataset/pre06_test.csv'
pid_path = '../../spaceship-titanic_rawData/sample_submission.csv'

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
pid = pd.read_csv(pid_path)

train_x = train.drop('Transported', axis=1)
train_y = train['Transported']
sub_pid = pid['PassengerId']

train_x

Unnamed: 0,CryoSleep,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Cabin_deck_A,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,...,Cabin_side_S,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,spending_Categ_Low,spending_Categ_Medium,spending_Categ_High
0,False,False,True,False,False,True,False,False,False,False,...,False,39.0,0.0,0.0,0.0,0.0,0.0,True,False,False
1,False,True,False,False,False,False,False,False,False,True,...,True,24.0,109.0,9.0,25.0,549.0,44.0,False,True,False
2,False,False,True,False,True,False,False,False,False,False,...,True,58.0,43.0,3576.0,0.0,6715.0,49.0,False,False,True
3,False,False,True,False,True,False,False,False,False,False,...,True,33.0,0.0,1283.0,371.0,3329.0,193.0,False,False,True
4,False,True,False,False,False,False,False,False,False,True,...,True,16.0,303.0,70.0,151.0,565.0,2.0,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,False,False,True,False,True,False,False,False,False,False,...,False,41.0,0.0,6819.0,0.0,1643.0,74.0,False,False,True
8689,True,True,False,False,False,False,False,False,False,False,...,True,18.0,0.0,0.0,0.0,0.0,0.0,True,False,False
8690,False,True,False,False,False,False,False,False,False,False,...,True,26.0,0.0,0.0,1872.0,1.0,0.0,False,True,False
8691,False,False,True,False,False,False,False,False,True,False,...,True,32.0,0.0,1049.0,0.0,353.0,3235.0,False,False,True


In [7]:
kFold = KFold(n_splits=10)
rf = RandomForestClassifier()

score = cross_val_score(rf, train_x, train_y, cv = kFold)

print('Cross Validation Score: ', score)

Cross Validation Score:  [0.79770115 0.76896552 0.78965517 0.79516686 0.8009206  0.81472957
 0.82393556 0.8296893  0.80782509 0.79286536]


In [4]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth' : [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5)

grid_search.fit(train_x, train_y)

print(grid_search.best_score_)
print(grid_search.best_params_)

0.80237249957153
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 15, 'n_estimators': 100}


In [10]:
# submission
rf = RandomForestClassifier(n_estimators=200, max_depth=12, min_samples_split=15, min_samples_leaf=5, bootstrap=True, criterion='entropy')

rf.fit(train_x, train_y)
pred = rf.predict(test)

rf_submission = pd.DataFrame({'PassengerId': sub_pid, 'Transported': pred})

rf_submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [11]:
rf_submission.to_csv('../../output_prediction/rf_pre06_submission.csv', index=False, index_label=False)