# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from numpy import mean
from numpy import absolute
from numpy import sqrt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score,mean_squared_error,r2_score

# Importing Datasets

In [3]:
train=pd.read_csv("train_dataset.csv")

In [4]:
test=pd.read_csv("test_dataset.csv")

# Extracting Data

In [5]:
train=train.set_index('Unnamed: 0')
X_train=train.iloc[:,0:9]
y_train=train.Survived

In [6]:
test=test.set_index('Unnamed: 0')
X_test=test.iloc[:,0:9]
y_test=test.Survived

# Taking a look at the data

In [7]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(534, 9)
(534,)
(179, 9)
(179,)


In [8]:
X_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Fare,Embarked,FamilyCount,Title,Has Cabin,IsAlone
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
711,1,0,29.699118,26.55,0,0,1,1,0
466,2,0,29.699118,0.0,0,0,1,0,0
213,2,0,30.0,13.0,0,0,1,0,0
496,1,1,54.0,78.2667,2,1,2,1,1
583,1,0,36.0,40.125,2,0,1,1,0


In [9]:
y_train.head()

Unnamed: 0
711    0
466    0
213    0
496    1
583    0
Name: Survived, dtype: int64

# RandomForest Classifier Model

In [10]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred=rf.predict(X_test)
acc_rf= accuracy_score(y_test,y_pred)
acc_rf

0.8212290502793296

# Hyperparameter Tuning to improve Accuracy

#### RandomizedSearchCV

In [12]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [14]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [16]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 70,
 'bootstrap': True}

In [17]:
rf_random.best_score_

0.8520599250936329

#### GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestClassifier()

In [19]:
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [21]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 100,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 200}

In [22]:
grid_search.best_score_

0.8576779026217228

In [24]:
rf = grid_search.best_estimator_

# Cross validating the model

In [25]:
#define cross-validation method to use
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [26]:
scores = cross_val_score(rf, X_train, y_train, scoring='neg_mean_absolute_error',cv=cv, n_jobs=-1)

#view mean absolute error
mean(absolute(scores))

0.15702306079664569

In [None]:
#define cross-validation method to use
cv = KFold(n_splits=5, random_state=1, shuffle=True) 

#use LOOCV to evaluate model
scores = cross_val_score(rf, X_train, y_train, scoring='neg_mean_squared_error',cv=cv, n_jobs=-1)

#view RMSE
sqrt(mean(absolute(scores)))

0.43278612094847574

Both MAE and RMSE are quite low meaning our model's performane is quite good

# Pickle the model

In [None]:
# save the model to disc
import pickle
filename = "RandomForestClassifier model.pkl"
pickle.dump(rf,open(filename,'wb'))