In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
def preprocessing(proc_df):
    
    feature_cols = proc_df.columns.to_list()
    
    try:
        feature_cols.remove('Survived')
    except ValueError:
        pass
    
    feature_cols.remove('Name')
    feature_cols.remove('Ticket')
    feature_cols.remove('PassengerId')
    
    proc_df = proc_df[feature_cols]
    
    # Convert Cabin to binary
    proc_df['Cabin'] = proc_df['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)
    
    # Chane categorical values to numerical values
    le = LabelEncoder()
    proc_df[['Sex', 'Embarked']] = proc_df[['Sex', 'Embarked']].apply(lambda col: le.fit_transform(col.to_list()))
    
    # Handle missing values
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    
    imputer.fit(proc_df['Age'].values.reshape(-1, 1))
    proc_df['Age'] = imputer.transform(proc_df['Age'].values.reshape(-1, 1))
    
    imputer.fit(proc_df['Fare'].values.reshape(-1, 1))
    proc_df['Fare'] = imputer.transform(proc_df['Fare'].values.reshape(-1, 1))
    
    return proc_df

In [5]:
y = df['Survived']
x = preprocessing(df)
x.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pand

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.0,1,0,7.25,0,2
1,1,0,38.0,1,0,71.2833,1,0
2,3,0,26.0,0,0,7.925,0,2
3,1,0,35.0,1,0,53.1,1,2
4,3,1,35.0,0,0,8.05,0,2


In [6]:
# Run the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x, y)

print(rf.score(x, y))

0.9854096520763187


In [12]:
# Now lets predict for test csv given to us
x_test = pd.read_csv('test.csv')

passenger_id = x_test['PassengerId']
x_test = preprocessing(x_test)

x_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pand

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,34.5,0,0,7.8292,0,1
1,3,0,47.0,1,0,7.0,0,2
2,2,1,62.0,0,0,9.6875,0,1
3,3,1,27.0,0,0,8.6625,0,2
4,3,0,22.0,1,1,12.2875,0,2


In [32]:
final_df = pd.concat([passenger_id, pd.DataFrame(rf.predict(x_test))], axis=1)
final_df.columns = ['PassengerId', 'Survived']
final_df.to_csv('submission.csv', index=False)

final_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0


In [7]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(3, 50, num = 8)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 9, 16, 23, 29, 36, 43, 50, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [9]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfclf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = GridSearchCV(estimator=rfclf, param_grid=random_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=10)

# Fit the random search model
rf_random.fit(x, y)

Fitting 3 folds for each of 3240 candidates, totalling 9720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  20 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done  49 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  60 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  73 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done 101 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 150 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 169 tasks      | elapsed:   

GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,...
                                              warm_start=False),
             iid='deprecated', n_jobs=-1,
    

In [11]:
rf_random.best_score_

0.8282828282828283

In [14]:
final_df_grid = pd.concat([passenger_id, pd.DataFrame(rf_random.predict(x_test))], axis=1)
final_df_grid.columns = ['PassengerId', 'Survived']
final_df_grid.to_csv('submission_grid.csv', index=False)

final_df_grid.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
