In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
def preprocessing(proc_df):
    
    feature_cols = proc_df.columns.to_list()
    
    try:
        feature_cols.remove('Survived')
    except ValueError:
        pass
    
    feature_cols.remove('Name')
    feature_cols.remove('Ticket')
    feature_cols.remove('PassengerId')
    
    proc_df = proc_df[feature_cols]
    
    # Convert Cabin to binary
    proc_df['Cabin'] = proc_df['Cabin'].apply(lambda x: 0 if pd.isnull(x) else 1)
    
    # Chane categorical values to numerical values
    le = LabelEncoder()
    proc_df[['Sex', 'Embarked']] = proc_df[['Sex', 'Embarked']].apply(lambda col: le.fit_transform(col.to_list()))
    
    # Handle missing values
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    
    imputer.fit(proc_df['Age'].values.reshape(-1, 1))
    proc_df['Age'] = imputer.transform(proc_df['Age'].values.reshape(-1, 1))
    
    imputer.fit(proc_df['Fare'].values.reshape(-1, 1))
    proc_df['Fare'] = imputer.transform(proc_df['Fare'].values.reshape(-1, 1))
    
    # instantiate OneHotEncoder
    ohe = OneHotEncoder(sparse=False)

    one_hot_encode_df = proc_df[['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked']]
    one_hot_encoded_arr = ohe.fit_transform(one_hot_encode_df) # It returns an numpy array
    column_names = ohe.get_feature_names(['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked'])
    one_hot_encoded_frame =  pd.DataFrame(one_hot_encoded_arr, columns=column_names.tolist())
    
    proc_df = pd.concat([one_hot_encoded_frame, proc_df[['Age', 'Fare']]], axis=1)
    
    return proc_df

In [8]:
y = df['Survived']
x = preprocessing(df)
x.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pand

Unnamed: 0,Pclass_1.0,Pclass_2.0,Pclass_3.0,Sex_0.0,Sex_1.0,SibSp_0.0,SibSp_1.0,SibSp_2.0,SibSp_3.0,SibSp_4.0,...,Parch_5.0,Parch_6.0,Cabin_0.0,Cabin_1.0,Embarked_0.0,Embarked_1.0,Embarked_2.0,Embarked_3.0,Age,Fare
0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,22.0,7.25
1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,38.0,71.2833
2,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,26.0,7.925
3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,35.0,53.1
4,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,35.0,8.05


In [31]:
# Run the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, criterion='gini')
rf.fit(x, y)

print(rf.score(x, y))

0.9854096520763187


In [25]:
# Now lets predict for test csv given to us
x_test = pd.read_csv('test.csv')

passenger_id = x_test['PassengerId']
x_test = preprocessing(x_test)

x_test.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pand

Unnamed: 0,Pclass_1.0,Pclass_2.0,Pclass_3.0,Sex_0.0,Sex_1.0,SibSp_0.0,SibSp_1.0,SibSp_2.0,SibSp_3.0,SibSp_4.0,...,Parch_5.0,Parch_6.0,Parch_9.0,Cabin_0.0,Cabin_1.0,Embarked_0.0,Embarked_1.0,Embarked_2.0,Age,Fare
0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,34.5,7.8292
1,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,47.0,7.0
2,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,62.0,9.6875
3,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,27.0,8.6625
4,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,22.0,12.2875


In [26]:
final_df = pd.concat([passenger_id, pd.DataFrame(rf.predict(x_test))], axis=1)
final_df.columns = ['PassengerId', 'Survived']
final_df.to_csv('submission.csv', index=False)

final_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [33]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 800, num = 8)]

# criterion
# criterion = ['gini', 'entropy']

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(50, 300, num = 6)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
#                'criterion': criterion,
#                'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800], 'max_depth': [50, 100, 150, 200, 250, 300, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [34]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rfclf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = GridSearchCV(estimator=rfclf, param_grid=random_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=10)

# Fit the random search model
rf_random.fit(x, y)

Fitting 5 folds for each of 1008 candidates, totalling 5040 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done  53 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:   30.1s
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done 105 tasks      | elapsed:   37.2s
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   54.6s
[Parallel(n_jobs=-1)]: Done 173 tasks      | elapsed:   

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [None]:
rf_random.best_score_

In [28]:
final_df_grid = pd.concat([passenger_id, pd.DataFrame(rf_random.predict(x_test))], axis=1)
final_df_grid.columns = ['PassengerId', 'Survived']
final_df_grid.to_csv('submission_grid.csv', index=False)

final_df_grid.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
