# Survival Prediction with Random Forest Score: 79.18%

In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

# Import Data

In [2]:
# Read the training set; PassengerId becomes the row index rather than a feature column.
df = pd.read_csv('../input/titanic/train.csv', index_col='PassengerId')
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.shape

(891, 11)

In [5]:
df.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

### Processing

In [6]:
def processing(data):
    '''
    Clean a raw Titanic passenger frame and add the engineered columns.

    All steps are applied to ``data`` itself (the frame is modified in place)
    and the same object is returned:

    * Title    - extracted from ``Name`` and collapsed into Mr / Mrs / Miss
                 (titles not listed in the mapping, e.g. Master, are kept).
    * Age      - missing values are filled with the mean age of the
                 passenger's (Pclass, Title) group.
    * Embarked - missing values are filled with 'S'.
    * Cabin    - replaced by a flag: 0 when the cabin starts with one of the
                 listed deck letters, 1 otherwise (missing cabins included).
    * Name and Ticket are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns Name, Pclass, Age, Embarked, Cabin and Ticket.

    Returns
    -------
    pandas.DataFrame
        ``data`` after the transformations above.
    '''
    # Title: the word directly followed by a period in the name, e.g.
    # "Braund, Mr. Owen Harris" -> "Mr". Names without such a word get NaN.
    title_groups = {
        'Mr': ['Major', 'Sir', 'Jonkheer', 'Dr', 'Col', 'Don', 'Capt', 'Rev'],
        'Mrs': ['Ms', 'Lady', 'Countess', 'Dona'],
        'Miss': ['Mme', 'Mlle'],
    }
    title_map = {
        rare: common
        for common, rares in title_groups.items()
        for rare in rares
    }
    extracted = data['Name'].str.extract(r' ([a-zA-Z]+)\.', expand=False)
    data['Title'] = extracted.replace(title_map)

    # Age: fill gaps with the mean age of the same (Pclass, Title) group.
    # transform() returns a Series aligned with data's own index, so the
    # result can be used row by row regardless of the pandas version.
    group_mean_age = data.groupby(['Pclass', 'Title'])['Age'].transform('mean')
    data['Age'] = data['Age'].fillna(group_mean_age)

    # Embarked: fill gaps with 'S' (noted in the notebook as the most common value).
    data['Embarked'] = data['Embarked'].fillna('S')

    # Cabin: keep only whether a recognised deck letter is present.
    # This must operate on `data`, not on a notebook-level frame, so that the
    # function works for any frame passed in (train and test alike).
    known_decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']
    deck_letter = data['Cabin'].fillna('N').str[0]
    data['Cabin'] = (~deck_letter.isin(known_decks)).astype(int)

    # Drop free-text columns that are not used after Title extraction.
    data.drop(columns=['Name', 'Ticket'], inplace=True)

    return data

In [7]:
# Run the feature engineering on the training frame. processing() modifies the
# frame it is given and drops Name/Ticket, so this cell cannot be re-run
# without first reloading train.csv (the Title step needs the Name column).
df = processing(df)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,male,22.0,1,0,7.25,S,Mr
2,1,1,female,38.0,1,0,71.2833,C,Mrs
3,1,3,female,26.0,0,0,7.925,S,Miss
4,1,1,female,35.0,1,0,53.1,S,Mrs
5,0,3,male,35.0,0,0,8.05,S,Mr


# Dataset Splitting

In [8]:
# Target vector and predictor matrix.
y = df['Survived']
X = df.drop(columns=['Survived'])

# 80/20 split, stratified on the target so both parts keep the same
# survival ratio; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 8), (179, 8), (712,), (179,))

# Preprocessor

In [9]:
# Numeric branch: mean-impute, then expand with polynomial terms
# (degree and interaction_only are supplied by the parameter grid).
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('poly', PolynomialFeatures()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# NOTE(review): 'SibSp' is routed to both branches, while 'Age' and 'Cabin'
# are selected by neither, so they presumably never reach the model
# (ColumnTransformer's `remainder` is left at its default) - confirm intended.
preprocessor = ColumnTransformer(transformers=[
    ('numeric', numerical_pipeline, ['SibSp', 'Parch', 'Fare']),
    ('categoric', categorical_pipeline, ['Pclass', 'Sex', 'SibSp', 'Embarked', 'Title']),
])

# Pipeline

In [10]:
# Full model: column preprocessing followed by a random forest.
pipeline = Pipeline(steps=[
    ('prep', preprocessor),
    ('algo', RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# Grid handed to GridSearchCV. Keys follow the <step>__<param> convention.
# Every list holds a single value, so the search evaluates one candidate.
parameter = dict(
    prep__numeric__poly__degree=[3],
    prep__numeric__poly__interaction_only=[True],
    algo__max_depth=[48],
    algo__max_features=[0.5768062890209851],
    algo__min_samples_leaf=[10],
    algo__n_estimators=[100],
)

# Tuning

In [11]:
# 3-fold cross-validated search over the parameter grid, fitted on the
# training split only.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
model.fit(X_train, y_train)

# Accuracy of the refitted best estimator on the data it was trained on and
# on the held-out split.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)

print(f'''
Best params : {model.best_params_}
Best score  : {model.best_score_}
Train score : {train_score}
Test score  : {test_score}
''')

Fitting 3 folds for each of 1 candidates, totalling 3 fits

Best params : {'algo__max_depth': 48, 'algo__max_features': 0.5768062890209851, 'algo__min_samples_leaf': 10, 'algo__n_estimators': 100, 'prep__numeric__poly__degree': 3, 'prep__numeric__poly__interaction_only': True}
Best score  : 0.8216383599853444
Train score : 0.8525280898876404
Test score  : 0.8435754189944135



# Submission

In [12]:
# Load the competition test set (indexed by PassengerId) and run it through the
# same processing() used for the training data, so the fitted pipeline
# receives the columns it was trained on.
pred = pd.read_csv('../input/titanic/test.csv', index_col='PassengerId')
pred = processing(pred)
pred.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
892,3,male,34.5,0,0,7.8292,Q,Mr
893,3,female,47.0,1,0,7.0,S,Mrs
894,2,male,62.0,0,0,9.6875,Q,Mr
895,3,male,27.0,0,0,8.6625,S,Mr
896,3,female,22.0,1,1,12.2875,S,Mrs


In [13]:
# Build the submission straight from the processed test frame: each prediction
# is paired with the PassengerId of the row it was computed from, instead of
# being pasted positionally into gender_submission.csv and relying on that
# file listing the passengers in the same order.
submission = pd.DataFrame({
    'PassengerId': pred.index,
    'Survived': model.predict(pred),
})

In [14]:
# Write the submission file; index=False keeps pandas' row index out of the
# CSV so only the frame's own columns are written.
submission.to_csv('submission.csv', index=False)

In [15]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
