# Titanic Prediction with Hyper Parameters

In [9]:
#Data visualisation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Machine Learning models
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier

## Aquciring the data

In [10]:
train = pd.read_csv("../input/titanic/train.csv", index_col ='PassengerId')
test = pd.read_csv("../input/titanic/test.csv", index_col ='PassengerId')

#### lets visualize our dataset

In [11]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Lets check for missing values

In [12]:
print((train.isnull().sum().sort_values(ascending=False)*100)/train.shape[0])
print((test.isnull().sum().sort_values(ascending=False)*100)/test.shape[0])

Cabin       77.104377
Age         19.865320
Embarked     0.224467
Survived     0.000000
Pclass       0.000000
Name         0.000000
Sex          0.000000
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Fare         0.000000
dtype: float64
Cabin       78.229665
Age         20.574163
Fare         0.239234
Pclass       0.000000
Name         0.000000
Sex          0.000000
SibSp        0.000000
Parch        0.000000
Ticket       0.000000
Embarked     0.000000
dtype: float64


*Ticket* and *Cabin* can be droped, we can extract further info from the name variable like the titles.

In [13]:
#Extracion of title
train['Title'] = train.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
test['Title'] = test.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train['Title'], train['Sex'])

Sex,female,male
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [14]:
combine_df = [train,test]
for dataset in combine_df:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Other')

    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
    
train[['Title', 'Survived']].groupby(['Title'], as_index=False).count()

Unnamed: 0,Title,Survived
0,Master,40
1,Miss,185
2,Mr,517
3,Mrs,126
4,Other,23


Now we can safely drop the variables

In [15]:
train = train.drop(["Name", "Ticket", "Cabin"], axis=1)
test = test.drop(["Name", "Ticket", "Cabin"], axis=1)

The two variables **SibSp** and **Parch** can be joined into one variable labeled **Family**

In [16]:
train['Family'] = np.where(train['SibSp'] + train['Parch'] > 0, 1, 0)
test['Family'] = np.where(test['SibSp'] + test['Parch'] > 0, 1, 0)
train.drop(['SibSp', 'Parch'], axis= 1, inplace= True)
test.drop(['SibSp', 'Parch'], axis= 1, inplace= True)

In [17]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Fare,Embarked,Title,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0,3,male,22.0,7.25,S,Mr,1
2,1,1,female,38.0,71.2833,C,Mrs,1
3,1,3,female,26.0,7.925,S,Miss,0
4,1,1,female,35.0,53.1,S,Mrs,1
5,0,3,male,35.0,8.05,S,Mr,0


### Lets impute the missing values

In [18]:
#Variable Age imputation
train['Age'] = train.Age.fillna(train.Age.mean())
test['Age'] = test.Age.fillna(test.Age.mean())

In [19]:
#Variable Fare Imputation
test['Fare'].fillna(test['Fare'].median(), inplace= True)

In [20]:
#Variable Embarked imputation
train['Embarked'].fillna('S', inplace= True)
test['Embarked'].fillna('S', inplace= True)

#### Our model cannot take into account categorical variables as such we need to transform the categorical variables into quantitatives.

In [21]:
le = LabelEncoder()
train.Sex = le.fit_transform(train['Sex'])
train.Embarked = le.fit_transform(train['Embarked'])
train.Title = le.fit_transform(train['Title'])

In [22]:
test.Sex = le.fit_transform(test['Sex'])
test.Embarked = le.fit_transform(test['Embarked'])
test.Title = le.fit_transform(test['Title'])

### Scaling

In [23]:
sc = StandardScaler()
train[['Age', 'Fare']] = sc.fit_transform(train[['Age', 'Fare']])
test[['Age', 'Fare']] = sc.fit_transform(test[['Age', 'Fare']])

In [24]:
print(test.head())
print('-'*20)
print(train.head())

             Pclass  Sex       Age      Fare  Embarked  Title  Family
PassengerId                                                          
892               3    1  0.334993 -0.497413         1      2       0
893               3    0  1.325530 -0.512278         2      3       1
894               2    1  2.514175 -0.464100         1      2       0
895               3    1 -0.259330 -0.482475         2      2       0
896               3    0 -0.655545 -0.417492         2      3       1
--------------------
             Survived  Pclass  Sex       Age      Fare  Embarked  Title  \
PassengerId                                                               
1                   0       3    1 -0.592481 -0.502445         2      2   
2                   1       1    0  0.638789  0.786845         0      3   
3                   1       3    0 -0.284663 -0.488854         2      1   
4                   1       1    0  0.407926  0.420730         2      3   
5                   0       3    1  0.4

Lets split our data, bring out our target variable and the independent variables.

In [25]:
X = train.drop("Survived", axis=1)
y = train["Survived"]

## Lets go with the **Hyper Parameters**

In [26]:
# Create the grid
rf_grid = {'n_estimators': [100,150,200,250,300,350,400],
               'max_depth': [1,2,3,4,5,6,7,8],
}

In [27]:
rf_base = RandomForestClassifier()
rf_random = GridSearchCV(estimator = rf_base, param_grid= rf_grid, cv = 5)

In [28]:
rf_random.fit(X, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8],
                         'n_estimators': [100, 150, 200, 250, 300, 350, 400]})

In [29]:
rf_random.best_score_

0.8305191136777352

## We can go on and make our prediction

In [30]:
Survived = rf_random.predict(test)

In [31]:
data = pd.DataFrame({"PassengerId":test.index, "Survived": Survived})

In [33]:
data.to_csv("submission.csv", index = False)