In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook, we will try to predict who survived the Titanic disaster. Let's import the necessary libraries.

In [None]:
import sys
import matplotlib 
import scipy as sp 
import IPython
from IPython import display 
import sklearn 
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [None]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style first, then seaborn's 'white' style.
mpl.style.use('ggplot')
sns.set_style('white')
# Default figure size as (width, height).
pylab.rcParams['figure.figsize'] = 12,8

Let's look at the data now.

In [None]:
# Read the training and test splits of the Kaggle Titanic dataset.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/titanic/train.csv",
        "/kaggle/input/titanic/test.csv",
    )
)

# Preview the first rows of the test split.
test_data.head()

In [None]:
# Dimensions of each split, followed by the non-null count of every column.
(
    train_data.shape,
    test_data.shape,
    train_data.count(),
    test_data.count(),
)

We notice we have a mix of numerical and categorical data. We also have a lot of missing values, especially in the "Cabin" column (well over half of its values are missing), so we will not use it. For the other columns with missing values, we can use fillna() to fill them in. Furthermore, we will leave out the columns "Ticket" (we won't attempt to extract any information from it) and "PassengerId" (as it does not provide additional information on the passengers).

In [None]:
data = [train_data, test_data]

# Impute missing values column by column. Each frame is filled with its own
# statistics (train and test are imputed independently of each other).
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on the selected column: in-place changes to a
# column selection are deprecated chained assignment and do not reach the
# parent frame once pandas Copy-on-Write is active.
for dataset in data:
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    dataset['Fare'] = dataset['Fare'].fillna(dataset['Fare'].median())

# Columns that are not used as model features. They are only listed here;
# nothing is removed from the frames in this cell.
columns_to_drop = ["PassengerId", "Cabin", "Ticket"]
test_data.head()

Now we will build new columns that could help us with the prediction. We will also get the title of each person from the "Name" column, which is otherwise not useful.

In [None]:
# Derive extra features on both frames.
for dataset in data:
    # Passenger plus siblings/spouses plus parents/children aboard.
    dataset["Family_size"] = dataset["SibSp"] + dataset["Parch"] + 1

    # 1 when travelling alone, 0 otherwise. A single .loc call on the frame
    # replaces the chained dataset['IsAlone'].loc[...] = 0, which writes to a
    # temporary object and is a no-op under pandas Copy-on-Write.
    dataset['IsAlone'] = 1
    dataset.loc[dataset['Family_size'] > 1, 'IsAlone'] = 0

    # Names look like "Surname, Title. Given names": keep the text between
    # the first ", " and the following ".".
    dataset['Title'] = dataset['Name'].str.split(", ", expand = True)[1].str.split(".", expand = True)[0]

    # Equal-width bins. NOTE: the bin edges are computed per frame, so train
    # and test bins are not guaranteed to share the same boundaries.
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)
    dataset['FareBin'] = pd.cut(dataset['Fare'].astype(int), 4)

# Titles that occur fewer than stat_min times in a frame are grouped as 'Misc'.
stat_min = 7
train_title_names = (train_data['Title'].value_counts() < stat_min)
test_title_names = (test_data['Title'].value_counts() < stat_min)

# Each boolean Series is indexed by title; True marks a rare title.
train_data['Title'] = train_data['Title'].apply(lambda x: 'Misc' if train_title_names.loc[x] else x)
test_data['Title'] = test_data['Title'].apply(lambda x: 'Misc' if test_title_names.loc[x] else x)


test_data.head()

In [None]:
label = LabelEncoder()

# Add an integer-coded copy of each categorical column to both frames.
# The encoder is re-fitted on every column of every frame, so the codes are
# derived from each frame's own set of values.
for dataset in data:
    for source_col, code_col in (
        ('Title', 'Title_Code'),
        ('Sex', 'Sex_Code'),
        ('Embarked', 'Embarked_Code'),
        ('AgeBin', 'AgeBin_Code'),
        ('FareBin', 'FareBin_Code'),
    ):
        dataset[code_col] = label.fit_transform(dataset[source_col])

test_data.head()

In [None]:
# Column-name lists describing the different feature sets.
Target = ['Survived']
data1_x = ['Family_size', 'Age', 'Fare', 'Sex','Pclass', 'Embarked', 'Title', 'SibSp', 'Parch', 'IsAlone']
data1_x_calc = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code','SibSp', 'Parch', 'Age', 'Fare']
data1_xy =  Target + data1_x


data1_x_bin = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code', 'AgeBin_Code', 'FareBin_Code']
data1_xy_bin = Target + data1_x_bin

# One-hot encode the non-numeric columns of the original feature set.
data1_dummy = pd.get_dummies(train_data[data1_x])
data2_dummy = pd.get_dummies(test_data[data1_x])
data1_x_dummy = data1_dummy.columns.tolist()
data1_xy_dummy = Target + data1_x_dummy

# Give the test frame exactly the training columns, in the same order.
# A category level that occurs only in the training data becomes an all-zero
# column in the test frame instead of being missing.
data2_dummy = data2_dummy.reindex(columns = data1_x_dummy, fill_value = 0)

# Append the target after the last feature column. The position is computed
# from the frame rather than hard-coded, so it stays valid if the number of
# dummy columns changes.
data1_dummy.insert(len(data1_dummy.columns), 'Survived', train_data['Survived'])
data1_dummy.insert(0, 'PassengerId', train_data['PassengerId'])
data2_dummy.insert(0, 'PassengerId', test_data['PassengerId'])

# From here on train_data / test_data refer to the one-hot encoded frames.
train_data = data1_dummy
test_data = data2_dummy

train_data.head()

In [None]:
# Preview the first rows of the one-hot encoded training frame.
train_data.head()

Now let's build our model.

In [None]:
# Columns fed to the model: numeric features followed by the one-hot columns.
Titanic_features = [
    "Age", "Fare", "Pclass", "IsAlone", "SibSp", "Parch",
    "Sex_female", "Sex_male",
    "Embarked_C", "Embarked_Q", "Embarked_S",
    "Title_Master", "Title_Misc", "Title_Miss", "Title_Mr", "Title_Mrs",
]
Titanic_prediction_target = ['Survived']

# Feature matrices for both splits, and the target as a one-column frame.
train_X, test_X = train_data[Titanic_features], test_data[Titanic_features]
train_y = train_data[Titanic_prediction_target]

test_X.head()

Let's find the best parameters for our Random Forest Classifier.

In [None]:
# Candidate values sampled by the randomized hyperparameter search.

# Number of trees: 200, 400, ..., 2000. Zero is excluded because a forest
# needs at least one tree; a candidate with n_estimators=0 cannot be fitted.
n_estimators = list(range(200, 2001, 200))
# 'auto' is not offered: for classifiers it was an alias of 'sqrt' and it has
# been removed from recent scikit-learn releases.
max_features = ['sqrt', 'log2']
# Tree depths 10, 20, ..., 110, plus None for unlimited depth.
max_depth = list(range(10, 111, 10))
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Randomized search over random_grid: 100 sampled combinations, each scored
# with 3-fold cross-validation, using all available cores.
# The forest is seeded so that repeated runs give the same search result.
rf = RandomForestClassifier(random_state = 42)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose = 2, 
                               random_state = 42, n_jobs = -1)
# train_y is a one-column DataFrame; flatten it to a 1-D array, which is the
# target shape scikit-learn expects (otherwise it warns about a column vector).
rf_random.fit(train_X, train_y.values.ravel())

In [None]:
# Hyperparameter combination that scored best in the randomized search.
rf_random.best_params_

Now, using these parameters, we can build our Classification Model.

In [None]:
# Final classifier. These hyperparameters are set by hand here; they are not
# read from rf_random.best_params_. A fixed random_state makes the fitted
# model, and therefore the submission file, reproducible across runs.
my_model = RandomForestClassifier(min_samples_split = 40, 
                             max_leaf_nodes = 15, 
                             n_estimators = 40, 
                             max_depth = 5,
                             min_samples_leaf = 3,
                             max_features = 'sqrt',
                             random_state = 42)

# Flatten the one-column target frame to the 1-D array scikit-learn expects.
train_labels = train_y.values.ravel()
my_model.fit(train_X, train_labels)
# Report accuracy on the training data instead of computing training
# predictions and discarding them.
print("Training accuracy:", my_model.score(train_X, train_labels))
my_model_predictions = my_model.predict(test_X)

# Build the submission frame (one row per test passenger) and write it out.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': my_model_predictions})
print(output)
output.to_csv("my_submission.csv", index=False)
print("You submitted the file")

We achieved a score of about 78.23%.