# Titanic – Machine Learning from Disaster
**Kaggle:** https://www.kaggle.com/c/titanic

In this part of the exercise we will work on the Titanic dataset provided by Kaggle. The Titanic dataset contains information of the passengers boarding the Titanic on its final voyage. We will work on predicting whether a given passenger will survive the trip.

In [206]:
import numpy as np
import pandas as pd

In [214]:
# training set
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)

# test set
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)

In [208]:
# explore training set
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [209]:
# explore test set
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Data Dictionary
https://www.kaggle.com/c/titanic/data

In [210]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Cleaning Data
- Replace missing values of age to their mean
- Convert categorical fields (Embarked, and Sex) to number

In [215]:
# cleaning data
purified_train = train
purified_train["Age"] = purified_train["Age"].fillna(purified_train["Age"].median())

# purified_train.loc[(purified_train["Embarked"] != "S") & (purified_train["Embarked"] != "C") & (purified_train["Embarked"] != "Q"), "Embarked"] = np.nan
purified_train.loc[purified_train["Embarked"] == "S", "Embarked"] = 0
purified_train.loc[purified_train["Embarked"] == "C", "Embarked"] = 1
purified_train.loc[purified_train["Embarked"] == "Q", "Embarked"] = 2
purified_train["Embarked"] = purified_train["Embarked"].fillna(purified_train["Embarked"].mode()[0])

purified_train.loc[purified_train["Sex"] == "male", "Sex"] = 0
purified_train.loc[purified_train["Sex"] == "female", "Sex"] = 1
purified_train["Sex"] = purified_train["Sex"].fillna(purified_train["Sex"].mode()[0])
purified_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,29.361582,0.523008,0.381594,32.204208,0.361392
std,257.353842,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,0.635673
min,1.0,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,0.0
50%,446.0,0.0,3.0,0.0,28.0,0.0,0.0,14.4542,0.0
75%,668.5,1.0,3.0,1.0,35.0,1.0,0.0,31.0,1.0
max,891.0,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


## Extract the features
Extract Pclass, Sex, Age, and Embarked from Pandas to Numpy

In [330]:
data = np.array(purified_train[["Pclass", "Sex", "Age", "Embarked"]].values, dtype = np.float)
print(data)
label = np.array(purified_train["Survived"].values, dtype = np.float)
# label

[[  3.   0.  22.   0.]
 [  1.   1.  38.   1.]
 [  3.   1.  26.   0.]
 ..., 
 [  3.   1.  28.   0.]
 [  1.   0.  26.   1.]
 [  3.   0.  32.   2.]]


array([ 0.,  1.,  1.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,
        0.,  0.,  1.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,  0.,  1.,
        0.,  0.,  1.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  1.,  0.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  1.,  1.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  1.,  1.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  1.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  1.,  0.,  0.,
        0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

## Sigmoid Function
$$f(x) = \frac{1}{1 + e^{-x}}$$

In [331]:
# sigmoid function (or logistic function)
def sigmoid_func(x) :
    return 1.0 / (1.0 + np.exp((-1.0) * x))

In [361]:
def gradient_descent(initial_theta, learning_rate, iterations, x, y) :
    theta_shape = initial_theta.shape
    current_theta = np.array([0.0] * theta_shape[0])
    for i in range(theta_shape[0]) :
        current_theta[i] = initial_theta[i]
    for i in range(iterations) :
        theta_tmp = np.array([0.0] * theta_shape[0])
#         for j in range(theta_shape[0]) :
#             theta_tmp[j] = current_theta[j]
        h = sigmoid_func(np.array(np.dot(x, current_theta), dtype = np.float))
#         print(h)
        for j in range(theta_shape[0]) :
#             print(np.sum((y - h) * x[:, j]))
            theta_tmp[j] = current_theta[j] - learning_rate * np.sum((y - h) * x[:, j])
        # simultaneous update
        for j in range(theta_shape[0]) :
            current_theta[j] = theta_tmp[j]
    return current_theta

In [362]:
n_data_param = data.shape[1]
initial_theta = np.array([0.0] * (n_data_param + 1))

# append x0 column to x
x0 = np.array([[1]] * data.shape[0])
x = np.append(x0, data, axis = 1)
y = label

In [364]:
optimal_theta = gradient_descent(initial_theta, 0.000005, 10000, x, y)
optimal_theta

array([  27.4440948 ,   69.48595894,    4.04785294,  824.13635075,
          8.4479108 ])

In [371]:
def h_function(x, theta) :
    return sigmoid_func(np.array(np.dot(x, theta), dtype = np.float))

In [366]:
# cleaning data
purified_test = test
purified_test["Age"] = purified_test["Age"].fillna(purified_test["Age"].median())

# purified_train.loc[(purified_train["Embarked"] != "S") & (purified_train["Embarked"] != "C") & (purified_train["Embarked"] != "Q"), "Embarked"] = np.nan
purified_test.loc[purified_test["Embarked"] == "S", "Embarked"] = 0
purified_test.loc[purified_test["Embarked"] == "C", "Embarked"] = 1
purified_test.loc[purified_test["Embarked"] == "Q", "Embarked"] = 2
purified_test["Embarked"] = purified_test["Embarked"].fillna(purified_test["Embarked"].mode()[0])

purified_test.loc[purified_test["Sex"] == "male", "Sex"] = 0
purified_test.loc[purified_test["Sex"] == "female", "Sex"] = 1
purified_test["Sex"] = purified_test["Sex"].fillna(purified_test["Sex"].mode()[0])
purified_test.describe()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,417.0,418.0
mean,1100.5,2.26555,0.363636,29.599282,0.447368,0.392344,35.627188,0.464115
std,120.810458,0.841838,0.481622,12.70377,0.89676,0.981429,55.907576,0.685516
min,892.0,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,996.25,1.0,0.0,23.0,0.0,0.0,7.8958,0.0
50%,1100.5,3.0,0.0,27.0,0.0,0.0,14.4542,0.0
75%,1204.75,3.0,1.0,35.75,1.0,0.0,31.5,1.0
max,1309.0,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


In [367]:
test_data = np.array(purified_test[["Pclass", "Sex", "Age", "Embarked"]].values, dtype = np.float)
print(test_data)

[[  3.    0.   34.5   2. ]
 [  3.    1.   47.    0. ]
 [  2.    0.   62.    2. ]
 ..., 
 [  3.    0.   38.5   0. ]
 [  3.    0.   27.    0. ]
 [  3.    0.   27.    1. ]]


In [369]:
x_test0 = np.array([[1]] * test_data.shape[0])
x_test = np.append(x_test0, test_data, axis = 1)

In [373]:
prediction = h_function(x_test, optimal_theta)
prediction

array([ 1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1