# Titanic – Machine Learning from Disaster
**GitHub:** https://github.com/pirsquareff/pattern-assignment-1<br>
**Kaggle:** https://www.kaggle.com/c/titanic


In this part of the exercise we will work on the Titanic dataset provided by Kaggle. The Titanic dataset contains information of the passengers boarding the Titanic on its final voyage. We will work on predicting whether a given passenger will survive the trip.

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
import copy

In [2]:
# training set
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)

# test set
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url)

In [3]:
# explore training set
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# explore test set
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Data Dictionary
https://www.kaggle.com/c/titanic/data

In [5]:
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Cleaning Data
- Replace missing values of age to their mean
- Convert categorical fields (Embarked, and Sex) to number

In [6]:
# cleaning data
purified_train = train
purified_train["Age"] = purified_train["Age"].fillna(purified_train["Age"].median())
purified_train[['Age']] = purified_train[['Age']].apply(zscore)

purified_train.loc[purified_train["Embarked"] == "S", "Embarked"] = 0
purified_train.loc[purified_train["Embarked"] == "C", "Embarked"] = 1
purified_train.loc[purified_train["Embarked"] == "Q", "Embarked"] = 2
purified_train["Embarked"] = purified_train["Embarked"].fillna(purified_train["Embarked"].mode()[0])

purified_train.loc[purified_train["Sex"] == "male", "Sex"] = 0
purified_train.loc[purified_train["Sex"] == "female", "Sex"] = 1
purified_train["Sex"] = purified_train["Sex"].fillna(purified_train["Sex"].mode()[0])
purified_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,0.352413,2.27278e-16,0.523008,0.381594,32.204208,0.361392
std,257.353842,0.486592,0.836071,0.47799,1.000562,1.102743,0.806057,49.693429,0.635673
min,1.0,0.0,1.0,0.0,-2.224156,0.0,0.0,0.0,0.0
25%,223.5,0.0,2.0,0.0,-0.5657365,0.0,0.0,7.9104,0.0
50%,446.0,0.0,3.0,0.0,-0.1046374,0.0,0.0,14.4542,0.0
75%,668.5,1.0,3.0,1.0,0.4333115,1.0,0.0,31.0,1.0
max,891.0,1.0,3.0,1.0,3.891554,8.0,6.0,512.3292,2.0


## Extract the features
Extract Pclass, Sex, Age, and Embarked from Pandas to Numpy

In [7]:
data = np.array(purified_train[["Pclass", "Sex", "Age", "Embarked"]].values, dtype = np.float)
print(data)
label = np.array(purified_train["Survived"].values, dtype = np.float)

[[ 3.          0.         -0.56573646  0.        ]
 [ 1.          1.          0.66386103  1.        ]
 [ 3.          1.         -0.25833709  0.        ]
 ..., 
 [ 3.          1.         -0.1046374   0.        ]
 [ 1.          0.         -0.25833709  1.        ]
 [ 3.          0.          0.20276197  2.        ]]


## Sigmoid Function
$$f(x) = \frac{1}{1 + e^{-x}}$$

In [8]:
# sigmoid function (or logistic function)
def sigmoid_func(x) :
    return 1.0 / (1.0 + np.exp((-1.0) * x))

In [26]:
def gradient_descent(initial_theta, learning_rate, iterations, x, y) :
    theta_shape = initial_theta.shape
    current_theta = copy.deepcopy(initial_theta)
    for i in range(iterations) :
        h = sigmoid_func(np.dot(x, current_theta))
        for j in range(theta_shape[0]) :
            current_theta[j] = current_theta[j] + learning_rate * np.sum(np.dot((y - h), x[:, j]))
    return current_theta

In [27]:
n_data_param = data.shape[1]
initial_theta = np.array([0.5] * (n_data_param + 1))

# append x0 column to x
x0 = np.array([[1]] * data.shape[0])
x = np.append(x0, data, axis = 1)
y = label

In [28]:
optimal_theta = gradient_descent(initial_theta, 0.01, 100000, x, y)
optimal_theta

array([  2.90562249,  -1.60737601,  13.21597695,  -1.13000986,   1.119408  ])

In [29]:
def h_function(x, theta) :
    return sigmoid_func(np.array(np.dot(x, theta), dtype = np.float))

In [30]:
# cleaning data
purified_test = test
purified_test["Age"] = purified_test["Age"].fillna(purified_test["Age"].median())
purified_test[['Age']] = purified_test[['Age']].apply(zscore)

purified_test.loc[purified_test["Embarked"] == "S", "Embarked"] = 0
purified_test.loc[purified_test["Embarked"] == "C", "Embarked"] = 1
purified_test.loc[purified_test["Embarked"] == "Q", "Embarked"] = 2
purified_test["Embarked"] = purified_test["Embarked"].fillna(purified_test["Embarked"].mode()[0])

purified_test.loc[purified_test["Sex"] == "male", "Sex"] = 0
purified_test.loc[purified_test["Sex"] == "female", "Sex"] = 1
purified_test["Sex"] = purified_test["Sex"].fillna(purified_test["Sex"].mode()[0])
purified_test.describe()

  result = getattr(x, name)(y)


TypeError: invalid type comparison

In [31]:
test_data = np.array(purified_test[["Pclass", "Sex", "Age", "Embarked"]].values, dtype = np.float)

In [32]:
x_test0 = np.array([[1]] * test_data.shape[0])
x_test = np.append(x_test0, test_data, axis = 1)

In [33]:
prediction = h_function(x_test, optimal_theta)

In [34]:
adjusted_prediction = [1 if p >= 0.5 else 0 for p in prediction]

In [35]:
result = test[["PassengerId"]]
result["Survived"] = pd.Series(np.array(adjusted_prediction, dtype = int))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [36]:
result.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [38]:
result.to_csv("prediction.csv", sep = ',', index = False)

## Kaggle Grading
<img src="kaggle_screenshot.png" width="60%" height="60%"/>