# Predicting Titanic Survivors

In [1]:
# import resources
import pandas as pd
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', -1)

import numpy as np
import re
from sklearn.tree import DecisionTreeClassifier

### Load Data

In [2]:
# load data
train_data = pd.read_csv("./data/train.csv")
test_data = pd.read_csv("./data/test.csv")
all_data = [train_data,test_data]

### Explore Data

* PassengerId : int     : Id
* Survived    : int     : Survival (0=No; 1=Yes)
* Pclass      : int     : Passenger Class
* Name        : object  : Name
* Sex         : object  : Sex
* Age         : float   : Age
* SibSp       : int     : Number of Siblings/Spouses Aboard
* Parch       : int     : Number of Parents/Children Aboard
* Ticket      : object  : Ticket Number
* Fare        : float   : Passenger Fare
* Cabin       : object  : Cabin
* Embarked    : object  : Port of Embarkation (C=Cherbourg; Q=Queenstown; S=Southampton)

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.dtypes

PassengerId    int64  
Survived       int64  
Pclass         int64  
Name           object 
Sex            object 
Age            float64
SibSp          int64  
Parch          int64  
Ticket         object 
Fare           float64
Cabin          object 
Embarked       object 
dtype: object

In [5]:
train_data.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Thorneycroft, Mrs. Percival (Florence Kate White)",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


### Feature Engineering
Feature engineering is measuring the impact of each feature on the output


##### 1 -  Exploring Passenger Class Feature

In [6]:
# 63% of citizens from the first class have survived - feature is impactful
print(train_data[["Pclass","Survived"]].groupby(["Pclass"], as_index = False).mean())

   Pclass  Survived
0  1       0.629630
1  2       0.472826
2  3       0.242363


##### 2 -  Exploring Sex Features

In [7]:
# more females than males survived - feature is impactful
print(train_data[["Sex","Survived"]].groupby(["Sex"], as_index = False).mean())

      Sex  Survived
0  female  0.742038
1  male    0.188908


##### 3 -  Exploring Family Size Feature

In [8]:
for item in all_data:
    item["FamilySize"] = item["Parch"] + item["SibSp"] + 1

In [9]:
# family members of 4 survived at a rate of 72%
print(train_data[["FamilySize","Survived"]].groupby(["FamilySize"], as_index = False).mean())

   FamilySize  Survived
0  1           0.303538
1  2           0.552795
2  3           0.578431
3  4           0.724138
4  5           0.200000
5  6           0.136364
6  7           0.333333
7  8           0.000000
8  11          0.000000


##### 4 -  Exploring Embarked Feature

In [10]:
for item in all_data:
    item["Embarked"] = item["Embarked"].fillna("S") #most occurance

print(train_data[["Embarked","Survived"]].groupby(["Embarked"], as_index = False).mean())

  Embarked  Survived
0  C        0.553571
1  Q        0.389610
2  S        0.339009


##### 5 -  Exploring Fare Feature

In [11]:
# due to empty values, the median will be taken 
for item in all_data:
    item["Fare"] = item["Fare"].fillna(item['Fare'].median())
    
train_data['CategoryFare'] = pd.qcut(train_data['Fare'], 4)
print( train_data[["CategoryFare","Survived"]].groupby(["CategoryFare"], as_index = False).mean() )

      CategoryFare  Survived
0  (-0.001, 7.91]   0.197309
1  (7.91, 14.454]   0.303571
2  (14.454, 31.0]   0.454955
3  (31.0, 512.329]  0.581081


##### 6 -  Exploring Age Feature

In [12]:
for item in all_data: 
    avg_age = item["Age"].mean()
    std_age = item["Age"].std()
    null_age = item["Age"].isnull().sum()
    
    random_list = np.random.randint(avg_age - std_age, avg_age + std_age , size = null_age)
    item['Age'][np.isnan(item['Age'])] = random_list
    item['Age'] = item['Age'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [13]:
for data in all_data:
    age_avg  = data['Age'].mean()
    age_std  = data['Age'].std()
    age_null = data['Age'].isnull().sum()

    random_list = np.random.randint(age_avg - age_std, age_avg + age_std , size = age_null)
    data['Age'][np.isnan(data['Age'])] = random_list
    data['Age'] = data['Age'].astype(int)

train_data['CategoryAge'] = pd.cut(train_data['Age'], 5)
print( train_data[["CategoryAge","Survived"]].groupby(["CategoryAge"], as_index = False).mean())

     CategoryAge  Survived
0  (-0.08, 16.0]  0.527778
1  (16.0, 32.0]   0.341518
2  (32.0, 48.0]   0.396078
3  (48.0, 64.0]   0.434783
4  (64.0, 80.0]   0.090909


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


##### 7 -  Exploring Title Feature

In [14]:
def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\. ', name)
    
    if title_search:
        return title_search.group(1)
    return ""

for data in all_data:
    data['Title'] = data['Name'].apply(get_title)
    
for data in all_data:
    data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],'Rare')
    data['Title'] = data['Title'].replace('Mlle','Miss')
    data['Title'] = data['Title'].replace('Ms','Miss')
    data['Title'] = data['Title'].replace('Mme','Mrs')

print(train_data[['Title','Survived']].groupby(['Title'], as_index = False).mean())

    Title  Survived
0  Master  0.575000
1  Miss    0.702703
2  Mr      0.156673
3  Mrs     0.793651
4  Rare    0.347826


### Mapping Data

In [15]:
for item in all_data:
    
    # map sex 
    sex_map = { 'female':0 , 'male':1 }
    item['Sex'] = item['Sex'].map(sex_map).astype(int)

    
    # map embarked
    embark_map = {'S':0, 'C':1, 'Q':2}
    item['Embarked'] = item['Embarked'].map(embark_map).astype(int)
    
    # map title
    title_map = {'Mr':1, 'Miss':2, 'Mrs':3, 'Master':4, 'Rare':5}
    item['Title'] = item['Title'].map(title_map)
    item['Title'] = item['Title'].fillna(0)
    
    # map fare
    item.loc[ item['Fare'] <= 7.91, 'Fare'] = 0
    item.loc[(item['Fare'] > 7.91) & (item['Fare'] <= 14.454), 'Fare'] = 1
    item.loc[(item['Fare'] > 14.454) & (item['Fare'] <= 31), 'Fare'] = 2
    item.loc[ item['Fare'] > 31, 'Fare'] = 3
    item['Fare'] = item['Fare'].astype(int)

    #Mapping Age
    item.loc[ item['Age'] <= 16, 'Age'] = 0
    item.loc[(item['Age'] > 16) & (item['Age'] <= 32), 'Age'] = 1
    item.loc[(item['Age'] > 32) & (item['Age'] <= 48), 'Age'] = 2
    item.loc[(item['Age'] > 48) & (item['Age'] <= 64), 'Age'] = 3
    item.loc[ item['Age'] > 64, 'Age'] = 4

In [16]:
drop_elements = ["Name", "Ticket", "Cabin", "SibSp", "Parch", "FamilySize"]

# drop columns from both data sets
train_data = train_data.drop(drop_elements, axis = 1)

In [17]:
train_data = train_data.drop(columns = ["PassengerId","CategoryFare", "CategoryAge"])
print(train_data.head(5))

   Survived  Pclass  Sex  Age  Fare  Embarked  Title
0  0         3       1    1    0     0         1    
1  1         1       0    2    3     1         3    
2  1         3       0    1    1     0         2    
3  1         1       0    2    3     0         3    
4  0         3       1    2    1     0         1    


In [18]:
test_data = test_data.drop(drop_elements, axis = 1)
print(test_data.head(5))

   PassengerId  Pclass  Sex  Age  Fare  Embarked  Title
0  892          3       1    2    0     2         1    
1  893          3       0    2    0     0         3    
2  894          2       1    3    1     2         1    
3  895          3       1    1    1     0         1    
4  896          3       0    1    1     0         3    


### Prediction
+ X : X_train : Contains all the features
+ Y : Y_train : Contains the actual output (Survived)

In [22]:
X_train = train_data.drop("Survived", axis=1)
Y_train = train_data["Survived"]
X_test  = test_data.drop("PassengerId", axis=1).copy()

In [23]:
# running classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [24]:
Y_pred = decision_tree.predict(X_test)
accuracy = round(decision_tree.score(X_train, Y_train) * 100, 2)

print("Model Accuracy: ",accuracy)

Model Accuracy:  86.76


### Saving results

In [25]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Title
0,892,3,1,2,0,2,1
1,893,3,0,2,0,0,3
2,894,2,1,3,1,2,1
3,895,3,1,1,1,0,1
4,896,3,0,1,1,0,3


In [28]:
submission = pd.DataFrame({
    "PassengerId": test_data["PassengerId"],
    "Survived": Y_pred
})

submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [29]:
submission.to_csv('submission.csv', index = False)