# TITANIC: MACHINE LEARNING FROM DISASTER

[Kaggle competition page](https://www.kaggle.com/c/titanic)

## 1. EXAMINING THE DATA

In [7]:
# First glance at the data: load the training set and summarise its numeric columns.
import pandas as pd

# Read the training data from the current working directory.
train_titanic = pd.read_csv('train.csv')

# NOTE(review): a cell only displays its last expression, so this head() result is
# computed and then discarded -- only the describe() table below is shown.
train_titanic.head()
# Summary statistics of the numeric columns; a count below the row count (see Age in
# the output) means that column has missing values.
train_titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
# Impute the missing ages with the median age so that every passenger has a value
median_age = train_titanic['Age'].median()
train_titanic['Age'] = train_titanic['Age'].fillna(median_age)

# Re-check the summary: the Age count should now equal the number of rows
train_titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
# Convert the non-numeric columns into integer codes.
# Series.replace leaves values that are already encoded untouched, so re-running this
# cell is safe, and astype(int) gives the columns a real integer dtype instead of
# leaving them as object columns that merely contain ints. astype(int) raises if an
# unexpected category or a missing value is still present after the mapping.
train_titanic['Sex'] = train_titanic['Sex'].replace({'male': 0, 'female': 1}).astype(int)

# Missing embarkation ports are treated as 'S', i.e. they end up with code 0 exactly as
# before -- the choice is now spelled out instead of being a bare fillna(0).
train_titanic['Embarked'] = (
    train_titanic['Embarked']
    .fillna('S')
    .replace({'S': 0, 'C': 1, 'Q': 2})
    .astype(int)
)
print(train_titanic['Embarked'].unique())

# Show a sample of rows rather than dumping the whole frame
train_titanic.head(10)

[0 1 2]


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,0
5,6,0,3,"Moran, Mr. James",0,28.0,0,0,330877,8.4583,,2
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,0
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.0750,,0
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,0
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,1


## Cross validation

In [13]:
from sklearn.linear_model import LinearRegression
# KFold generates the train/test index splits used for cross validation.
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; newer versions provide KFold in sklearn.model_selection with a different
# signature -- confirm which scikit-learn version this notebook targets.
from sklearn.cross_validation import KFold

# Columns we use to predict the target
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Initialize algorithm class
alg = LinearRegression()

# Split the row positions of the training set into 3 folds; iterating over kf yields
# one (train positions, test positions) pair per fold.
# NOTE(review): no shuffle is requested, so random_state presumably has no effect -- confirm.
kf = KFold(train_titanic.shape[0], n_folds=3, random_state=1)

# Collects one array of held-out predictions per fold, in fold order
predictions = []

for train, test in kf:
    # Predictor columns of the rows used for fitting in this fold
    train_predictors = (train_titanic[predictors].iloc[train,:])
    # Target values of those same rows
    train_target = train_titanic['Survived'].iloc[train]
    # Fit the model on this fold's training rows only
    alg.fit(train_predictors, train_target)
    # Predict the rows that were held out of the fit
    test_predictions = alg.predict(train_titanic[predictors].iloc[test,:])
    
    predictions.append(test_predictions)

# Number of rows in the training set
print(train_titanic.shape[0])

891


## Evaluate result

In [14]:
# Evaluation metric: accuracy, i.e. the fraction of cross-validated predictions that
# match the actual values in train_titanic['Survived'].
import numpy as np

# The per-fold predictions are separate numpy arrays; join them into a single array
predictions = np.concatenate(predictions, axis=0)

# Map the regression output onto the two classes: above .5 -> 1, otherwise 0
predictions = np.where(predictions > .5, 1.0, 0.0)

# Compare predictions and labels position by position in one vectorised step.
# The mean of the boolean matches is the accuracy; unlike int / int this is a true
# fraction on Python 2 as well, where the previous division truncated to 0.
# Assumes the concatenated predictions are in the same row order as train_titanic,
# which is what the previous counter-based lookup relied on too.
accuracy = np.mean(predictions == train_titanic['Survived'].values)

# Display the result
accuracy


## Logistic Regression


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation

# Logistic regression predicts the class directly, so no manual thresholding is needed
alg = LogisticRegression()

# One score per fold from 3-fold cross validation
scores = cross_validation.cross_val_score(alg, train_titanic[predictors], train_titanic['Survived'], cv=3)

# Report the mean score across the folds (it was computed but never displayed before)
scores.mean()



## Submission

In [16]:
# Apply the same preprocessing steps to the test set
test_titanic = pd.read_csv('test.csv')

# Replace missing data with the column median.
# NOTE(review): these medians come from the test set itself; using the training-set
# medians would keep both sets on the same imputation values -- confirm the intent.
test_titanic['Fare'] = test_titanic['Fare'].fillna(test_titanic['Fare'].median())
test_titanic['Age'] = test_titanic['Age'].fillna(test_titanic['Age'].median())

# Map non-numeric data into integer codes.
# Series.replace leaves already-encoded values untouched (safe to re-run) and
# astype(int) produces a real integer dtype instead of an object column; it raises if
# an unexpected category or a missing value remains.
test_titanic['Sex'] = test_titanic['Sex'].replace({'male': 0, 'female': 1}).astype(int)

# Missing embarkation ports are treated as 'S', i.e. code 0 as before
test_titanic['Embarked'] = (
    test_titanic['Embarked']
    .fillna('S')
    .replace({'S': 0, 'C': 1, 'Q': 2})
    .astype(int)
)


In [17]:
# Create a fresh logistic regression and train it on all of the training data
# (fit returns the estimator itself, so construction and fitting can be chained)
alg = LogisticRegression().fit(train_titanic[predictors], train_titanic['Survived'])

# Predict survival for every passenger in the test set
predictions = alg.predict(test_titanic[predictors])

# Keep only the two columns Kaggle expects in a submission
submission = pd.DataFrame({
    'PassengerId': test_titanic['PassengerId'],
    'Survived': predictions,
})

# Write the submission file without the index column
submission.to_csv('kaggle.csv', index=False)


## Improve performance

We will improve the performance in the following three ways:
1. Finding a better algorithm
2. Generating better features
3. Combining multiple machine learning algorithms

### Finding a better algorithm - Random Forest

In [39]:
# Random forest
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Same seven predictor columns as used for the linear models
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

# Initialize the random forest algorithm: 150 trees, a fixed seed for repeatable
# results, and minimum split/leaf sizes set above their defaults
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=4, min_samples_leaf=2)


# Score the model with 3-fold cross validation (one score per fold)
scores = cross_validation.cross_val_score(alg, train_titanic[predictors], train_titanic['Survived'], cv = 3)

# Mean score across the folds
scores.mean()

0.8204264870931538

### Generating new features

In [45]:
# Generate a FamilySize column equal to SibSp + Parch
train_titanic['FamilySize'] = train_titanic['SibSp'] + train_titanic['Parch']

# Generate the length of each name with the vectorised string accessor; unlike
# apply(len) this yields NaN instead of raising if a name is missing
train_titanic['NameLength'] = train_titanic['Name'].str.len()


In [62]:
# Extract the title feature because titles can strongly relate to family class

import re

# A function to get the title from a name.
def get_title(name):
    """Return the title found in a passenger name, or "" if there is none.

    A title is the first run of ASCII letters that follows a space and is
    immediately followed by a period, e.g. "Mr" in "Braund, Mr. Owen Harris".

    Parameters:
        name: the full name as a string.

    Returns:
        The title without the trailing period, or an empty string when the
        name contains no such pattern.
    """
    # Raw string: "\." must reach the regex engine as an escaped period. In a normal
    # string literal "\." is an invalid escape sequence that newer Python versions
    # warn about.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

# Extract the title of every passenger.
titles = train_titanic["Name"].apply(get_title)


# Map each title to an integer.  Some titles are very rare, and are compressed into the same codes as other titles.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, 
                 "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Lady": 10, 
                 "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
# Translate all titles in a single pass instead of one boolean-mask assignment per
# mapping entry. A title without a code is passed through unchanged, so it still shows
# up as text in the counts printed below.
titles = titles.map(lambda title: title_mapping.get(title, title))

# Verify that we converted everything: only integer codes should be listed.
print(titles.value_counts())

# Add in the title column.
train_titanic["Title"] = titles


1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64


In [53]:
# Extract the family group feature because chance of survival can be related to people who are 
# in the same family group
import operator

# A dictionary mapping a family key (last name followed by family size) to its id
family_id_mapping = {}

# A function to get the id given a row
def get_family_id(row):
    """Return the integer family id for a passenger row, allocating one if needed.

    The family key is the text before the first comma of row["Name"] followed by
    row["FamilySize"]. Known keys return their stored id; a new key is recorded in
    the module-level family_id_mapping with the next free id.

    Parameters:
        row: a mapping-like row (e.g. a DataFrame row) with "Name" and "FamilySize".

    Returns:
        The id associated with the row's family key.
    """
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family key
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the key in the mapping
    if family_id not in family_id_mapping:
        # Ids are only ever allocated here, consecutively starting at 1, so the next
        # free id is always the current size plus one. This avoids scanning the whole
        # mapping for its maximum on every new family.
        family_id_mapping[family_id] = len(family_id_mapping) + 1
    return family_id_mapping[family_id]

# Get the family ids with the apply method (one call per row)
family_ids = train_titanic.apply(get_family_id, axis=1)

# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.
family_ids[train_titanic["FamilySize"] < 3] = -1

# Print the count of each unique id.
print(family_ids.value_counts())

train_titanic["FamilyId"] = family_ids

# Show a sample of rows rather than dumping the whole frame
train_titanic.head(10)

-1      800
 14       8
 149      7
 63       6
 50       6
 59       6
 17       5
 384      4
 27       4
 25       4
 162      4
 8        4
 84       4
 340      4
 43       3
 269      3
 58       3
 633      2
 167      2
 280      2
 510      2
 90       2
 83       1
 625      1
 376      1
 449      1
 498      1
 588      1
dtype: int64


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,NameLength,Title,FamilyId
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,0,1,23,1,-1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,1,51,3,-1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,22,2,-1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0,1,44,3,-1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,0,0,24,1,-1
5,6,0,3,"Moran, Mr. James",0,28.0,0,0,330877,8.4583,,2,0,16,1,-1
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,0,0,23,1,-1
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.0750,,0,4,30,4,8
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,0,2,49,3,-1
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,1,1,35,3,-1


### Finding the best features

Feature engineering is the most important part of any machine learning task, and there are many more features
we could calculate. But we need a way to figure out which features are the best.

In [60]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif

# Candidate features: the original predictor columns plus the engineered ones
predictors = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title", "FamilyId"]

# Perform feature selection: score every candidate against the target with f_classif.
# NOTE(review): k=5 is never used to select columns in this cell -- only pvalues_ is
# read, and the final predictor list further down is hard-coded.
selector = SelectKBest(f_classif, k=5)
selector.fit(train_titanic[predictors], train_titanic['Survived'])

# Get the raw p-value of each feature and turn it into a score: the smaller the
# p-value, the larger -log10(p)
scores = -np.log10(selector.pvalues_)

# Plot the scores.  See how "Pclass", "Sex", "Title", and "Fare" are the best?
plt.bar(range(len(predictors)), scores)
plt.xticks(range(len(predictors)), predictors, rotation='vertical')
plt.show()

# Pick only the four best features.
predictors = ["Pclass", "Sex", "Fare", "Title"]

# NOTE(review): RandomForestClassifier and cross_validation are not imported in this
# cell; they must already be in scope from an earlier cell.
alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4)

# `scores` is reused: from here on it holds the 3-fold cross-validation scores of the
# forest, not the feature scores plotted above
scores = cross_validation.cross_val_score(alg, train_titanic[predictors], train_titanic['Survived'], cv =3)

# Mean score across the folds
print(scores.mean())

0.811447811448


### Finding a better algorithm - Gradient Boosting Classifier