In [1]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold
from sklearn import cross_validation
import numpy as np
import pandas
import re

# Load the two CSV files: the labelled training set and the test set to score.
train_path, test_path = 'data_sets/train.csv', 'data_sets/test.csv'
titanic = pandas.read_csv(train_path)
titanic_test = pandas.read_csv(test_path)

# Data Treatment: Training and Test

In [2]:
# Fill the gaps: a missing age becomes the median age, a missing port becomes 'S'.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic['Embarked'] = titanic['Embarked'].fillna('S')

# Swap the text categories for integer codes so the columns can feed a model.
for label, code in (('male', 0), ('female', 1)):
    titanic.loc[titanic['Sex'] == label, 'Sex'] = code

# Port-of-embarkation codes; the test-set cell reuses this dictionary.
embarkedDict = {'Q': 2, 'C': 1, 'S': 0}
for key, code in embarkedDict.items():
    titanic.loc[titanic['Embarked'] == key, 'Embarked'] = code

In [3]:
# Fill the gaps in the test frame with medians taken from the TRAINING frame,
# so nothing is estimated from the data we are going to predict on.
for column in ('Age', 'Fare'):
    titanic_test[column] = titanic_test[column].fillna(titanic[column].median())

# Same integer encoding as the training frame.
for label, code in (('male', 0), ('female', 1)):
    titanic_test.loc[titanic_test['Sex'] == label, 'Sex'] = code

for key, code in embarkedDict.items():
    titanic_test.loc[titanic_test['Embarked'] == key, 'Embarked'] = code

# Adding features

In [4]:
# A function to get the title from a name.
def get_title(name):
    """Return the title embedded in a passenger name, or "" if none is found.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, for example
    "Braund, Mr. Owen Harris" -> "Mr".

    name -- the name string to search.
    """
    # Raw string on purpose: in a plain literal "\." is an invalid escape
    # sequence that newer Python versions warn about. The pattern handed to
    # the regex engine is exactly the same as before.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1)
    return ""

def _encode_titles(names, mapping):
    """Extract the title from every name and translate it to its numeric code.

    names   -- pandas Series of passenger name strings.
    mapping -- dict from title string to numeric code.

    Returns a Series of codes carrying the same index as ``names``.
    Raises ValueError naming every extracted title that has no entry in
    ``mapping``; left alone, such a title would stay a string (or turn into
    NaN) in a feature column and only fail later inside the model.
    """
    raw_titles = names.apply(get_title)
    unknown = sorted(set(raw_titles[~raw_titles.isin(list(mapping))]))
    if unknown:
        raise ValueError(
            "No numeric code for title(s): %s" % ", ".join(repr(t) for t in unknown)
        )
    # One vectorised lookup instead of one boolean-mask assignment per title.
    return raw_titles.map(mapping)


# Title mapping. Rare titles share a code with similar ones.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8,  "Mme": 8, "Don": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}
# We're adding the Dona title to the test mapping, because it's in the test set, but not the training set.
title_mapping_test = title_mapping.copy()
title_mapping_test["Dona"] = 10

# Extract and encode the titles of both frames.
titles = _encode_titles(titanic["Name"], title_mapping)
titanic["Title"] = titles
titles_test = _encode_titles(titanic_test["Name"], title_mapping_test)
titanic_test["Title"] = titles_test

# Optional sanity checks: uncomment to see how often each title code occurs.
#print(pandas.value_counts(titanic["Title"]))
#print(pandas.value_counts(titanic_test["Title"]))

In [5]:
# FamilySize = siblings/spouses aboard + parents/children aboard, for both frames.
for frame in (titanic, titanic_test):
    frame["FamilySize"] = frame["SibSp"] + frame["Parch"]

In [6]:
# Columns handed to the models, in this order.
predictors = [
    "Pclass",
    "Sex",
    "Age",
    "Fare",
    "Embarked",
    "FamilySize",
    "Title",
]

In [8]:
# Random forest with a fixed seed so repeated runs give the same scores.
algRF = RandomForestClassifier(
    n_estimators=150,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=1
)
# Cross-validate with cv=3 and report the mean of the per-fold scores.
# NOTE(review): `sklearn.cross_validation` no longer exists in newer
# scikit-learn releases (its functions live in `sklearn.model_selection`);
# confirm which version this notebook is meant to run on.
scores = cross_validation.cross_val_score(
    algRF,
    titanic[predictors],
    titanic["Survived"],
    cv=3
)
print(scores.mean())

0.828282828283


In [None]:
#alg = GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3)
#scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
#print(scores.mean())

In [None]:
#alg = LogisticRegression(random_state=1)
#scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic["Survived"], cv=3)
#print(scores.mean())

In [9]:
# Train on the complete training frame, then score the test passengers.
algRF.fit(titanic[predictors], titanic["Survived"])
test_features = titanic_test[predictors].astype(float)
# Keep only the second probability column (presumably P(Survived == 1),
# assuming the labels are 0 and 1).
survival_proba = algRF.predict_proba(test_features)[:, 1]

# Turn probabilities into hard outcomes: strictly above .5 becomes 1, everything else 0.
predictions = (survival_proba > .5).astype(int)

# Two-column submission file: passenger id and predicted outcome.
submission = pandas.DataFrame({"PassengerId": titanic_test["PassengerId"], "Survived": predictions})
submission.to_csv("RandomForest.csv", index=False)