# Kaggle Titanic Challenge

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

full_set = train.append(test, ignore_index = True, sort=False)

del train, test

train_set = full_set[ :891]
train_set.describe()


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Preprocessing

In [88]:
# One Hot Encode Sex
sex = pd.get_dummies(full_set.Sex)

# One Hot Encode Embarked
embarked = pd.get_dummies(full_set.Embarked, prefix='Embarked')

# One Hot Encode Pclass
pclass = pd.get_dummies(full_set.Pclass, prefix='Pclass')

# Fill in missing variables in Age, Fare with averages respectively
age = full_set.Age.fillna(full_set.Age.median())
fare = full_set.Fare.fillna(full_set.Fare.median())

# Add a new column called Title
title = pd.DataFrame()
title['Title'] = full_set.Name.map(lambda name: name.split(',')[1].split('.')[0].strip())
title_dictionary = {
                    "Capt":       "Officer",
                    "Col":        "Officer",
                    "Major":      "Officer",
                    "Jonkheer":   "Royalty",
                    "Don":        "Royalty",
                    "Sir" :       "Royalty",
                    "Dr":         "Officer",
                    "Rev":        "Officer",
                    "the Countess":"Royalty",
                    "Dona":       "Royalty",
                    "Mme":        "Mrs",
                    "Mlle":       "Miss",
                    "Ms":         "Miss",
                    "Mr" :        "Mr",
                    "Mrs" :       "Mrs",
                    "Miss" :      "Miss",
                    "Master" :    "Master",
                    "Lady" :      "Royalty"
                    }
title['Title'] = title.Title.map(title_dictionary)
title = pd.get_dummies(title.Title)

# Extract category information
cabin = pd.DataFrame()
cabin['Cabin'] = full_set.Cabin.fillna('N') # 'N' for NaN
cabin['Cabin'] = cabin['Cabin'].map(lambda x : x[0])
cabin = pd.get_dummies(cabin['Cabin'], prefix='Cabin')



In [89]:
# Assemble data sets for modeling
# Columns can be age, fare, title, cabin, SibSp, Parch, pclass, embarked, ticket
full_X = pd.concat([title, age, fare, pclass, full_set.SibSp, full_set.Parch, embarked, cabin], axis=1)
full_X.head()

train_X, test_X, train_y, test_y = train_test_split(full_X[0:891], train_set.Survived, train_size=0.7)

In [90]:
# Try different models and see which works best
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

best_model_acc = 0
best_model = None

models = [RandomForestClassifier(), LogisticRegression(), DecisionTreeClassifier(), KNeighborsClassifier(n_neighbors=5), GaussianNB(), SVC()]
for model in models:
    curr_model = model
    curr_model.fit(train_X, train_y)
    print(model)
    training_acc = curr_model.score(train_X, train_y)
    testing_acc = curr_model.score(test_X, test_y)
    print('Training Accuracy: %s' %training_acc)
    print('Testing Accuracy: %s' %testing_acc)
    print('\n')
    if(testing_acc > best_model_acc):
        best_model_acc = testing_acc
        best_model = model
    

# Best model is LogisticRegression
print(best_model)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Training Accuracy: 0.9582664526484751
Testing Accuracy: 0.8805970149253731


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Training Accuracy: 0.8282504012841091
Testing Accuracy: 0.8582089552238806


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decreas

In [91]:
# Submitting Results
pred_y = best_model.predict(full_X[891:])
passenger_id = full_set[891:].PassengerId
test = pd.DataFrame( {'PassengerId': passenger_id, 'Survived': pred_y} )
print(test.shape)
print(test.head())
test.to_csv('titanic_pred.csv', index=False)

(418, 2)
     PassengerId  Survived
891          892       0.0
892          893       0.0
893          894       0.0
894          895       1.0
895          896       1.0
