## Titanic: Machine Learning from Disaster

- about competition: https://www.kaggle.com/c/titanic
- data: https://www.kaggle.com/c/titanic/data
- leaderboard: https://www.kaggle.com/c/titanic/leaderboard
- my best submission: https://www.kaggle.com/c/titanic/leaderboard?submissionId=3335297 75.598% (Gradient Boosting)

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import KFold

Get Data

In [2]:
df_train_src = pd.read_csv('data/train.csv')
df_test_src = pd.read_csv('data/test.csv')

print df_train_src.info()
df_train_src.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Cleaning and feature engineering

In [3]:
def feature_eng(df):
    tit = df.copy(deep=True)
    
    # Existing features
    # Pclass: one hot
    tit = one_hot(tit, 'Pclass', ['1', '2', '3'])
    # Sex: one hot
    tit = one_hot(tit, 'Sex', ['female', 'male'])
    # Age: NaN to mean
    tit['Age'] = df['Age'].fillna(df['Age'].mean())
    # Fare: NaN to mean
    tit['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    # Cabin: one hot of first letter
    tit['Cabin'] = tit['Cabin'].map(lambda x: '0' if pd.isnull(x) else x[0])
    tit = one_hot(tit, 'Cabin', ['a', 'b', 'c', 'd', 'e', 'f', 'g', 't'])
    # Embarked: one hot
    tit = one_hot(tit, 'Embarked', ['c', 'q', 's'])
    
    # Add new features
    # Family size
    tit['FamilySize'] = df['SibSp'] + df['Parch']
    # Is adult
    tit['IsAdult'] = tit['Age'].map(lambda x: 1 if x > 18 else 0)
    # Is mother: female, not child, has children
    tit['IsMother'] = tit['Sex_female'] & tit['IsAdult'] & (tit['Parch'] > 0)
    # Missing Age
    tit['MisAge'] = pd.isnull(df.Age).astype(int)
    # Missing Cabin
    tit['MisCabin'] = pd.isnull(df.Cabin).astype(int)

    tit.drop(['PassengerId', 'Ticket', 'Name', 'SibSp', 'Parch'], axis=1, inplace=True)

    return(tit)

def one_hot(df, colname, vocabulary):
    cnt_vectorizer = CountVectorizer(vocabulary=vocabulary, token_pattern='(?u)\\b\\w+\\b')
    data = cnt_vectorizer.fit_transform(df.pop(colname).map(
        lambda x: str(x) if str(x) in vocabulary else 'other'))
    colnames = [colname + '_' + x for x in vocabulary]
    df = pd.concat([
        df.reset_index(drop=True),
        pd.DataFrame(data.toarray(), columns=colnames).reset_index(drop=True)],
        axis=1, join='inner')
    return df

df_train = feature_eng(df_train_src)
df_test = feature_eng(df_test_src)

In [4]:
y = df_train.pop('Survived').values
X = df_train.values

In [5]:
df_train.tail()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Cabin_a,Cabin_b,Cabin_c,...,Cabin_g,Cabin_t,Embarked_c,Embarked_q,Embarked_s,FamilySize,IsAdult,IsMother,MisAge,MisCabin
886,27.0,13.0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,False,0,1
887,19.0,30.0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,False,0,0
888,29.699118,23.45,0,0,1,1,0,0,0,0,...,0,0,0,0,0,3,1,True,1,1
889,26.0,30.0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,False,0,0
890,32.0,7.75,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,1,False,0,1


In [6]:
rf = RandomForestClassifier(n_estimators = 200)
gb = GradientBoostingClassifier(n_estimators = 200)
lr = LogisticRegression()
kf = KFold(n=X.shape[0], n_folds=10, shuffle=True, random_state=42)

for model in [lr, rf, gb]:
    scores = []
    for train_index, test_index in kf:

        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))

    print 'min: {}\navg: {}\nmax: {}\n'.format(np.array(scores).min(), np.array(scores).mean(), np.array(scores).max())

min: 0.719101123596
avg: 0.800162297129
max: 0.865168539326

min: 0.775280898876
avg: 0.822646691635
max: 0.898876404494

min: 0.76404494382
avg: 0.830524344569
max: 0.898876404494



Submission

In [7]:
submission = pd.DataFrame({
        'PassengerId': df_test_src['PassengerId'],
        'Survived': gb.predict(df_test.values)
    })
submission.to_csv('submissions/submission_20160813_1654_gb.csv', index=False)