## Titanic: Machine Learning from Disaster

- about competition: https://www.kaggle.com/c/titanic
- data: https://www.kaggle.com/c/titanic/data
- leaderboard: https://www.kaggle.com/c/titanic/leaderboard
- my best submission: https://www.kaggle.com/c/titanic/leaderboard?submissionId=3350542 76.555% (Voting Ensemble)

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import f1_score

Get Data

In [2]:
# Load the Kaggle train and test CSVs; the *_src frames are kept as the raw inputs.
df_train_src = pd.read_csv('data/train.csv')
df_test_src = pd.read_csv('data/test.csv')

# info() writes its summary to stdout itself and returns None, so it is called
# directly: wrapping it in print only appends a stray "None" to the output.
df_train_src.info()
df_train_src.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Column dtypes and non-null counts of the test set, to spot missing values.
df_test_src.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


Cleaning and feature engineering

In [4]:
# Lookup from the honorific embedded in a passenger's Name to a coarser title
# group; feature_eng() one-hot encodes the resulting group.
titles = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir": "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess": "Royalty",
    "Dona": "Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr": "Mr",
    "Mrs": "Mrs",
    "Miss": "Miss",
    "Master": "Master",
    "Lady": "Royalty",
}

def feature_eng(df):
    """Turn a raw Titanic passenger frame into an all-numeric feature frame.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data with the columns PassengerId, Pclass, Name, Sex,
        Age, SibSp, Parch, Ticket, Fare, Cabin and Embarked. Any other column
        (e.g. Survived in the training set) is passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in the same order, with a fresh 0..n-1 index.
        Pclass, Sex, Cabin (first letter), Embarked and Title are replaced by
        0/1 indicator columns; Age and Fare have missing values filled with
        the mean of *this* frame (so separate frames are imputed
        independently); FamilySize, IsAdult, IsMother, MisAge and MisCabin
        are added; PassengerId, Ticket, Name, SibSp and Parch are dropped.
    """
    # one_hot() hands back a frame with a fresh 0..n-1 index. Every column
    # assigned below must therefore come from a frame sharing that index,
    # otherwise pandas aligns on labels and rows of a filtered or shuffled
    # input would be silently mismatched or turned into NaN.
    src = df.reset_index(drop=True)
    tit = src.copy(deep=True)

    # Existing features
    # Pclass: one hot
    tit = one_hot(tit, 'Pclass', ['1', '2', '3'])
    # Sex: one hot
    tit = one_hot(tit, 'Sex', ['female', 'male'])
    # Age: NaN to mean
    tit['Age'] = src['Age'].fillna(src['Age'].mean())
    # Fare: NaN to mean - only one missing in test
    tit['Fare'] = src['Fare'].fillna(src['Fare'].mean())
    # Cabin: one hot of first letter ('0' marks a missing cabin and matches
    # no vocabulary entry, so such rows get all-zero Cabin_* columns)
    tit['Cabin'] = tit['Cabin'].map(lambda x: '0' if pd.isnull(x) else x[0])
    tit = one_hot(tit, 'Cabin', ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T'])
    # Embarked: one hot
    tit = one_hot(tit, 'Embarked', ['C', 'Q', 'S'])

    # Add new features
    # Title: the text between the comma and the first period of Name, mapped
    # to a title group. Honorifics missing from `titles` become 'other', which
    # one_hot() encodes as all zeros instead of aborting with a KeyError.
    raw_title = tit['Name'].map(lambda name: name.split(',')[1].split('.')[0].strip())
    tit['Title'] = raw_title.map(lambda title: titles.get(title, 'other'))
    # A sorted list (not a bare set) keeps the Title_* column names in a
    # deterministic order that matches the encoded data columns.
    tit = one_hot(tit, 'Title', sorted(set(titles.values())))
    # Family size: relatives aboard, not counting the passenger
    tit['FamilySize'] = src['SibSp'] + src['Parch']
    # Is adult (a missing Age has already been replaced by the mean here)
    tit['IsAdult'] = (tit['Age'] > 18).astype(int)
    # Is mother: female, not child, has parents/children aboard.
    # Stored as 0/1 like every other flag rather than as a bool column.
    tit['IsMother'] = (
        (tit['Sex_female'] == 1) & (tit['IsAdult'] == 1) & (tit['Parch'] > 0)
    ).astype(int)
    # Missing Age
    tit['MisAge'] = pd.isnull(src['Age']).astype(int)
    # Missing Cabin
    tit['MisCabin'] = pd.isnull(src['Cabin']).astype(int)

    tit = tit.drop(['PassengerId', 'Ticket', 'Name', 'SibSp', 'Parch'], axis=1)

    return tit

def one_hot(df, colname, vocabulary):
    """Replace ``df[colname]`` with one 0/1 indicator column per vocabulary entry.

    Values are compared by their ``str()`` form, so an integer 3 matches the
    vocabulary entry '3'. A value that matches no entry (including a missing
    value) gets 0 in every indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``colname``. It is not modified.
    colname : str
        Column to encode; it is absent from the returned frame.
    vocabulary : iterable of str
        Categories to encode. Indicator columns are named
        ``colname + '_' + category`` and follow the iteration order of
        ``vocabulary``; a set is sorted first so the order is deterministic.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df`` followed by the int64 indicator
        columns, with a fresh 0..n-1 index.
    """
    # Freeze the category order once, so the column names and the column data
    # are built from the same sequence and can never disagree.
    if isinstance(vocabulary, (set, frozenset)):
        categories = sorted(vocabulary)
    else:
        categories = list(vocabulary)

    remainder = df.drop(colname, axis=1).reset_index(drop=True)
    labels = df[colname].reset_index(drop=True).map(str)

    indicator_names = [colname + '_' + category for category in categories]
    indicators = pd.DataFrame(
        dict((name, (labels == category).astype('int64'))
             for name, category in zip(indicator_names, categories)),
        index=remainder.index, columns=indicator_names)

    return pd.concat([remainder, indicators], axis=1)

df_train = feature_eng(df_train_src)
df_test = feature_eng(df_test_src)

# The models are fed bare .values arrays, so the train and test features must
# line up column-for-column; fail here rather than predict on shifted columns.
assert list(df_train.drop('Survived', axis=1).columns) == list(df_test.columns), \
    'train/test feature columns differ'

In [5]:
# Split the label from the features. 'Survived' is removed from df_train by
# rebinding behind a guard instead of an in-place pop(), so this cell can be
# re-run without raising KeyError once the column is already gone.
if 'Survived' in df_train.columns:
    y = df_train['Survived'].values
    df_train = df_train.drop('Survived', axis=1)
X = df_train.values

In [6]:
# Transposed peek at the first three engineered rows (one feature per line).
df_train.head(3).T

Unnamed: 0,0,1,2
Age,22,38,26
Fare,7.25,71.2833,7.925
Pclass_1,0,1,0
Pclass_2,0,0,0
Pclass_3,1,0,1
Sex_female,0,1,1
Sex_male,1,0,0
Cabin_A,0,0,0
Cabin_B,0,0,0
Cabin_C,0,1,0


Try a few different models

In [7]:
# Candidate classifiers; the tree ensembles are seeded for repeatable results.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
gb = GradientBoostingClassifier(n_estimators=200, random_state=42)
ada = AdaBoostClassifier(n_estimators=300, random_state=42)
knn = KNeighborsClassifier(n_neighbors=8)
svm = SVC(probability=True)
lr = LogisticRegression()

estimators = [
    ('Logistic Regression', lr),
    ('KNN', knn),
    ('SVC', svm),
    ('Random Forest', rf),
    ('Ada boost', ada),
    ('Gradient boost', gb),
]

for name, model in estimators:
    # The fold count is pinned: scikit-learn's default moved from 3 to 5 folds
    # in release 0.22, so leaving it implicit makes the reported accuracies
    # depend on the installed version.
    result = cross_val_score(model, X, y, scoring='accuracy', cv=3)
    print("{0:<20} ({1:.4f}) +/- ({2:.4f})".format(name, result.mean(), result.std()))

Logistic Regression  (0.8227) +/- (0.0141)
KNN                  (0.7026) +/- (0.0278)
SVC                  (0.7340) +/- (0.0372)
Random Forest        (0.7969) +/- (0.0193)
Ada boost            (0.8036) +/- (0.0348)
Gradient boost       (0.8294) +/- (0.0203)


In [8]:
# In-sample check: every model is refit on the full training set and scored on
# those same rows, so these F1 values are training scores, not estimates of
# performance on unseen passengers.
for name, model in estimators:
    y_pred = model.fit(X, y).predict(X)
    print('{} score: {:.4f}'.format(name, f1_score(y, y_pred)))

Logistic Regression score: 0.7862
KNN score: 0.6830
SVC score: 0.8164
Random Forest score: 0.9838
Ada boost score: 0.8276
Gradient boost score: 0.9080


Build a voting ensemble out of the best-performing models

In [9]:
# Soft-voting ensemble over the stronger models. KNN and SVC are left out:
# they had the lowest cross-validated accuracy in the comparison above.
voters = [
    ('Logistic Regression', lr),
    ('Random Forest', rf),
    ('Ada boost', ada),
    ('Gradient boost', gb),
]

voting_ensemble = VotingClassifier(voters, voting='soft')

# Metric and fold count are spelled out so the figure is directly comparable
# with the per-model accuracies and does not depend on scikit-learn's
# version-specific default number of folds.
results = cross_val_score(voting_ensemble, X, y, scoring='accuracy', cv=3)
# Fixed four decimals for both numbers ('.4' alone means significant digits).
print("({0:.4f}) +/- ({1:.4f})".format(results.mean(), results.std()))

(0.8328) +/- (0.0156)


Submission

In [10]:
# Refit the ensemble on every training row, predict the Kaggle test set and
# write the two-column submission file.
voting_ensemble.fit(X, y)
submission = pd.DataFrame(
    {
        'PassengerId': df_test_src['PassengerId'],
        'Survived': voting_ensemble.predict(df_test.values),
    },
    columns=['PassengerId', 'Survived'],
)
submission.to_csv('submissions/submission_20160819_1625_vote.csv', index=False)