In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.base import TransformerMixin
from sklearn_pandas import DataFrameMapper, cross_val_score
import seaborn as sns
import matplotlib
from sklearn import linear_model

In [None]:
# PassengerId is only a row identifier, so it is dropped right after loading.
train = pd.read_csv('train.csv').drop('PassengerId', axis=1)
test = pd.read_csv('test.csv').drop('PassengerId', axis=1)
# Features of all passengers: train without the label, followed by test.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of test would repeat those of train, making label-based
# lookups on alldata ambiguous.
alldata = pd.concat([train.drop('Survived', axis=1), test], ignore_index=True)

# Exploring Data

In [None]:
# Preview the first rows of the training set (label column included).
train.head()

In [None]:
# Summary of the combined train + test features: dtypes and non-null counts per column.
alldata.info()

**PassengerId** - irrelevant in terms of features (already dropped)<br>
**Survived** - "1" if survived, "0" otherwise <br>
**Pclass** - one of three classes (1,2,3) <br>
**Name** - name of a passenger including their titles<br>
**Sex** - male or female<br>
**Age** - age in years <br>
**SibSp** - number of passenger's siblings and spouses travelling with them<br>
**Parch** - number of passenger's parents and children travelling with them<br>
**Ticket** - ticket number<br>
**Fare** - fare<br>
**Cabin** - the letter encodes the deck and the number encodes the cabin number <br>

**Missing Data:** <br>
'Age': 263 missing samples<br>
'Cabin': 1014 missing samples (almost 80% of data missing!)<br>
'Embarked': only 2 missing samples

**Intuition**<br>
Sex, Pclass, Fare and Age may play an important role in classification.<br>
Cabin, and even more so the deck encoded in it, may be a very important factor. Plenty of its samples are missing, though.<br>
The noble title, which can be extracted from Name, may be a good feature.


In [None]:
# Spearman rank correlation between the numeric columns of the training set.
# The text columns (Name, Sex, Ticket, ...) are filtered out explicitly:
# recent pandas versions no longer drop them silently and raise on
# non-numeric data, while select_dtypes works on old and new versions alike.
train.select_dtypes(include='number').corr(method='spearman')

## Title Encoding 

Name is defined for every sample, so we can use it to extract all the noble titles.

In [None]:
# Distinct titles across train and test: the text between ', ' and the next '.'.
{full_name.split(', ')[1].split('.')[0] for full_name in alldata['Name']}

In [None]:
class TitleEncoder(TransformerMixin):
    """Stateless transformer that reduces passenger names to their title.

    Names are expected in the form ``"Surname, Title. Given names"``; the
    text between the first comma and the following period is kept, e.g.
    ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    @staticmethod
    def _title_of(name):
        """Return the title part of a single name, without surrounding blanks."""
        # Splitting on ',' alone keeps the blank that follows the comma
        # (" Mr"); strip it so the labels are clean.
        return name.split(',')[1].split('.')[0].strip()

    def transform(self, X):
        """Map every name in ``X`` to its title.

        Parameters
        ----------
        X : pandas.Series, numpy.ndarray or other 1-D iterable of str
            Names to convert.

        Returns
        -------
        A container of the same kind as ``X`` holding the titles: a Series
        keeps its index and name, an ndarray keeps its dtype, and any other
        iterable yields a list.

        Raises
        ------
        IndexError
            If a name contains no comma.
        """
        # Iterate over the values rather than looking up X[i]: on a Series,
        # X[i] is a label lookup and fails (or hits the wrong rows) as soon
        # as the index is not exactly 0..n-1.
        titles = [self._title_of(name) for name in X]
        if isinstance(X, pd.Series):
            return pd.Series(titles, index=X.index, name=X.name, dtype=object)
        if isinstance(X, np.ndarray):
            return np.array(titles, dtype=X.dtype)
        return titles

## Ticket Encoding

In [None]:
# Number of distinct ticket values in the training set.
train['Ticket'].unique().size

In [None]:
# Model inputs: each column is paired with the transformer(s) applied to it;
# None means the column is passed through as it is.
feature_columns = [
    ('Pclass', sklearn.preprocessing.LabelBinarizer()),
    ('Sex', sklearn.preprocessing.LabelBinarizer()),
    # Name -> title -> binarized title; the alias names the output columns.
    ('Name', [TitleEncoder(), sklearn.preprocessing.LabelBinarizer()], {'alias': 'title'}),
    ('Fare', None),
    ('SibSp', None),
    ('Parch', None),
]
# The label travels through the mapper untouched so it can be split off afterwards.
target_columns = [('Survived', None)]

mapper = DataFrameMapper(
    feature_columns + target_columns,
    input_df=True,
    df_out=True,
)

In [None]:
# Fit the mapper on a copy of the training frame and build the mapped table.
# The result still contains the Survived column, which the mapper passes through.
data = mapper.fit_transform(train.copy())

In [None]:
# Logistic-regression classifier; C is the inverse of the regularization strength.
clf = linear_model.LogisticRegression(C=1)

In [None]:
# Feature matrix: every mapped column except the label.
X_train = data.drop('Survived', axis='columns')

In [None]:
# Target vector: the Survived column of the mapped table.
Y_train = data['Survived']

In [None]:
# 10-fold cross-validation of the classifier on the mapped features;
# the cell displays one score per fold.
cross_val_score(clf, X_train, Y_train, cv=10)