In [35]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.base import TransformerMixin
from sklearn_pandas import DataFrameMapper
import seaborn as sns
import matplotlib
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score

In [36]:
# PassengerId is only a row identifier, so it is removed right after loading.
train = pd.read_csv('train.csv').drop(columns='PassengerId')
test = pd.read_csv('test.csv').drop(columns='PassengerId')
# Feature-only view of both files stacked together (the target is excluded);
# the original row labels of each file are kept as-is.
alldata = pd.concat([train.drop(columns='Survived'), test])

# Exploring Data

In [37]:
# Peek at the first five training rows.
train.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [38]:
# Column dtypes and non-null counts for the combined train+test features;
# used below to see which columns have missing values.
alldata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB


**PassengerId** - irrelevant in terms of features (already dropped)<br>
**Survived** - "1" if survived, "0" otherwise <br>
**Pclass** - one of three classes (1,2,3) <br>
**Name** - name of a passenger including their titles<br>
**Sex** - male or female<br>
**Age** - age in years <br>
**SibSp** - number of the passenger's siblings and spouses travelling with them<br>
**Parch** - number of passenger's parents and children travelling with them<br>
**Ticket** - ticket number<br>
**Fare** - fare<br>
**Cabin** - letters encode deck number and numbers encode cabin number <br>

**Missing Data:** <br>
'Age': 263 missing samples<br>
'Cabin': 1014 missing samples (almost 80% of data missing!)<br>
'Embarked': only 2 missing samples<br>
'Fare': only 1 missing sample

**Intuition**<br>
Sex, Pclass, Fare and Age may play an important role in classification.<br>
Cabin, and especially the deck encoded in it, may be a very important factor. There are plenty of samples missing, though.<br>
The title (including noble titles), which can be extracted from Name, may be a good feature.


In [39]:
# Spearman rank correlation between the numeric columns.
# The numeric columns are selected explicitly: since pandas 2.0,
# DataFrame.corr no longer drops non-numeric columns (Name, Sex, Ticket, ...)
# on its own and raises on them instead. The resulting table is the same on
# older pandas versions.
train.select_dtypes(include=[np.number]).corr(method='spearman')

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
Survived,1.0,-0.339668,-0.052565,0.088879,0.138266,0.323736
Pclass,-0.339668,1.0,-0.361666,-0.043019,-0.022801,-0.688032
Age,-0.052565,-0.361666,1.0,-0.182061,-0.254212,0.135051
SibSp,0.088879,-0.043019,-0.182061,1.0,0.450014,0.447113
Parch,0.138266,-0.022801,-0.254212,0.450014,1.0,0.410074
Fare,0.323736,-0.688032,0.135051,0.447113,0.410074,1.0


## Title Encoding 

Name is present for every sample, so we can use it to extract each passenger's title (including the noble ones).

In [40]:
# Distinct titles in train+test: the text between ", " and the first "."
# of each name, e.g. "Braund, Mr. Owen Harris" -> "Mr".
{full_name.split(', ')[1].split('.')[0] for full_name in alldata['Name']}

{'Capt',
 'Col',
 'Don',
 'Dona',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}

In [41]:
class TitleEncoder(TransformerMixin):
    """Replace each passenger name with the title it contains.

    Names are expected in the form ``"Surname, Title. Given names"``, e.g.
    ``"Braund, Mr. Owen Harris"`` becomes ``"Mr"``.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    never modifies its input.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (there is nothing to learn)."""
        return self

    def transform(self, X):
        """Return the titles extracted from the names in ``X``.

        Parameters
        ----------
        X : pandas.Series or array-like of str
            Passenger names.

        Returns
        -------
        pandas.Series or numpy.ndarray
            A Series with the same index and name as ``X`` when ``X`` is a
            Series; otherwise an object array with the same shape as ``X``.

        Raises
        ------
        IndexError
            If a name contains no comma.
        AttributeError
            If an entry is not a string (e.g. a missing value).
        """
        if isinstance(X, pd.Series):
            # Series.map works element by element regardless of the index
            # labels, so non-default or duplicated indexes (e.g. after
            # pd.concat or a train/test split) are handled correctly.
            return X.map(self._extract_title)
        names = np.asarray(X, dtype=object)
        titles = [self._extract_title(name) for name in names.ravel()]
        return np.array(titles, dtype=object).reshape(names.shape)

    @staticmethod
    def _extract_title(name):
        """Return the text between the first comma and the following period."""
        # strip() removes the space that follows the comma, so the result is
        # "Mr" rather than " Mr".
        return name.split(',')[1].split('.')[0].strip()

## Ticket Encoding

In [42]:
# Number of distinct ticket values in the training set
# (dropna=False counts a missing value as its own category, like unique()).
train['Ticket'].nunique(dropna=False)

681

In [43]:
# Column-wise preprocessing. Categorical columns are binarized; columns mapped
# to ``None`` are passed through unchanged. The target ('Survived') is
# deliberately not part of the mapper.
mapper = DataFrameMapper(
    features=[
        ('Pclass', sklearn.preprocessing.LabelBinarizer()),
        ('Sex', sklearn.preprocessing.LabelBinarizer()),
        # Name -> title -> binarized title; the alias is used for the output
        # column naming instead of 'Name'.
        (
            'Name',
            [TitleEncoder(), sklearn.preprocessing.LabelBinarizer()],
            {'alias': 'title'},
        ),
        ('Fare', None),
        ('SibSp', None),
        ('Parch', None),
    ],
    input_df=True,
    df_out=True,
)

In [44]:
# Fit the column transformers on the training frame and encode it in one step.
data = mapper.fit_transform(X=train, y=train['Survived'])

In [45]:
# Plain float arrays for scikit-learn: encoded features and the target.
X = data.values.astype(float)
y = train['Survived'].values.astype(float)

In [46]:
# Baseline classifier with library-default hyperparameters; random_state is
# pinned so that any randomness inside the solver is repeatable between runs.
clf_logistic = LogisticRegression(random_state=42)

In [51]:
# 5-fold cross-validated score of the baseline model; the report shows the
# mean and a +/- two-standard-deviation spread across the folds.
scores = cross_val_score(estimator=clf_logistic, X=X, y=y, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), 2 * scores.std()))

Accuracy: 0.82 (+/- 0.05)
