In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df = pd.concat([df, df_test])
df = df.reset_index(drop=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [4]:
for c in df.columns:
    if df[c].hasnans:
        print 'column = {} has nans'.format(c)

column = Age has nans
column = Cabin has nans
column = Embarked has nans
column = Fare has nans
column = Survived has nans


In [5]:
names = df['Name']
titles = ['Mr.', 'Miss.', 'Mrs.', 'Master.', 'Dr.', 'Don.', 'Rev.', 'Major.', 'Col.', 'Capt.', 'Ms.', 'Lady.', 'Sir.', 'Mme.', 'Mlle.']
for (idx, n) in enumerate(names):
    match = False
    for t in titles:
        if n.find(t) != -1:
            match = True
            break
    if not match:
        print idx, n
        
title_mapping = {'Mme.': 'Mrs.',
           'Mlle.': 'Miss.',
           'Sir.': 'Mr.',
           'Lady.': 'Miss.',
           'Ms.': 'Miss.'}
        
def norm_title(r):
    name = r['Name']
    for t in titles:
        if name.find(t) != -1:
            if t in title_mapping: return title_mapping[t]
            else: return t
    sex = r['Sex']
    if sex == 'female': return 'Mrs.'
    else: return 'Mr.'

759 Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)
822 Reuchlin, Jonkheer. John George
1305 Oliva y Ocana, Dona. Fermina


In [6]:
df['Title'] = df[['Name', 'Sex']].apply(norm_title, axis = 1)

In [7]:
age_data = df[['Age', 'Title']]
x = age_data['Age']
y = age_data[~x.isnull()]
z = y.groupby('Title').median()
age_mapping = {}
for r in z.iterrows():
    t = r[0]
    v = r[1][0]
    age_mapping[t] = v
age_mapping

{'Capt.': 70.0,
 'Col.': 54.5,
 'Don.': 40.0,
 'Dr.': 49.0,
 'Major.': 48.5,
 'Master.': 4.0,
 'Miss.': 22.0,
 'Mr.': 29.0,
 'Mrs.': 35.0,
 'Rev.': 41.5}

In [8]:
ages = df[df['Age'].isnull()].apply(lambda x: age_mapping[x['Title']], axis = 1)
df.iloc[ages.index, df.columns.get_loc('Age')] = ages

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 13 columns):
Age            1309 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
Title          1309 non-null object
dtypes: float64(3), int64(4), object(6)
memory usage: 133.0+ KB


In [10]:
df['Embarked'].fillna('S', inplace = True)
df['Fare'].fillna(method = 'bfill', inplace = True)
df['Relative'] = df['Parch'] + df['SibSp']

In [11]:
emb = pd.get_dummies(df['Embarked'], prefix = 'emb')
pc = pd.get_dummies(df['Pclass'], prefix='pc')
sex = pd.get_dummies(df['Sex'], prefix = 'sex')
title = pd.get_dummies(df['Title'], prefix = 'tt')

In [12]:
df2 = pd.concat([df, emb, pc, sex, title], axis = 1)
df_train = df2[~df2['Survived'].isnull()]
df_test = df2[df['Survived'].isnull()]

In [13]:
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [14]:
X, y = (df_train.drop(['Cabin', 'SibSp', 'Parch', 'Survived', 'PassengerId', 'Name', 'Ticket', 'Title', 'Sex', 'Embarked', 'Pclass'], axis = 1), df_train['Survived'])
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [16]:
clf = LogisticRegression()
print cross_val_score(clf, X, y, cv = 5)
clf.fit(X, y)

[ 0.83240223  0.82122905  0.80898876  0.80898876  0.8700565 ]


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [17]:
clf2 = RandomForestClassifier(n_estimators=200)
print cross_val_score(clf, X, y, cv = 5)
clf2.fit(X, y)

[ 0.83240223  0.82122905  0.80898876  0.80898876  0.8700565 ]


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [18]:
X_test = df_test.drop(['Cabin', 'SibSp', 'Parch', 'Survived', 'PassengerId', 'Name', 'Ticket', 'Title', 'Sex', 'Embarked', 'Pclass'], axis = 1)
y_test= np.round((clf.predict(X_test) + clf2.predict(X_test)) * 0.5)

In [19]:
df_output = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test.astype(int)})
df_output.to_csv('submission.csv', index=False)