In [74]:
import pandas as pd
import numpy as np

Load and visualize the raw data

In [75]:
# Read the training set from disk and preview its first five rows.
raw_train_data = pd.read_csv(filepath_or_buffer='train.csv')
raw_train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


The Cabin column seems to have a lot of null values. Let's see how many there are.

In [76]:
# Number of missing Cabin entries, followed by the total number of rows.
print(raw_train_data['Cabin'].isna().sum())
print(raw_train_data.shape[0])

687
891


Most rows are missing Cabin data. We may as well drop that column.

In [77]:
# Work on a copy of the raw frame without the mostly-empty Cabin column.
data = raw_train_data.drop('Cabin', axis=1)

Let's see what other columns have null values:

In [78]:
# Preview the rows that still contain at least one missing value.
data.loc[data.isna().any(axis='columns')].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,Q


Age seems to be missing for some rows. Let's see how many:

In [79]:
# Count the rows whose Age is missing.
data['Age'].isna().sum()

177

That's a lot, but not enough to warrant dropping the column entirely. Let's just fill the missing values with the mean.

In [80]:
# Keep known ages as they are; replace the missing ones with the column mean.
data['Age'] = data['Age'].where(data['Age'].notna(), data['Age'].mean())

Let's see if there are still any null values remaining:

In [81]:
# Rows with a non-zero count of missing values after the Age imputation.
data[data.isna().sum(axis=1) > 0].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,


Just a couple of rows are missing a value in the Embarked column. Let's just use the most common value for that column.

In [82]:
# Passenger count per port of embarkation (rows with a missing port are not counted).
data.groupby(by='Embarked').size()

Embarked
C    168
Q     77
S    644
dtype: int64

'S' (Southampton) was the most common port of embarkation, so let's use that.

In [83]:
# Keep known ports as they are; fill the missing ones with 'S'.
data['Embarked'] = data['Embarked'].where(data['Embarked'].notna(), 'S')

Let's put all of our preprocessing in one function so that we can use it later for the test data.

We can also try generating some extra features since we have relatively few.

In [104]:
def preprocess_data(data):
    """Turn a raw passenger frame into a normalized, all-numeric feature frame.

    The input frame is left untouched; all work happens on a copy.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'PassengerId', 'Cabin', 'Name', 'Ticket',
        'Sex', 'Age', 'SibSp', 'Parch', 'Fare' and 'Embarked'. A 'Survived'
        column is optional and is removed when present.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns Pclass, Sex, Age, SibSp, Parch, Fare,
        Embarked, Family, OldMaleScore and OldFemaleScore (plus any other
        columns the input carried), each standardized to zero mean and unit
        standard deviation.

    Notes
    -----
    The imputation means and the normalization mean/std are computed from the
    frame that is passed in, so two different frames (e.g. train and test) are
    each scaled by their own statistics.
    """
    # Work on a copy: the column assignments below would otherwise write the
    # imputed values back into the caller's raw frame.
    data = data.copy()
    # fill null entries
    data['Age'] = data['Age'].fillna(data['Age'].mean())
    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    data['Embarked'] = data['Embarked'].fillna('S')
    # drop columns which we don't use
    data = data.drop(columns=['PassengerId', 'Cabin', 'Name', 'Ticket'])
    if 'Survived' in data.columns:
        data = data.drop(columns=['Survived'])
    # map the columns with string values to numeric values
    # (values missing from a mapping become NaN)
    data['Sex'] = data['Sex'].map({'female': 1, 'male': 2})
    data['Embarked'] = data['Embarked'].map({'Q': 1, 'C': 2, 'S': 3})
    # create some new features
    # family size on board, counting the passenger themselves
    data['Family'] = data['SibSp'] + data['Parch'] + 1
    # this feature will be high if a passenger is old and male (Sex == 2 doubles the age)
    data['OldMaleScore'] = data['Sex'] * data['Age']
    # this feature will be high if a passenger is old and female (Sex == 1 keeps the full age)
    data['OldFemaleScore'] = data['Age'] / data['Sex']
    # normalization: per-column z-score
    data = (data - data.mean()) / data.std()
    return data

In [99]:
# Build the normalized training features and preview the first five rows.
data = raw_train_data.pipe(preprocess_data)
data.head(n=5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family,OldMaleScore,OldFemaleScore
0,0.826913,0.737281,-0.592148,0.43255,-0.473408,-0.502163,0.568518,0.059127,-0.200208,-0.791872
1,-1.565228,-1.354813,0.63843,0.43255,-0.473408,0.786404,-1.004617,0.059127,-0.420448,1.631806
2,0.826913,-1.354813,-0.284503,-0.474279,-0.473408,-0.48858,0.568518,-0.56066,-0.860927,0.554616
3,-1.565228,-1.354813,0.407697,0.43255,-0.473408,0.420494,0.568518,0.059127,-0.530567,1.362509
4,0.826913,0.737281,0.407697,-0.474279,-0.473408,-0.486064,0.568518,-0.56066,0.754162,-0.208394


Now let's train some different models with various C values (C is the inverse regularization strength: a smaller C means stronger regularization) and compare their cross-validated performance.

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import ShuffleSplit, cross_val_score
from functools import partial

# Feature matrix and labels as plain NumPy arrays for scikit-learn.
X = data.values
Y = raw_train_data['Survived'].values
# 20 random 80/20 splits. The fixed seed makes the splitter yield the same
# splits on every call, so all models and C values are scored on identical
# data and a re-run reproduces the numbers.
cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)

def avg_score(model):
    """Return the mean cross-validation score of `model`.

    The model is scored on the module-level `X` and `Y` using the shared
    `cv` splitter.
    """
    fold_scores = cross_val_score(model, X, Y, cv=cv)
    return fold_scores.mean()

def get_best_c(partial_model, c_values=None):
    """Grid-search the `C` hyperparameter of a model family.

    Parameters
    ----------
    partial_model : callable
        Called as ``partial_model(C=c)`` for every candidate; must return a
        model that `avg_score` can evaluate.
    c_values : iterable of float, optional
        Candidate values for `C`, tried in order. Defaults to a roughly
        logarithmic grid from 0.01 to 30.

    Returns
    -------
    tuple
        ``(best_score, best_c)``. On ties the earliest candidate wins. If no
        candidate scores above 0 the result is ``(0, 0)``.
    """
    if c_values is None:
        c_values = [0.01, 0.03, 0.1, 0.3, 1., 3., 10., 30.]
    best_c = 0
    best_score = 0
    for c in c_values:
        score = avg_score(partial_model(C=c))
        # strict '>' keeps the first candidate when several tie for the best score
        if score > best_score:
            best_score = score
            best_c = c
    return best_score, best_c


# Logistic regression and SVM: report (best score, best C) over the C grid.
print(get_best_c(partial(LogisticRegression, solver='liblinear')))
print(get_best_c(partial(SVC, gamma='auto')))
# Random forest has no C parameter, so it is scored once. It is seeded so that
# its score does not change between runs.
print(avg_score(RandomForestClassifier(n_estimators=100, max_depth=8, random_state=0)))

(0.8178770949720672, 3.0)
(0.8385474860335196, 1.0)
0.8254189944134079


They all perform about the same, but the SVM performed the best, so we'll use that.

Now we just have to save our predictions to a CSV file.

In [108]:
# Refit the selected SVM on the full training set.
model = SVC(gamma='auto', C=1.0)
model.fit(X, Y)
raw_test_data = pd.read_csv('test.csv')
# NOTE: preprocess_data imputes and normalizes with the statistics of the frame
# it receives, so the test set is scaled by its own mean/std here.
test_data = preprocess_data(raw_test_data)
# Predict from the bare array: the model was fitted on a NumPy array (X), so
# passing the DataFrame would trigger scikit-learn's feature-name mismatch warning.
predictions = pd.Series(model.predict(test_data.values), name='Survived')
# Pair each prediction with its passenger id and write the predictions file.
output = pd.concat([raw_test_data['PassengerId'], predictions], axis=1)
output.to_csv('predictions.csv', index=False)