In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Define functions for combining and dividing datasets

def concatenate_df(df1, df2):
    """Stack ``df1`` on top of ``df2`` and return the combined dataframe.

    Columns are matched by name and sorted when the two frames are not already
    aligned (``sort=True``); the original row labels are discarded in favour of
    a fresh 0..n-1 index.
    """
    frames = [df1, df2]
    return pd.concat(frames, sort=True, ignore_index=True)


def divide_titanic_df(all_data, n_train=891):
    """Divide concatenated titanic data back up into training and test sets.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Training rows followed by test rows, labelled 0..len-1 (the layout
        produced by ``concatenate_df``). Must contain a 'Survived' column.
    n_train : int, default 891
        Number of leading rows that belong to the training set. The default
        reproduces the previously hard-coded 890/891 boundary.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``train_data`` (rows labelled 0..n_train-1, all columns) and
        ``test_data`` (rows labelled n_train onwards, without 'Survived').

    Raises
    ------
    KeyError
        If ``all_data`` has no 'Survived' column.
    """
    # .loc slices are label-based and inclusive at both ends, hence the "- 1".
    # copy() makes the training frame independent of all_data, so later edits to it
    # neither warn about chained assignment nor leak back into the combined frame.
    train_data = all_data.loc[:n_train - 1].copy()
    test_data = all_data.loc[n_train:].drop(columns=['Survived'])
    return train_data, test_data

# Read the two competition CSVs from the Kaggle input directory

train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv")
)

# Stack train on top of test so later cleaning and feature steps run on both at once

all_data = concatenate_df(df1=train_data, df2=test_data)

# Data cleaning

See Version 12 for details of the following.

In [3]:
# Fill in missing ages

# SibSp seems to be as good a predictor of age as any (>= 2 indicates travelling with siblings, which likely means a child)
# transform('median') returns a Series aligned row-for-row with all_data, so the assignment does not
# depend on how groupby(...).apply labels its result (that labelling differs between pandas versions).
all_data['Age'] = all_data['Age'].fillna(all_data.groupby('SibSp')['Age'].transform('median'))
# Use Pclass and Sex as a backup for rows whose SibSp group contains no known age at all
all_data['Age'] = all_data['Age'].fillna(all_data.groupby(['Pclass', 'Sex'])['Age'].transform('median'))

# Fill in missing fare

# Median fare over rows with Pclass 3, SibSp 0 and Embarked 'S'
# (presumably the profile of the passenger whose fare is missing - confirm against the data)
mr_thomas_fare = all_data.loc[
    (all_data['Pclass'] == 3) & (all_data['SibSp'] == 0) & (all_data['Embarked'] == 'S'),
    'Fare',
].median()
all_data['Fare'] = all_data['Fare'].fillna(mr_thomas_fare)

# Fill in missing embarkeds

# Fill every missing value with 'S' instead of addressing rows by hard-coded label,
# so the step keeps working if the row order or index ever changes
all_data['Embarked'] = all_data['Embarked'].fillna('S')

# Keep first letter of cabin (indicating deck) only and insert 'M' for missing

all_data['Deck'] = all_data['Cabin'].str[0].fillna('M')

# Feature engineering

## Binning

See Version 12 for details of the following.

In [4]:
# Outliers can disrupt learning, so replace each continuous column with a binned version of itself

# Fare: five quantile bins (qcut), so each bin holds approximately the same number of cases
all_data['Fare'] = pd.qcut(x=all_data['Fare'], q=5)
# Age: five equal-width bins (cut) over the ages after casting them to int
all_data['Age'] = pd.cut(x=all_data['Age'].astype(int), bins=5)

## New feature creation

See Version 12 for details of the following.

In [5]:
# Combine SibSp and Parch to create new feature Family_Size (the + 1 presumably counts the passenger)
all_data['Family_Size'] = all_data['SibSp'] + all_data['Parch'] + 1
# Bin family size: sizes of four or higher are grouped into the single value 4.
# clip() is the vectorised form of the earlier per-element lambda, whose trailing
# `else 0` branch could never be reached for a numeric family size.
all_data['Family_Size_Bin'] = all_data['Family_Size'].clip(upper=4)

# Create Ticket_Freq feature: the number of rows in all_data that share the same Ticket value
all_data['Ticket_Freq'] = all_data.groupby('Ticket')['Ticket'].transform('count')

Create a new feature for Title (Mr., Mrs., etc.)

In [6]:
# Show the first 20 names to see where the title sits inside each one
print(all_data['Name'].head(20))

0                               Braund, Mr. Owen Harris
1     Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                Heikkinen, Miss. Laina
3          Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                              Allen, Mr. William Henry
5                                      Moran, Mr. James
6                               McCarthy, Mr. Timothy J
7                        Palsson, Master. Gosta Leonard
8     Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                   Nasser, Mrs. Nicholas (Adele Achem)
10                      Sandstrom, Miss. Marguerite Rut
11                             Bonnell, Miss. Elizabeth
12                       Saundercock, Mr. William Henry
13                          Andersson, Mr. Anders Johan
14                 Vestrom, Miss. Hulda Amanda Adolfina
15                     Hewlett, Mrs. (Mary D Kingcome) 
16                                 Rice, Master. Eugene
17                         Williams, Mr. Charles

In [7]:
def title(name):
    """Get the title from a passenger's name.

    The title is taken to be the first whitespace-separated token that ends
    with a period, e.g. 'Mr.' in 'Braund, Mr. Owen Harris'.

    Parameters
    ----------
    name : str
        Full passenger name.

    Returns
    -------
    str or None
        The title including its trailing period, or None if no token ends
        with a period or ``name`` is not a string (e.g. a missing value).
    """
    if not isinstance(name, str):
        # A missing name reaches us as NaN/None rather than text; report "no title"
        # instead of failing on .split().
        return None
    # next() stops at the first matching token instead of collecting all of them.
    return next((part for part in name.split() if part.endswith('.')), None)

In [8]:
# Derive a Title column from every Name, then list how often each title occurs
all_data['Title'] = all_data['Name'].map(title)
print(all_data['Title'].value_counts())

Mr.          757
Miss.        260
Mrs.         197
Master.       61
Rev.           8
Dr.            8
Col.           4
Mlle.          2
Major.         2
Ms.            2
Lady.          1
Sir.           1
Mme.           1
Don.           1
Capt.          1
Countess.      1
Jonkheer.      1
Dona.          1
Name: Title, dtype: int64


We appear to have successfully obtained a title for every passenger.

According to Wikipedia: *Jonkheer (female equivalent: jonkvrouw; French: Écuyer; English: Squire) is an honorific in the Low Countries denoting the lowest rank within the nobility.*

Let's group the less common ones into a single category 'misc'.

In [9]:
# Boolean Series indexed by title: True for titles that occur fewer than 10 times
rare = all_data['Title'].value_counts().lt(10)
# Look up each row's flag and overwrite the flagged titles with 'misc'
all_data['Title'] = all_data['Title'].mask(all_data['Title'].map(rare), 'misc')
print(all_data['Title'].value_counts())

Mr.        757
Miss.      260
Mrs.       197
Master.     61
misc        34
Name: Title, dtype: int64


In [10]:
# Split the engineered data back into the training rows and the test rows
train_data, test_data = divide_titanic_df(all_data)
y = train_data["Survived"]

features = ["Pclass", "Sex", "Family_Size_Bin", "Age", "Fare", "Deck", "Ticket_Freq", "Embarked", "Title"]

# Encode train and test together so both receive exactly the same dummy columns
all_dummies = pd.get_dummies(all_data[features])
# Pick the feature rows by the labels of the split frames rather than repeating the
# 890/891 boundary here, so X / X_test always line up with y and test_data
X = all_dummies.loc[train_data.index]
X_test = all_dummies.loc[test_data.index]
display(X_test)

Unnamed: 0,Pclass,Family_Size_Bin,Ticket_Freq,Sex_female,Sex_male,"Age_(-0.08, 16.0]","Age_(16.0, 32.0]","Age_(32.0, 48.0]","Age_(48.0, 64.0]","Age_(64.0, 80.0]",...,Deck_M,Deck_T,Embarked_C,Embarked_Q,Embarked_S,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_misc
891,3,1,1,0,1,0,0,1,0,0,...,1,0,0,1,0,0,0,1,0,0
892,3,2,1,1,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,1,0
893,2,1,1,0,1,0,0,0,1,0,...,1,0,0,1,0,0,0,1,0,0
894,3,1,1,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,1,0,0
895,3,3,2,1,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,1,1,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,1,0,0
1305,1,1,3,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
1306,3,1,1,0,1,0,0,1,0,0,...,1,0,0,0,1,0,0,1,0,0
1307,3,1,1,0,1,0,1,0,0,0,...,1,0,0,0,1,0,0,1,0,0


In [11]:
# Fit a 100-tree random forest (max depth 5, fixed seed) on the training features; fit() returns the estimator itself
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1).fit(X, y)
predictions = model.predict(X_test)

# Build the submission table, casting 'Survived' to int, and write it without the index column
output = pd.DataFrame(
    {'PassengerId': test_data['PassengerId'], 'Survived': predictions}
).astype({'Survived': int})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
