In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
def concatenate_df(df1, df2):
    """Stack ``df2`` underneath ``df1`` and renumber the rows.

    ``sort=True`` is passed to ``pd.concat`` so the column axis is sorted
    when the two frames' columns are not aligned. The original row labels
    are discarded in favour of a fresh 0..n-1 index.
    """
    stacked = pd.concat([df1, df2], sort=True)
    return stacked.reset_index(drop=True)

# Load data: read the train and test CSV files from the Kaggle input directory

train_data, test_data = map(
    pd.read_csv,
    ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv"),
)

# Combine data: one frame (train rows first, then test rows) so that cleaning
# and feature engineering are applied to both splits at once

all_data = concatenate_df(train_data, test_data)

# Data cleaning

See Version 12 for details of the following.

In [3]:
# Fill in missing ages

# SibSp seems to be as good a predictor of age as any (>= 2 indicates travelling with siblings, which likely means a child).
# transform('median') returns a Series aligned with all_data's own index, so the
# assignment does not depend on how groupby().apply() labels its result
# (pandas >= 2.0 prepends the group keys, which breaks column assignment).
all_data['Age'] = all_data['Age'].fillna(all_data.groupby('SibSp')['Age'].transform('median'))
# Use Pclass and Sex as a backup for any ages that are still missing
all_data['Age'] = all_data['Age'].fillna(all_data.groupby(['Pclass', 'Sex'])['Age'].transform('median'))

# Fill in missing fare

# Median fare of third-class passengers with SibSp == 0 who embarked at 'S'
is_reference_group = (all_data['Pclass'] == 3) & (all_data['SibSp'] == 0) & (all_data['Embarked'] == 'S')
mr_thomas_fare = all_data.loc[is_reference_group, 'Fare'].median()
all_data['Fare'] = all_data['Fare'].fillna(mr_thomas_fare)

# Fill in missing embarkeds

# Fill by missingness rather than by hard-coded row labels (previously 61 and 829),
# so valid values can never be overwritten if the row order changes
all_data['Embarked'] = all_data['Embarked'].fillna('S')

# Keep first letter of cabin (indicating deck) only and insert 'M' for missing

all_data['Deck'] = all_data['Cabin'].str[0].fillna('M')

# Feature engineering

## Binning

See Version 12 for details of the following.

In [4]:
# Outliers can disrupt learning, so replace the raw Fare and Age values with bins

# Fare: quantile-based bins (qcut), so each of the five bins holds
# approximately the same number of cases
fare_bins = pd.qcut(all_data['Fare'], q=5)
# Age: five equal-width bins (cut) over the ages cast to int
age_bins = pd.cut(all_data['Age'].astype(int), bins=5)

all_data['Fare'] = fare_bins
all_data['Age'] = age_bins

## New feature creation

See Version 12 for details of the following.

In [5]:
# Combine SibSp and Parch to create new feature Family_Size
# (the +1 is presumably the passenger themselves)
all_data['Family_Size'] = all_data['SibSp'] + all_data['Parch'] + 1
# Bin family size: sizes up to 3 are kept as they are, sizes of four or higher
# are grouped into the single value 4
all_data['Family_Size_Bin'] = all_data['Family_Size'].clip(upper=4)

# Create Ticket_Freq feature: for each row, the number of rows in all_data
# that share its Ticket value
all_data['Ticket_Freq'] = all_data.groupby('Ticket')['Ticket'].transform('count')

See Version 15 for details of the following.

In [6]:
def title(name):
    """Get the title from a passenger's name.

    The title is the first whitespace-separated token of ``name`` that ends
    with a period, e.g. ``"Mr."`` in ``"Braund, Mr. Owen Harris"``. The
    trailing period is kept.

    Parameters
    ----------
    name : str
        The passenger's name.

    Returns
    -------
    str or None
        The title token, or ``None`` if no token ends with a period or if
        ``name`` is not a string (for example a missing value).
    """
    # Missing names (None / NaN) have no .split(); treat them as "no title"
    if not isinstance(name, str):
        return None

    return next((part for part in name.split() if part.endswith('.')), None)
    

# Extract a title for every passenger name
all_data['Title'] = all_data['Name'].map(title)

# Group titles that occur fewer than 10 times under 'misc'.
# Each row is mapped to the count of its own title instead of being looked up
# with .loc, so a name without a title (None, which value_counts() leaves out)
# falls into 'misc' rather than raising KeyError.
title_counts = all_data['Title'].value_counts()
rare = (title_counts < 10)
is_common = all_data['Title'].map(title_counts) >= 10
all_data['Title'] = all_data['Title'].where(is_common, 'misc')

# Encoding

In [7]:
# Preview the first rows of all_data before the non-numeric features are encoded
all_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Deck,Family_Size,Family_Size_Bin,Ticket_Freq,Title
0,"(16.0, 32.0]",,S,"(-0.001, 7.854]","Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171,M,2,2,1,Mr.
1,"(32.0, 48.0]",C85,C,"(41.579, 512.329]","Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599,C,2,2,2,Mrs.
2,"(16.0, 32.0]",,S,"(7.854, 10.5]","Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282,M,1,1,1,Miss.
3,"(32.0, 48.0]",C123,S,"(41.579, 512.329]","Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1.0,113803,C,2,2,2,Mrs.
4,"(32.0, 48.0]",,S,"(7.854, 10.5]","Allen, Mr. William Henry",0,5,3,male,0,0.0,373450,M,1,1,1,Mr.


Label encode non-numeric features

In [8]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are not plain numbers at this point
non_numeric = ['Age', 'Embarked', 'Fare', 'Sex', 'Deck', 'Title']
# A single encoder instance is re-fitted for every column in turn
label_encoder = LabelEncoder()

for feature in non_numeric:
    # Replace the column's values with the integer codes from LabelEncoder
    encoded_values = label_encoder.fit_transform(all_data[feature])
    all_data[feature] = encoded_values

In [9]:
# Preview the first rows again to inspect the label-encoded columns
all_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,Deck,Family_Size,Family_Size_Bin,Ticket_Freq,Title
0,1,,2,0,"Braund, Mr. Owen Harris",0,1,3,1,1,0.0,A/5 21171,7,2,2,1,2
1,2,C85,0,4,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,0,1,1.0,PC 17599,2,2,2,2,3
2,1,,2,1,"Heikkinen, Miss. Laina",0,3,3,0,0,1.0,STON/O2. 3101282,7,1,1,1,1
3,2,C123,2,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,0,1,1.0,113803,2,2,2,2,3
4,2,,2,1,"Allen, Mr. William Henry",0,5,3,1,0,0.0,373450,7,1,1,1,2


One-hot encode categorical features

In [10]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

categorical = ["Pclass", "Sex", "Family_Size_Bin", "Age", "Fare", "Deck", "Embarked", "Title"]

# One encoded frame per feature, collected here and attached in a single concat below
encoded_frames = []

for feature in categorical:

    # Fit on this single column and turn the sparse result into a dense array
    encoded_values = one_hot_encoder.fit_transform(all_data[[feature]]).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when it exists and fall back to the old name on older releases
    if hasattr(one_hot_encoder, 'get_feature_names_out'):
        column_names = one_hot_encoder.get_feature_names_out([feature])
    else:
        column_names = one_hot_encoder.get_feature_names([feature])
    # Create encoder dataframe; reuse all_data's index so rows line up by
    # position whatever labels all_data carries
    encoder_df = pd.DataFrame(encoded_values, columns=column_names, index=all_data.index)
    encoded_frames.append(encoder_df)

# Drop columns that were one-hot encoded and append their encoded replacements
all_data = pd.concat([all_data.drop(columns=categorical)] + encoded_frames, axis=1)
all_data.head()

Unnamed: 0,Cabin,Name,Parch,PassengerId,SibSp,Survived,Ticket,Family_Size,Ticket_Freq,Pclass_1,...,Deck_7,Deck_8,Embarked_0,Embarked_1,Embarked_2,Title_0,Title_1,Title_2,Title_3,Title_4
0,,"Braund, Mr. Owen Harris",0,1,1,0.0,A/5 21171,2,1,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,C85,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1.0,PC 17599,2,2,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,,"Heikkinen, Miss. Laina",0,3,0,1.0,STON/O2. 3101282,1,1,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,C123,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1.0,113803,2,2,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,,"Allen, Mr. William Henry",0,5,0,0.0,373450,1,1,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [11]:
# Drop columns that will not be used as features
drop_columns = ['Cabin', 'Name', 'Parch', 'PassengerId', 'SibSp', 'Ticket']
all_data = all_data.drop(columns=drop_columns)

train_data = all_data.loc[:890]
y = train_data["Survived"]
X = train_data.drop(columns=['Survived'])

X_test = all_data.loc[891:].drop(columns=['Survived'])

In [12]:
# Fit a random forest on the training rows and predict the test rows
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Build the submission frame; 'Survived' is converted to int on construction
output = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions.astype(int),
})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
