# Imports

In [49]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import bisect
import re

# Data

In [50]:
# Load the labelled training split and preview its first five rows.
train_data = pd.read_csv(filepath_or_buffer='data/train.csv')
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [51]:
# Load the test split (its preview has no 'Survived' column) and show
# its first five rows.
test_data = pd.read_csv(filepath_or_buffer='data/test.csv')
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


# Data exploration

In [52]:
# Survival rate among female passengers. 'Survived' holds 0/1 flags in the
# preview above, so its mean is the fraction of survivors. `.loc` selects the
# rows and the column in one step instead of chained indexing, and the
# vectorised `.mean()` replaces the Python-level sum()/len() over the Series.
woman = train_data.loc[train_data['Sex'] == 'female', 'Survived']
rate_woman = woman.mean()
print(f'Percentage of woman who survived: {rate_woman*100:.2f}')

Percentage of woman who survived: 74.20


In [53]:
# Survival rate among male passengers. 'Survived' holds 0/1 flags in the
# preview above, so its mean is the fraction of survivors. `.loc` selects the
# rows and the column in one step instead of chained indexing, and the
# vectorised `.mean()` replaces the Python-level sum()/len() over the Series.
man = train_data.loc[train_data['Sex'] == 'male', 'Survived']
rate_man = man.mean()
print(f'Percentage of man who survived: {rate_man*100:.2f}')

Percentage of man who survived: 18.89


# Data preparation

#### Fill missing age values with median of Age

In [54]:
# Impute missing ages with the median age of the *training* set and apply that
# same value to the test set, so both frames are imputed consistently and no
# test-set statistic enters preprocessing.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas 2.x warns about and
# which does not update the DataFrame once Copy-on-Write is enabled.
age_median = train_data['Age'].median()
train_data['Age'] = train_data['Age'].fillna(age_median)
test_data['Age'] = test_data['Age'].fillna(age_median)

#### Fill missing Fare values with median of Fare

In [55]:
# Impute missing fares with the median fare of the *training* set and apply
# that same value to the test set, so both frames are imputed consistently.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas 2.x warns about and
# which does not update the DataFrame once Copy-on-Write is enabled.
fare_median = train_data['Fare'].median()
train_data['Fare'] = train_data['Fare'].fillna(fare_median)
test_data['Fare'] = test_data['Fare'].fillna(fare_median)

#### Convert 'Sex' column to integers

In [56]:
# Encode 'Sex' as an integer flag using one shared lookup for both frames.
# Values that are not keys of the lookup become NaN, so this cell is meant to
# run once on the raw string column.
sex_codes = {'male': 0, 'female': 1}
for frame in (train_data, test_data):
    frame['Sex'] = frame['Sex'].map(sex_codes)

#### Convert 'Embarked' column to integers

In [57]:
# Encode the embarkation port letter as a number using one shared lookup.
# Missing (or unrecognised) ports stay NaN here.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in (train_data, test_data):
    frame['Embarked'] = frame['Embarked'].map(embarked_codes)

#### Fill missing 'Embarked' values with most common value

In [58]:
# Fill missing 'Embarked' codes with the most frequent code of the *training*
# set, and use that same value for the test set so both frames are imputed
# consistently.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas 2.x warns about and
# which does not update the DataFrame once Copy-on-Write is enabled.
most_common_embarked = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(most_common_embarked)
test_data['Embarked'] = test_data['Embarked'].fillna(most_common_embarked)

#### Create a new column 'FamilySize' as a sum of 'SibSp' and 'Parch' columns

In [59]:
# FamilySize = siblings/spouses aboard + parents/children aboard
# (the passenger themself is not counted).
for frame in (train_data, test_data):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch'])

#### Extract 'Deck' from 'Cabin' column

In [60]:
def deck_of(cabin):
    """Return the first character of a cabin value, or 'Unknown' if it is missing."""
    if pd.isna(cabin):
        return 'Unknown'
    return cabin[0]


train_data['Deck'] = train_data['Cabin'].apply(deck_of)
test_data['Deck'] = test_data['Cabin'].apply(deck_of)

#### Extract 'NumCabins' from 'Cabin' column

In [61]:
def count_cabins(cabin):
    """Return the number of space-separated entries in a cabin value (0 if missing)."""
    if pd.isna(cabin):
        return 0
    # Splitting on a single space yields one more piece than there are spaces.
    return str(cabin).count(' ') + 1


train_data['NumCabins'] = train_data['Cabin'].apply(count_cabins)
test_data['NumCabins'] = test_data['Cabin'].apply(count_cabins)

#### Extract 'TicketPrefix' from 'Ticket' column

In [62]:
def ticket_prefix(ticket):
    """Return the first whitespace-separated token of a ticket, or 'None'.

    A ticket made of a single token is treated as having no prefix and
    yields the string 'None'.
    """
    tokens = str(ticket).split()
    if len(tokens) > 1:
        return tokens[0]
    return 'None'


train_data['TicketPrefix'] = train_data['Ticket'].apply(ticket_prefix)
test_data['TicketPrefix'] = test_data['Ticket'].apply(ticket_prefix)


#### Extract 'TicketFrequency' from 'Ticket' column

In [63]:
# For every row, count how many rows in the same frame share its ticket.
# Counts are computed separately for the train and test frames.
for frame in (train_data, test_data):
    tickets_by_value = frame.groupby('Ticket')['Ticket']
    frame['TicketFrequency'] = tickets_by_value.transform('count')


#### Drop 'Cabin', 'Ticket' and fill missing values in 'Embarked' 

In [64]:
# 'Cabin' and 'Ticket' are dropped now that Deck / NumCabins / TicketPrefix /
# TicketFrequency have been derived from them.
# errors='ignore' keeps the cell re-runnable: columns that are already gone
# are skipped instead of raising.
raw_columns = ['Cabin', 'Ticket']
train_data = train_data.drop(columns=raw_columns, errors='ignore')
test_data = test_data.drop(columns=raw_columns, errors='ignore')

# Fill any remaining missing 'Embarked' values with the training-set mode
# (the same value is used for the test set).
# The result is assigned back to the column: fillna(inplace=True) on a column
# selection is chained assignment, which pandas 2.x warns about and which does
# not update the DataFrame once Copy-on-Write is enabled.
embarked_mode = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)
test_data['Embarked'] = test_data['Embarked'].fillna(embarked_mode)  # using mode from train_data


#### Extracting titles from 'Name' column

In [65]:
def extract_title(name):
    """Return the honorific contained in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, e.g. 'Mr' in
    'Braund, Mr. Owen Harris'.

    Parameters
    ----------
    name : str
        Passenger name to search.

    Returns
    -------
    str
        The matched title without the trailing period, or '' when no title
        is found or `name` is not a string (for example a missing value).
    """
    # Missing names arrive as NaN/None; treat them as "no title" instead of
    # letting re.search raise TypeError.
    if not isinstance(name, str):
        return ''
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    return ''

In [66]:
# Derive a 'Title' column in both frames by running extract_title on each name.
for frame in (train_data, test_data):
    frame['Title'] = frame['Name'].apply(extract_title)

#### Encoding categorical features

In [67]:
# Fitted encoder per categorical column, kept for later reference.
label_encoders = {}

for column in ['Deck', 'TicketPrefix', 'Title']:

    # Work on string versions of the columns
    train_values = train_data[column].astype(str)
    test_values = test_data[column].astype(str)

    # Fit ONCE on the training categories plus the 'Unknown' placeholder,
    # before anything is transformed. Adding 'Unknown' to the encoder's
    # classes after the training column has been encoded would shift the code
    # of every class that sorts after 'Unknown', so the same category would
    # receive different integers in train and test.
    le = LabelEncoder()
    le.fit(list(train_values.unique()) + ['Unknown'])

    # Categories that appear only in the test data fall back to 'Unknown'
    test_values = test_values.where(test_values.isin(le.classes_), 'Unknown')

    # Encode both frames with the same label -> integer mapping
    train_data[column] = le.transform(train_values)
    test_data[column] = le.transform(test_values)

    # Store the label encoder for future reference
    label_encoders[column] = le


# Check data after preparation

In [68]:
# Preview the first five prepared training rows.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,Deck,NumCabins,TicketPrefix,TicketFrequency,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,7.25,0.0,1,8,0,4,1,12
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,71.2833,1.0,1,2,1,18,1,13
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,7.925,0.0,0,8,0,37,1,9
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,53.1,0.0,1,2,1,16,2,13
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,8.05,0.0,0,8,0,16,1,12


# Split data

In [80]:
# Features: every prepared column except the free-text 'Name' and the target.
# (`columns=` already implies the axis, so no `axis=1` is needed.)
# NOTE(review): 'PassengerId' is still part of the features; it looks like a
# row identifier rather than a predictor - consider dropping it from X_train.
X_train = train_data.drop(columns=['Name', 'Survived'])
y_train = train_data['Survived']

# Select the test features by the training column list so both matrices have
# identical columns in identical order; a column missing from test_data fails
# here with a KeyError rather than later inside the model.
X_test = test_data.drop(columns=['Name'])[X_train.columns]

# Model

In [78]:
# Show the dtype of every feature column in X_train before it is passed to the model.
print(X_train.dtypes)

PassengerId          int64
Pclass               int64
Sex                  int64
Age                float64
SibSp                int64
Parch                int64
Fare               float64
Embarked           float64
FamilySize           int64
Deck                 int32
NumCabins            int64
TicketPrefix         int32
TicketFrequency      int64
Title                int32
dtype: object


In [79]:
# Random forest with 100 trees; the fixed random_state seeds the estimator so
# repeated runs use the same seed.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X=X_train, y=y_train)