Libraries

In [769]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

Reading dataset

In [770]:
# Load the passenger table; the path is relative to the working directory.
DATA_PATH = "Titanic-Dataset.csv"
df = pd.read_csv(DATA_PATH)

Cleaning

In [771]:
# Count missing values per column, then show how the target classes are balanced.
missing_per_column = df.isna().sum()
print(missing_per_column)

survival_counts = df['Survived'].value_counts()
print(survival_counts)

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64
Survived
0    549
1    342
Name: count, dtype: int64


In [772]:
# Clean the passenger table by building a new frame instead of mutating in place.
# Every step is a no-op on already-cleaned data, so running this cell a second
# time (before the one-hot encoding cell) no longer raises KeyError.

if 'Name' in df.columns:
    # Title is the text between the first comma and the following period,
    # e.g. "Braund, Mr. Owen Harris" -> "Mr". The vectorised .str chain yields
    # NaN for a name that has no comma instead of raising IndexError.
    df = df.assign(
        Title=df['Name'].str.split(',').str[1].str.split('.').str[0].str.strip()
    )

df = (
    df
    .assign(
        # NOTE: the mean is taken over all rows, i.e. before the train/test
        # split, so the imputed value also reflects test-set ages.
        Age=lambda d: d['Age'].fillna(d['Age'].mean()),
        # Keep only the first letter of the cabin code; 'U' marks Unknown.
        Cabin=lambda d: d['Cabin'].fillna('U').str[0],
        # Fill missing ports with the most frequent one.
        Embarked=lambda d: d['Embarked'].fillna(d['Embarked'].mode()[0]),
    )
    # errors='ignore' lets the drop succeed when the columns are already gone.
    .drop(columns=['Ticket', 'Name', 'PassengerId'], errors='ignore')
)

In [773]:
# Confirm that no column has missing values left after cleaning.
remaining_missing = df.isna().sum()
print(remaining_missing)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
Title       0
dtype: int64


In [774]:
# List each column's dtype; the object columns are the ones encoded next.
column_dtypes = df.dtypes
print(column_dtypes)

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked     object
Title        object
dtype: object


One-hot encoding

In [775]:
# Expand each categorical column into one indicator column per category.
# drop_first=False keeps every level rather than folding one into a baseline.
categorical_columns = ['Sex', 'Cabin', 'Embarked', 'Title']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=False)

Splitting into train and test sets

In [776]:

# Separate the target from the predictors, then hold out 20% of the rows for
# evaluation. The fixed seed keeps the partition identical between runs.
y = df['Survived']
X = df.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Building the model

In [777]:
# Random forest of 100 entropy-split trees, each limited to depth 10.
# random_state pins the bootstrap sampling and per-split feature selection so
# the fitted model, and every metric computed from it, is the same on each run.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    criterion='entropy',
    random_state=42,
)
model.fit(X_train, y_train)

In [781]:
# Score the fitted classifier on the held-out rows.
# NOTE(review): `model` is rebound to an XGBClassifier in a later cell. The
# saved output of this cell carries execution count 781 (after cells 779/780)
# and matches the XGBoost metrics digit for digit, so it was probably produced
# by the XGBoost model. Restart and run all cells in order to confirm.
y_pred = model.predict(X_test)

print("Accuracy :", accuracy_score(y_test, y_pred))

# The remaining metrics share one signature; 'binary' scores the positive
# class only ('macro' and 'micro' are the alternatives).
binary_metrics = (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
)
for label, metric in binary_metrics:
    print(label, metric(y_test, y_pred, average='binary'))

Accuracy : 0.8100558659217877
Precision: 0.7941176470588235
Recall   : 0.7297297297297297
F1 Score : 0.7605633802816901


Trying XGBoost

In [779]:
# XGBClassifier is imported in the notebook's top import cell, so a missing
# xgboost install fails immediately rather than halfway through the notebook.
#
# NOTE: this rebinds `model`, replacing the random forest fitted earlier. Any
# cell that calls `model.predict` after this one has run evaluates XGBoost.
model = XGBClassifier(n_estimators=100)
model.fit(X_train, y_train)

In [780]:
# Score whichever classifier `model` currently refers to on the held-out rows.
y_pred = model.predict(X_test)

print("Accuracy :", accuracy_score(y_test, y_pred))

# 'binary' scores the positive class only ('macro' and 'micro' are the alternatives).
for label, metric in (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
):
    print(label, metric(y_test, y_pred, average='binary'))

Accuracy : 0.8100558659217877
Precision: 0.7941176470588235
Recall   : 0.7297297297297297
F1 Score : 0.7605633802816901
