In [138]:
# Extracting Titanic dataset from OpenML

from sklearn.datasets import fetch_openml

titanic = fetch_openml('titanic', version=1, as_frame=True)

train_data = titanic.data
train_target = titanic.target

In [139]:
print(train_data.head())
print(train_data.info())

   pclass                                             name     sex      age  \
0       1                    Allen, Miss. Elisabeth Walton  female  29.0000   
1       1                   Allison, Master. Hudson Trevor    male   0.9167   
2       1                     Allison, Miss. Helen Loraine  female   2.0000   
3       1             Allison, Mr. Hudson Joshua Creighton    male  30.0000   
4       1  Allison, Mrs. Hudson J C (Bessie Waldo Daniels)  female  25.0000   

   sibsp  parch  ticket      fare    cabin embarked boat   body  \
0      0      0   24160  211.3375       B5        S    2    NaN   
1      1      2  113781  151.5500  C22 C26        S   11    NaN   
2      1      2  113781  151.5500  C22 C26        S  NaN    NaN   
3      1      2  113781  151.5500  C22 C26        S  NaN  135.0   
4      1      2  113781  151.5500  C22 C26        S  NaN    NaN   

                         home.dest  
0                     St Louis, MO  
1  Montreal, PQ / Chesterville, ON  
2  Montreal

In [140]:
print(train_target.head())
print(train_target.info())

0    1
1    1
2    0
3    0
4    0
Name: survived, dtype: category
Categories (2, object): ['0', '1']
<class 'pandas.core.series.Series'>
RangeIndex: 1309 entries, 0 to 1308
Series name: survived
Non-Null Count  Dtype   
--------------  -----   
1309 non-null   category
dtypes: category(1)
memory usage: 1.5 KB
None


In [141]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Combine features and target into one DataFrame and drop irrelevant columns.
titanic_data = train_data.copy()
titanic_data['Survived'] = train_target
titanic_data = titanic_data.drop(columns=['boat', 'body', 'home.dest', 'name', 'ticket', 'cabin'])

# features and target variable as an alias of X and y.
X = titanic_data.drop(columns=['Survived'])
y = titanic_data['Survived']

# Identify and process numerical and categorical columns
numerical_cols = ['age', 'fare']
categorical_cols = ['sex', 'embarked', 'pclass']

numerical_transformer = SimpleImputer(strategy='mean')

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocessing the entire dataset
X_processed = preprocessor.fit_transform(X)

In [142]:
feature_names = preprocessor.get_feature_names_out()

# Converting X_processed back to a DataFrame
X_processed_df = pd.DataFrame(X_processed, columns=feature_names)
X_processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   num__age         1309 non-null   float64
 1   num__fare        1309 non-null   float64
 2   cat__sex_female  1309 non-null   float64
 3   cat__sex_male    1309 non-null   float64
 4   cat__embarked_C  1309 non-null   float64
 5   cat__embarked_Q  1309 non-null   float64
 6   cat__embarked_S  1309 non-null   float64
 7   cat__pclass_1    1309 non-null   float64
 8   cat__pclass_2    1309 non-null   float64
 9   cat__pclass_3    1309 non-null   float64
dtypes: float64(10)
memory usage: 102.4 KB


In [143]:
from sklearn.model_selection import train_test_split

# Perform train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X_processed_df, y, test_size=0.2, random_state=42)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Target train shape:", y_train.shape)
print("Target test shape:", y_test.shape)

Training set shape: (1047, 10)
Test set shape: (262, 10)
Target train shape: (1047,)
Target test shape: (262,)


**Logistic Regression**

In [144]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Model
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)

# Metrics
log_reg_accuracy = accuracy_score(y_test, y_pred_logistic)
log_reg_classification_report = classification_report(y_test, y_pred_logistic)
log_reg_confusion_matrix = confusion_matrix(y_test, y_pred_logistic)

# Results
print("Logistic Regression Results:")
print(f"Accuracy: {log_reg_accuracy:.2f}")
print("Classification Report:\n", log_reg_classification_report)
print("Confusion Matrix:\n", log_reg_confusion_matrix)

Logistic Regression Results:
Accuracy: 0.77
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.88      0.81       144
           1       0.81      0.64      0.72       118

    accuracy                           0.77       262
   macro avg       0.78      0.76      0.76       262
weighted avg       0.78      0.77      0.77       262

Confusion Matrix:
 [[126  18]
 [ 42  76]]


**Random Forest Classifier**

In [145]:
from sklearn.ensemble import RandomForestClassifier
# Model
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_classification_report = classification_report(y_test, y_pred_rf)
rf_confusion_matrix = confusion_matrix(y_test, y_pred_rf)

# Results
print("\nRandom Forest Results:")
print(f"Accuracy: {rf_accuracy:.2f}")
print("Classification Report:\n", rf_classification_report)
print("Confusion Matrix:\n", rf_confusion_matrix)



Random Forest Results:
Accuracy: 0.79
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.87      0.82       144
           1       0.81      0.69      0.74       118

    accuracy                           0.79       262
   macro avg       0.79      0.78      0.78       262
weighted avg       0.79      0.79      0.78       262

Confusion Matrix:
 [[125  19]
 [ 37  81]]


**Neural Networks**

In [146]:
# Additional data cleaning for neural networks and converting dataframes into np.arrays
y_train = y_train.astype(int)
y_test = y_test.astype(int)

X_train_clean = X_train.dropna()
X_test_clean = X_test.dropna()
y_train_clean = y_train.loc[X_train_clean.index]
y_test_clean = y_test.loc[X_test_clean.index]

import numpy as np
X_train_np = np.array(X_train_clean)
X_test_np = np.array(X_test_clean)
y_train_np = np.array(y_train_clean)
y_test_np = np.array(y_test_clean)

# Models, Layers and Optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

model = Sequential([
    Input(shape=(X_train_np.shape[1],)),  # Use Input layer instead of passing input_shape directly
    Dense(64, activation='relu'),  # First hidden layer
    Dropout(0.3),  # Dropout for regularization
    Dense(32, activation='relu'),  # Second hidden layer
    Dropout(0.3),  # Dropout for regularization
    Dense(1, activation='sigmoid')  # Output layer (binary classification)
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train_np, y_train_np, validation_split=0.2, epochs=250, batch_size=32, verbose=1)

y_pred_nn = (model.predict(X_test_np) > 0.5).astype("int32")

# Evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("\nNeural Network Results:")
print(f"Accuracy: {accuracy_score(y_test_np, y_pred_nn):.2f}")
print("Classification Report:\n", classification_report(y_test_np, y_pred_nn))
print("Confusion Matrix:\n", confusion_matrix(y_test_np, y_pred_nn))

Epoch 1/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 139ms/step - accuracy: 0.5503 - loss: 1.6583 - val_accuracy: 0.6476 - val_loss: 0.7232
Epoch 2/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6040 - loss: 1.3904 - val_accuracy: 0.6333 - val_loss: 0.6624
Epoch 3/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6168 - loss: 1.0336 - val_accuracy: 0.6381 - val_loss: 0.6439
Epoch 4/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6126 - loss: 1.0311 - val_accuracy: 0.6143 - val_loss: 0.6497
Epoch 5/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6505 - loss: 0.8541 - val_accuracy: 0.6190 - val_loss: 0.6520
Epoch 6/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6198 - loss: 0.8680 - val_accuracy: 0.6238 - val_loss: 0.6431
Epoch 7/250
[1m27/27[0m [32m

**Comparing accuracy metrices of Logistic, Random Forest, Neural Networks**

In [147]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Convert predicted labels to integers
y_pred_logistic = y_pred_logistic.astype(int)
y_pred_rf = y_pred_rf.astype(int)


# Logistic Regression Evaluation
log_reg_accuracy = accuracy_score(y_test_np, y_pred_logistic)
log_reg_precision = precision_score(y_test_np, y_pred_logistic)
log_reg_recall = recall_score(y_test_np, y_pred_logistic)
log_reg_f1 = f1_score(y_test_np, y_pred_logistic)

# Random Forest Evaluation
rf_accuracy = accuracy_score(y_test_np, y_pred_rf)
rf_precision = precision_score(y_test_np, y_pred_rf)
rf_recall = recall_score(y_test_np, y_pred_rf)
rf_f1 = f1_score(y_test_np, y_pred_rf)

# Neural Network Evaluation
nn_accuracy = accuracy_score(y_test_np, y_pred_nn)
nn_precision = precision_score(y_test_np, y_pred_nn)
nn_recall = recall_score(y_test_np, y_pred_nn)
nn_f1 = f1_score(y_test_np, y_pred_nn)

# Print the evaluation metrics
print(f"Logistic Regression Performance:")
print(f"Accuracy: {log_reg_accuracy:.2f}")
print(f"Precision: {log_reg_precision:.2f}")
print(f"Recall: {log_reg_recall:.2f}")
print(f"F1-Score: {log_reg_f1:.2f}")
print("\n")

print(f"Random Forest Performance:")
print(f"Accuracy: {rf_accuracy:.2f}")
print(f"Precision: {rf_precision:.2f}")
print(f"Recall: {rf_recall:.2f}")
print(f"F1-Score: {rf_f1:.2f}")
print("\n")

print(f"Neural Network Performance:")
print(f"Accuracy: {nn_accuracy:.2f}")
print(f"Precision: {nn_precision:.2f}")
print(f"Recall: {nn_recall:.2f}")
print(f"F1-Score: {nn_f1:.2f}")


Logistic Regression Performance:
Accuracy: 0.77
Precision: 0.81
Recall: 0.64
F1-Score: 0.72


Random Forest Performance:
Accuracy: 0.79
Precision: 0.81
Recall: 0.69
F1-Score: 0.74


Neural Network Performance:
Accuracy: 0.76
Precision: 0.88
Recall: 0.53
F1-Score: 0.66


In [148]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, SpatialDropout1D
from tensorflow.keras.optimizers import Adam
import numpy as np


# Reshaping the data for LSTM's 3D shape (samples, timesteps, features)
X_train_lstm = X_train_np.reshape((X_train_np.shape[0], 1, X_train_np.shape[1]))
X_test_lstm = X_test_np.reshape((X_test_np.shape[0], 1, X_test_np.shape[1]))

# Model
model_lstm = Sequential()

model_lstm.add(LSTM(50, activation='relu', input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]), return_sequences=False))
model_lstm.add(Dropout(0.2))

model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

history_lstm = model_lstm.fit(X_train_lstm, y_train_np, validation_split=0.2, epochs=250, batch_size=32, verbose=1)

y_pred_lstm = (model_lstm.predict(X_test_lstm) > 0.5).astype("int32")

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

lstm_accuracy = accuracy_score(y_test_np, y_pred_lstm)
lstm_precision = precision_score(y_test_np, y_pred_lstm)
lstm_recall = recall_score(y_test_np, y_pred_lstm)
lstm_f1 = f1_score(y_test_np, y_pred_lstm)

print(f"LSTM Performance:")
print(f"Accuracy: {lstm_accuracy:.2f}")
print(f"Precision: {lstm_precision:.2f}")
print(f"Recall: {lstm_recall:.2f}")
print(f"F1-Score: {lstm_f1:.2f}")

Epoch 1/250



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 95ms/step - accuracy: 0.5097 - loss: 1.2360 - val_accuracy: 0.6333 - val_loss: 0.6724
Epoch 2/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6552 - loss: 0.8169 - val_accuracy: 0.6286 - val_loss: 0.6742
Epoch 3/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6965 - loss: 0.7724 - val_accuracy: 0.6286 - val_loss: 0.6546
Epoch 4/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6607 - loss: 0.7592 - val_accuracy: 0.6714 - val_loss: 0.6384
Epoch 5/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7010 - loss: 0.6244 - val_accuracy: 0.6952 - val_loss: 0.6318
Epoch 6/250
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6937 - loss: 0.6563 - val_accuracy: 0.7000 - val_loss: 0.6242
Epoch 7/250
[1m27/27[0m [32m━━━━━━━━━━━━━━

***Other Models which performs well on the Titanic Dataset***

**XGBOOST**

In [149]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)

xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {xgb_accuracy:.4f}")


XGBoost Accuracy: 0.8053


**LightBGM**

In [150]:
import lightgbm as lgb
from sklearn.metrics import accuracy_score

lgb_model = lgb.LGBMClassifier(random_state=42)

lgb_model.fit(X_train, y_train)

y_pred_lgb = lgb_model.predict(X_test)

lgb_accuracy = accuracy_score(y_test, y_pred_lgb)
print(f"LightGBM Accuracy: {lgb_accuracy:.4f}")


[LightGBM] [Info] Number of positive: 382, number of negative: 665
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 301
[LightGBM] [Info] Number of data points in the train set: 1047, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364852 -> initscore=-0.554366
[LightGBM] [Info] Start training from score -0.554366
LightGBM Accuracy: 0.8092


**CatBoost**

In [151]:
!pip install catboost



In [152]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

cat_model = CatBoostClassifier(iterations=1000, learning_rate=0.05, depth=10, random_state=42, verbose=0)

cat_model.fit(X_train, y_train)

y_pred_cat = cat_model.predict(X_test)

cat_accuracy = accuracy_score(y_test, y_pred_cat)
print(f"CatBoost Accuracy: {cat_accuracy:.4f}")


CatBoost Accuracy: 0.8206


**Metrices of XGBOOST, LIGHTBGM, CATBOOST**

In [153]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    return accuracy, precision, recall, f1

xgb_model.fit(X_train, y_train)
xgb_pred = xgb_model.predict(X_test)

xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = compute_metrics(y_test, xgb_pred)

print("XGBoost Metrics:")
print(f"Accuracy: {xgb_accuracy:.4f}")
print(f"Precision: {xgb_precision:.4f}")
print(f"Recall: {xgb_recall:.4f}")
print(f"F1-Score: {xgb_f1:.4f}")
print("-" * 50)

lgb_model.fit(X_train, y_train)
lgb_pred = lgb_model.predict(X_test)

lgb_accuracy, lgb_precision, lgb_recall, lgb_f1 = compute_metrics(y_test, lgb_pred)

print("LightGBM Metrics:")
print(f"Accuracy: {lgb_accuracy:.4f}")
print(f"Precision: {lgb_precision:.4f}")
print(f"Recall: {lgb_recall:.4f}")
print(f"F1-Score: {lgb_f1:.4f}")
print("-" * 50)

cat_model.fit(X_train, y_train)
cat_pred = cat_model.predict(X_test)

catboost_accuracy, catboost_precision, catboost_recall, catboost_f1 = compute_metrics(y_test, cat_pred)

print("CatBoost Metrics:")
print(f"Accuracy: {catboost_accuracy:.4f}")
print(f"Precision: {catboost_precision:.4f}")
print(f"Recall: {catboost_recall:.4f}")
print(f"F1-Score: {catboost_f1:.4f}")


XGBoost Metrics:
Accuracy: 0.8053
Precision: 0.8384
Recall: 0.7034
F1-Score: 0.7650
--------------------------------------------------
[LightGBM] [Info] Number of positive: 382, number of negative: 665
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 301
[LightGBM] [Info] Number of data points in the train set: 1047, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.364852 -> initscore=-0.554366
[LightGBM] [Info] Start training from score -0.554366
LightGBM Metrics:
Accuracy: 0.8092
Precision: 0.8400
Recall: 0.7119
F1-Score: 0.7706
--------------------------------------------------
CatBoost Metrics:
Accuracy: 0.8206
Precision: 0.8660
Recall: 0.7119
F1-Score: 0.7814


**Comparision chart of all the models using *Plotly***

In [154]:
import plotly.graph_objects as go

metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']

log_reg_scores = [log_reg_accuracy, log_reg_precision, log_reg_recall, log_reg_f1]
rf_scores = [rf_accuracy, rf_precision, rf_recall, rf_f1]
nn_scores = [nn_accuracy, nn_precision, nn_recall, nn_f1]
lstm_scores = [lstm_accuracy, lstm_precision, lstm_recall, lstm_f1]
xgb_scores = [xgb_accuracy, xgb_precision, xgb_recall, xgb_f1]
lgb_scores = [lgb_accuracy, lgb_precision, lgb_recall, lgb_f1]
catboost_scores = [catboost_accuracy, catboost_precision, catboost_recall, catboost_f1]

fig = go.Figure()

fig.add_trace(go.Bar(
    x=metrics,
    y=log_reg_scores,
    name='Logistic Regression',
    marker_color='blue'
))

fig.add_trace(go.Bar(
    x=metrics,
    y=rf_scores,
    name='Random Forest',
    marker_color='green'
))

fig.add_trace(go.Bar(
    x=metrics,
    y=nn_scores,
    name='Neural Network',
    marker_color='orange'
))

fig.add_trace(go.Bar(
    x=metrics,
    y=lstm_scores,
    name='LSTM',
    marker_color='purple'
))

fig.add_trace(go.Bar(
    x=metrics,
    y=xgb_scores,
    name='XGBoost',
    marker_color='red'
))

fig.add_trace(go.Bar(
    x=metrics,
    y=lgb_scores,
    name='LightGBM',
    marker_color='cyan'
))

fig.add_trace(go.Bar(
    x=metrics,
    y=catboost_scores,
    name='CatBoost',
    marker_color='magenta'
))

fig.update_layout(
    title='Model Performance Comparison',
    xaxis_title='Metrics',
    yaxis_title='Score',
    barmode='group',
    legend_title='Models',
    template='plotly_dark',
    xaxis=dict(tickmode='array'),
    yaxis=dict(range=[0, 1]),
)

fig.show()
