# Task 2: Titanic Survival Prediction (Classification)


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay


In [None]:
# Read the Titanic passenger records from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer="titanic.csv")

# Preview the top five records (the notebook renders this last expression)
df.head(n=5)


In [None]:
# Count the null entries in every column and report them
missing_per_column = df.isna().sum()
print("Missing values in each column:")
print(missing_per_column)


In [None]:
# Drop columns that are not used as model features
df.drop(['Cabin', 'Ticket', 'Name', 'PassengerId'], axis=1, inplace=True)

# Fill missing 'Age' with the median.
# The filled column is assigned back instead of calling
# `df['Age'].fillna(..., inplace=True)`: that chained in-place form operates on
# a temporary column object, triggers a FutureWarning in pandas 2.x, and no
# longer updates `df` once Copy-on-Write is enabled.
# NOTE(review): the median is computed on the full dataset, before the
# train/test split, so test rows influence the imputed value — consider
# imputing after the split.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Drop rows where 'Embarked' is missing
df.dropna(subset=['Embarked'], inplace=True)

# Print the remaining per-column missing counts to confirm the cleanup
print(df.isnull().sum())


In [None]:
# Replace the text categories with integer codes, one lookup table per column.
# Values absent from a table become NaN, so re-running this cell on the
# already-encoded frame would blank both columns.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
}
for column, codes in category_codes.items():
    df[column] = df[column].map(codes)


In [None]:
# Separate the target column (y) from the remaining predictor columns (X)
target_column = 'Survived'
y = df[target_column]
X = df.drop(columns=target_column)


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Create and train the model.
# A decision tree with default hyperparameters; only the random seed is pinned
# (random_state=42) so repeated runs build the same tree.
model = DecisionTreeClassifier(random_state=42)
# Fit on the training split only; as the last expression in the cell, the
# fitted estimator is also what the notebook displays.
model.fit(X_train, y_train)


In [None]:
# Predict survival labels for the held-out passengers
y_pred = model.predict(X_test)

# Share of held-out passengers whose label was predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


In [None]:
# Tabulate true vs. predicted labels on the test split and draw the matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model.classes_,
).plot(cmap='Blues')  # plot() returns the display object itself

# Title the matrix axes directly through the display's Axes handle
disp.ax_.set_title("Confusion Matrix")
plt.show()


In [None]:
# Plot survival count.
# seaborn 0.13 deprecated passing `palette` without `hue`, so the x variable is
# also assigned to `hue`. dodge=False keeps one full-width bar per class, which
# matches the original appearance on older seaborn releases as well.
ax = sns.countplot(data=df, x='Survived', hue='Survived', palette='Set2', dodge=False)

# The hue legend would only repeat the x-axis categories; remove it if drawn.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.remove()

plt.title("Survival Count (0 = Died, 1 = Survived)")
plt.xlabel("Survival")
plt.ylabel("Passenger Count")
plt.show()


In [None]:
# Bar chart of passenger counts per sex, split by survival outcome.
# 'Sex' holds the integer codes assigned during encoding, hence the key in the title.
ax = sns.countplot(data=df, x='Sex', hue='Survived', palette='Set1')
ax.set(
    title="Survival by Gender (0 = Male, 1 = Female)",
    xlabel="Sex",
    ylabel="Count",
)
ax.legend(title="Survived")
plt.show()


In [None]:
# Histogram of passenger age (30 bins, with a KDE curve), coloured by survival outcome.
# NOTE(review): 'Age' here includes the median-imputed values from the cleaning
# step, which may show up as a spike at the median.
ax = sns.histplot(data=df, x='Age', hue='Survived', bins=30, kde=True, palette='pastel')
ax.set(
    title="Age Distribution: Survived vs Not Survived",
    xlabel="Age",
    ylabel="Frequency",
)
plt.show()


## Summary

In Task 2, I learned how to build a classification model using a Decision Tree to predict Titanic passenger survival.
I started by cleaning and preprocessing the dataset — handling missing values, dropping irrelevant columns, and encoding categorical features.
I trained a Decision Tree Classifier and evaluated it using accuracy and a confusion matrix.
I also visualized the overall survival count and survival trends by gender and age, which helped me better understand the data distribution, while the confusion matrix helped me interpret the model's performance.
This task improved my understanding of data cleaning, feature encoding, and classification evaluation.
