# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme so the charts drawn below share one style.
sns.set_theme(style="whitegrid")

# Fetch the Titanic passenger CSV over HTTP and parse it into a DataFrame.
# pd.read_csv raises if the URL cannot be fetched or parsed, so the success
# message below is only reached after a successful load.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

print("Dataset Loaded Successfully!")

# Preview the first 5 rows. A bare `df.head()` is only rendered when it is the
# last expression of a notebook cell; printing it shows the preview regardless
# of what follows it, and also when this runs as a plain script.
print(df.head())


# Report how many values are missing in each column before any cleaning.
print("Missing values per column:")
print(df.isnull().sum())

# Handling Missing Values
# 1. 'Age': replace missing entries with the median of the observed ages.
df['Age'] = df['Age'].fillna(df['Age'].median())

# 2. 'Embarked': replace missing entries with the most common value.
#    mode() returns a Series (ties are possible); [0] takes its first entry.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# 3. Drop 'Cabin' because it has too many missing values (over 70%).
#    errors='ignore' keeps this step re-runnable: if 'Cabin' was already
#    removed by an earlier run, the drop is a no-op instead of a KeyError.
df = df.drop(columns=['Cabin'], errors='ignore')

# Confirm cleaning: print the per-column missing counts again.
print("\nMissing values after cleaning:")
print(df.isnull().sum())

# Descriptive statistics table for the DataFrame, rendered via display()
# because it is not the last expression of the cell.
print("Statistical Summary:")
stats_table = df.describe()
display(stats_table)

# Tally how many rows carry each distinct 'Survived' label.
print("\nUnique values in 'Survived' (0 = No, 1 = Yes):")
survived_counts = df['Survived'].value_counts()
print(survived_counts)


# Build the 2x2 grid of axes up front; each chart below is drawn onto its
# own explicitly named axes instead of relying on pyplot's "current axes".
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_age, ax_sex), (ax_class, ax_corr) = axes

# Top-left: histogram of passenger ages with a KDE curve overlaid.
sns.histplot(df['Age'], kde=True, color='skyblue', ax=ax_age)
ax_age.set_title('Age Distribution of Passengers')

# Top-right: bar per 'Sex' category with 'Survived' on the y-axis.
# NOTE(review): newer seaborn releases reportedly warn when `palette` is
# passed without `hue` -- check against the installed seaborn version.
sns.barplot(x='Sex', y='Survived', data=df, palette='viridis', ax=ax_sex)
ax_sex.set_title('Survival Rate: Male vs Female')

# Bottom-left: passenger counts per ticket class, split by 'Survived'.
sns.countplot(x='Pclass', hue='Survived', data=df, palette='Set2', ax=ax_class)
ax_class.set_title('Survival Count by Ticket Class')

# Bottom-right: pairwise correlations, restricted to the numeric columns
# and annotated to two decimal places.
numeric_df = df.select_dtypes(include=np.number)
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f', ax=ax_corr)
ax_corr.set_title('Correlation Heatmap')

# Tighten the spacing so titles and tick labels do not overlap, then render.
fig.tight_layout()
plt.show()