# 🧭 Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's "whitegrid" theme to every plot drawn below.
# set_theme() is the preferred name; sns.set() is only kept as a legacy alias.
sns.set_theme(style="whitegrid")

# 🧩 Step 2: Load Dataset

In [None]:
# Load seaborn's "titanic" example dataset into a DataFrame
# (seaborn downloads it on first use and caches it locally afterwards).
df = sns.load_dataset(name="titanic")

# Preview the first five rows; as the last expression it is what a notebook
# cell would display.
df.head(n=5)

# 🔍 Step 3: Explore Dataset

In [None]:
# Print the structural overview: column names, dtypes and non-null counts.
# This is the quickest way to see which columns contain missing values.
df.info()
# Summary statistics for every column, not just the numeric ones
# (include='all'). Kept as the final expression so that a notebook cell
# renders the resulting table as its output.
df.describe(include='all')

# 🧹 Step 4: Data Cleaning

In [None]:
# Build the cleaned frame in a single pipeline; each step returns a new
# DataFrame, so the frame loaded earlier is never modified in place.
#
# NOTE(review): no unique passenger key is visible among the columns used in
# this file, so rows that are identical may still be different people.
# Confirm that exact duplicates really are repeated records before relying on
# the final drop_duplicates() step.
df = (
    df.assign(
        # Numeric gap: impute with the median, which is robust to outliers.
        age=df["age"].fillna(df["age"].median()),
        # Categorical gaps: impute with the most frequent value of the column.
        embarked=df["embarked"].fillna(df["embarked"].mode()[0]),
        embark_town=df["embark_town"].fillna(df["embark_town"].mode()[0]),
    )
    # 'deck' is removed entirely instead of being imputed.
    .drop(columns=["deck"])
    # Remove rows that are exact copies of an earlier row.
    .drop_duplicates()
)

# Re-check non-null counts and dtypes to confirm the result of the cleaning.
df.info()

# 📊 Step 5: Univariate Analysis

In [None]:
# Target balance: number of passengers per survival outcome.
# countplot() returns the Axes it drew on, so the title is set on it directly.
sns.countplot(data=df, x="survived").set_title(
    "Survival Count (0 = Died, 1 = Survived)"
)
plt.show()

# Age histogram with a kernel-density estimate curve overlaid (kde=True).
sns.histplot(df["age"], kde=True).set_title("Age Distribution of Passengers")
plt.show()

# 🎯 Step 6: Bivariate Analysis

In [None]:
# Survival counts split by a categorical feature: one figure per
# (column, title) pair, drawn in the listed order.
count_panels = (
    ("sex", "Survival by Gender"),
    ("class", "Survival by Class"),
)
for column, heading in count_panels:
    sns.countplot(data=df, x=column, hue="survived")
    plt.title(heading)
    plt.show()

# Age distribution for each survival outcome, shown as side-by-side boxes.
sns.boxplot(data=df, x="survived", y="age")
plt.title("Age vs Survival")
plt.show()

# 🌍 Step 7: Multivariate Analysis

In [None]:
# Passenger counts by class, coloured by sex, with one facet per survival
# outcome (col='survived').
sns.catplot(x='class', hue='sex', col='survived', data=df, kind='count')
# Render the grid now, as every other plotting cell does. Outside a notebook
# the grid's figure would otherwise stay current, and a later axes-level plot
# could be drawn into its last facet instead of a fresh figure.
plt.show()

# 📈 Step 8: Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns only; non-numeric
# columns are skipped via numeric_only=True.
correlations = df.corr(numeric_only=True)

# Annotated heatmap of the matrix; "coolwarm" is a diverging palette, so
# positive and negative correlations get opposite colours.
sns.heatmap(data=correlations, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()

# 🧾 Step 9: Save Cleaned Dataset

In [None]:
# Persist the cleaned frame next to the notebook. index=False leaves the
# DataFrame index out of the file, so no extra unnamed column is written.
output_path = "clean_titanic.csv"
df.to_csv(output_path, index=False)

# 💡 Insights

In [None]:
# Insights Summary:
# 1. Most passengers were from Third Class.
# 2. Females had a much higher survival rate than males.
# 3. Higher-class passengers paid higher fares and had better survival chances.
# 4. Median age ≈ 29 years; younger passengers had better survival chances.
# 5. Data is now fully cleaned and ready for ML or dashboarding.