In [4]:
# titanic_eda.py
# Python 3.8+
# Purpose: load Titanic dataset, perform data cleaning, basic feature engineering,
# exploratory data analysis (visualizations and summary statistics).
# Usage: edit DATA_PATH to point to your CSV (local path or raw URL). Run in a notebook
# for inline plots or as a script to save figures to disk.

import os
from io import StringIO
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")
# Global plot styling: seaborn "whitegrid" theme and a 10x6 default figure size.
sns.set(context="notebook", style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# ---------- CONFIG ----------
# DATA_PATH may be a local file path or a raw CSV URL. For Windows paths,
# either use a raw string (as below) or write the path with forward slashes.
DATA_PATH = r"D:\prodigy infotech internship tasks\Titanic-Dataset.csv"
# Every figure and the cleaned CSV are written into this folder.
OUT_DIR = "titanic_plots"
os.makedirs(OUT_DIR, exist_ok=True)
# ----------------------------

# ---------- UTIL: load data robustly ----------
def load_csv(path: str) -> pd.DataFrame:
    """Load a CSV from a local path or an HTTP(S) URL.

    Args:
        path: Local file path, or a URL beginning with ``http://`` or
            ``https://``. Anything else is handed straight to
            ``pd.read_csv`` as a local path.

    Returns:
        The parsed data as a DataFrame.

    Raises:
        requests.HTTPError: The URL answered with an error status.
        RuntimeError: The URL returned an HTML page instead of CSV text
            (typically a repository web page rather than the raw file).
    """
    # Require a real URL scheme: a bare "http" prefix would also match local
    # files such as "http_export.csv" and send them to the network branch.
    is_url = isinstance(path, str) and path.lower().startswith(("http://", "https://"))
    if not is_url:
        return pd.read_csv(path)

    r = requests.get(path, timeout=15)
    r.raise_for_status()
    text = r.text

    # Sniff the start of the payload case-insensitively so "<!doctype html>"
    # and a bare "<html>" are rejected as well as "<!DOCTYPE".
    head = text.lstrip()[:200].lower()
    if head.startswith(("<!doctype", "<html")) or "Repository" in text[:200]:
        raise RuntimeError("URL returned HTML; use raw CSV URL (raw.githubusercontent.com) or local file.")
    return pd.read_csv(StringIO(text))

df = load_csv(DATA_PATH)

# ---------- Initial inspection ----------
# Quick look at the raw data before any cleaning: shape, header, first rows.
print("Rows, cols:", df.shape)
print("\nColumn names:", list(df.columns))
print("\nSample rows:", df.head(), sep="\n")

# ---------- Basic cleaning ----------
# Every step is guarded by a column-existence check so the script also runs on
# CSV variants that lack some of the standard Titanic columns.

# Trim stray whitespace from the header so the column-name checks below match.
df.columns = [c.strip() for c in df.columns]

# Coerce numeric columns; unparseable entries become NaN and are imputed below.
for col in ["Age", "Fare"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Embarked: treat empty strings as missing, then fill with the most frequent value.
if "Embarked" in df.columns:
    df["Embarked"] = df["Embarked"].replace({"": np.nan}).astype("object")
    embarked_mode = df["Embarked"].mode()
    # mode() is empty when the whole column is missing; there is nothing to fill with then.
    if df["Embarked"].isna().any() and not embarked_mode.empty:
        # Assign the result back instead of fillna(inplace=True) on df[...]:
        # chained in-place fills are deprecated and do not modify df under
        # pandas Copy-on-Write.
        df["Embarked"] = df["Embarked"].fillna(embarked_mode.iloc[0])

# If Cabin exists, create binary feature HasCabin (1 = a cabin value is present).
if "Cabin" in df.columns:
    df["HasCabin"] = df["Cabin"].notna().astype(int)

# Family size = siblings/spouses + parents/children + the passenger.
if {"SibSp", "Parch"}.issubset(df.columns):
    df["SibSp"] = pd.to_numeric(df["SibSp"], errors="coerce").fillna(0).astype(int)
    df["Parch"] = pd.to_numeric(df["Parch"], errors="coerce").fillna(0).astype(int)
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

# Extract Title from Name: the text between the first comma and the next period.
if "Name" in df.columns:
    df["Title"] = df["Name"].str.extract(r",\s*([^\.]+)\.").iloc[:, 0].str.strip()
    # Titles with 10 or fewer occurrences (and names with no match) become "Other".
    common_titles = df["Title"].value_counts().loc[lambda s: s > 10].index
    df["Title"] = df["Title"].where(df["Title"].isin(common_titles), "Other")

# Impute Age with the median of the passenger's Title group, then fall back to
# the overall median for anything still missing.
if "Age" in df.columns:
    if "Title" in df.columns:
        # Filling from the per-group medians keeps existing ages untouched,
        # including rows whose Title is missing (those are excluded from groups).
        title_median_age = df.groupby("Title")["Age"].transform("median")
        df["Age"] = df["Age"].fillna(title_median_age)
    df["Age"] = df["Age"].fillna(df["Age"].median())

# Fill missing Fare with the overall median.
if "Fare" in df.columns:
    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

# Convert categorical types
for col in ["Pclass", "Sex", "Embarked", "Title"]:
    if col in df.columns:
        df[col] = df[col].astype("category")

print("\nAfter cleaning - null counts:")
print(df.isna().sum())

# ---------- Helper to save figures ----------
def save_fig(fig, name):
    """Apply a tight layout to *fig* and save it as OUT_DIR/<name> at 200 dpi.

    Prints the destination path after saving. The figure is not closed here;
    that is left to the caller.
    """
    target = os.path.join(OUT_DIR, name)
    fig.tight_layout()
    fig.savefig(target, dpi=200)
    print("Saved:", target)

# ---------- Exploratory Data Analysis ----------

# 1) Target distribution: how many passengers fall into each Survived value
if "Survived" in df.columns:
    fig, ax = plt.subplots()
    sns.countplot(data=df, x="Survived", palette="Set2", ax=ax)
    ax.set(title="Survival counts")
    save_fig(fig, "survived_counts.png")
    plt.close(fig)

# 2) Survival by Sex: passenger counts per sex, split by Survived
if all(c in df.columns for c in ("Survived", "Sex")):
    fig, ax = plt.subplots()
    sns.countplot(data=df, x="Sex", hue="Survived", palette="Set1", ax=ax)
    ax.set(title="Survival by Sex")
    save_fig(fig, "survival_by_sex.png")
    plt.close(fig)

# 3) Age distribution and survival
if "Age" in df.columns:
    # Colour by survival only when that column exists; like every other plot
    # in this script, a missing Survived column must not abort the run.
    age_hue = "Survived" if "Survived" in df.columns else None
    fig, ax = plt.subplots()
    sns.histplot(data=df, x="Age", bins=30, kde=True, hue=age_hue, multiple="stack",
                 palette="crest" if age_hue else None, ax=ax)
    ax.set_title("Age distribution (stacked by Survival)" if age_hue else "Age distribution")
    save_fig(fig, "age_distribution_by_survival.png")
    plt.close(fig)

# 4) Fare distribution (log scale) and survival
if "Fare" in df.columns:
    fare_hue = "Survived" if "Survived" in df.columns else None
    fig, ax = plt.subplots()
    # A log axis cannot represent zero or negative fares, so keep strictly
    # positive values only (the comparison also drops NaN).
    df_plot_fare = df[df["Fare"] > 0]
    # log_scale=True makes seaborn compute the bins in log space and set the
    # axis scale itself; switching the axis to log after binning linearly
    # distorts the bar widths.
    sns.histplot(data=df_plot_fare, x="Fare", bins=40, kde=False, hue=fare_hue,
                 multiple="stack", log_scale=True, ax=ax)
    ax.set_title("Fare distribution by Survival (log x-scale)" if fare_hue
                 else "Fare distribution (log x-scale)")
    save_fig(fig, "fare_distribution_by_survival.png")
    plt.close(fig)

# 5) Survival rate by Pclass (mean of Survived per class, pre-aggregated)
if all(c in df.columns for c in ("Pclass", "Survived")):
    sr = df.groupby("Pclass")["Survived"].mean().reset_index()
    fig, ax = plt.subplots()
    sns.barplot(data=sr, x="Pclass", y="Survived", palette="Blues", ax=ax)
    ax.set(ylabel="Survival rate", title="Survival rate by Passenger Class")
    save_fig(fig, "survival_by_pclass.png")
    plt.close(fig)

# 6) Survival by Embarked (barplot computes the mean itself; no error bars)
if all(c in df.columns for c in ("Embarked", "Survived")):
    fig, ax = plt.subplots()
    sns.barplot(data=df, x="Embarked", y="Survived", estimator=np.mean, ci=None,
                palette="Set2", ax=ax)
    ax.set(ylabel="Survival rate", title="Survival rate by Embarked")
    save_fig(fig, "survival_by_embarked.png")
    plt.close(fig)

# 7) Family size effect (point per family size, no error bars)
if all(c in df.columns for c in ("FamilySize", "Survived")):
    fig, ax = plt.subplots()
    sns.pointplot(data=df, x="FamilySize", y="Survived", ci=None, ax=ax)
    ax.set(title="Survival rate vs Family Size")
    save_fig(fig, "survival_vs_family_size.png")
    plt.close(fig)

# 8) Title and survival (titles ordered from highest to lowest survival rate)
if all(c in df.columns for c in ("Title", "Survived")):
    title_rates = (
        df.groupby("Title")["Survived"]
        .mean()
        .sort_values(ascending=False)
        .reset_index()
    )
    fig, ax = plt.subplots()
    sns.barplot(data=title_rates, x="Title", y="Survived", palette="tab10", ax=ax)
    ax.set(ylabel="Survival rate", title="Survival rate by Title")
    save_fig(fig, "survival_by_title.png")
    plt.close(fig)

# 9) Correlation heatmap for numeric features
# Needs at least two numeric columns; the figure height grows with their count.
num_cols = list(df.select_dtypes(include="number").columns)
if len(num_cols) > 1:
    corr = df[num_cols].corr()
    fig, ax = plt.subplots(figsize=(8, max(4, len(num_cols) * 0.4)))
    sns.heatmap(corr, cmap="coolwarm", annot=True, fmt=".2f", ax=ax)
    ax.set(title="Correlation matrix (numeric features)")
    save_fig(fig, "correlation_numeric.png")
    plt.close(fig)

# 10) Pairplot sample (at most 500 rows to keep it fast)
# Candidate variables for the pairplot; only those present in df are used.
candidate = ["Age", "Fare", "FamilySize", "SibSp", "Parch", "Pclass", "HasCabin"]
vars_numeric = [name for name in candidate if name in df.columns]

# Coerce each variable to numeric and keep it only if it has more than 20
# non-null values and more than one distinct value.
# NOTE: the coercion is written back into df, so a categorical column such as
# Pclass is numeric again from this point on.
vars_filtered = []
for c in vars_numeric:
    df[c] = pd.to_numeric(df[c], errors="coerce")
    if df[c].notna().sum() > 20 and df[c].nunique() > 1:
        vars_filtered.append(c)

# Final plot columns; Survived is used as hue if present
if "Survived" in df.columns:
    hue_col = "Survived"
    plot_cols = [*vars_filtered, hue_col]
else:
    hue_col = None
    plot_cols = list(vars_filtered)

if len(vars_filtered) < 2 or (hue_col is not None and not df[hue_col].notna().any()):
    print("Not enough numeric variables or hue missing for pairplot. vars_filtered:", vars_filtered)
else:
    plot_df = df[plot_cols].dropna()
    plot_df = plot_df.sample(n=min(len(plot_df), 500), random_state=1)

    # Hue must be categorical; cap it at the 6 most frequent levels.
    if hue_col is not None:
        plot_df[hue_col] = plot_df[hue_col].astype("category")
        if plot_df[hue_col].nunique() > 6:
            top_levels = plot_df[hue_col].value_counts().nlargest(6).index
            plot_df = plot_df[plot_df[hue_col].isin(top_levels)]

    # diag_kind must be one of {'auto','hist','kde'}
    g = sns.pairplot(
        plot_df,
        vars=vars_filtered,
        hue=hue_col,
        palette="Set2",
        diag_kind="hist",
        corner=True,
        plot_kws={"alpha": 0.6, "s": 30},
        diag_kws={"alpha": 0.6},
    )
    g.fig.suptitle("Pairplot of selected numeric Titanic features", y=1.02)
    pairplot_path = os.path.join(OUT_DIR, "pairplot_selected.png")
    # Saved directly rather than via save_fig: bbox_inches="tight" keeps the
    # suptitle, which sits above the axes (y=1.02), inside the image.
    g.fig.savefig(pairplot_path, dpi=200, bbox_inches="tight")
    plt.close(g.fig)
    print("Saved:", pairplot_path)

# ---------- Summary statistics ----------
print("\nDescriptive statistics (numeric):")
print(df.describe().T)

print("\nSurvival rates by key categories:")
# The per-category rates need the target column; guard it like the plots do
# so a dataset without Survived still reaches the save step below.
if "Survived" in df.columns:
    for col in ["Sex", "Pclass", "Embarked", "Title"]:
        if col in df.columns:
            # Mean of Survived is the survival rate; show at most 10 groups, best first.
            print(f"\n{col}:\n", df.groupby(col)["Survived"].agg(["mean", "count"]).sort_values("mean", ascending=False).head(10))
else:
    print("Survived column not found; skipping survival-rate summary.")

# ---------- Save cleaned dataset ----------
# Written next to the plots; the row index is not stored.
clean_path = os.path.join(OUT_DIR, "titanic_cleaned.csv")
df.to_csv(clean_path, index=False)
print("\nCleaned dataset saved to:", clean_path)

print("\nAll plots saved to folder:", OUT_DIR)

Rows, cols: (891, 12)

Column names: ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

Sample rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85     