# Exploratory Data Analysis - Insurance Dataset

#### Step 0. Environment Setup

Purpose: Import all required libraries for data manipulation and visualization.
- Optional installation commands are left commented out to avoid accidental execution.

In [None]:
# Optional: install required packages
# !pip install seaborn

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display


#### Step 1. Load Dataset

Purpose: Load the prepared insurance dataset to be analyzed.
- This assumes preprocessing has already been completed upstream.

In [None]:
# Read the prepared insurance CSV, asking pandas to parse the two date
# columns while loading rather than leaving them as plain strings.
df = pd.read_csv(
    '../../data/output/insurance_new.csv',
    parse_dates=['Date_birth', 'Date_driving_licence'],
)


#### Step 2. Inspect Dataset Shape

Purpose: Understand the size of the dataset.
- This immediately sets expectations around scale and feasibility of analysis.

In [None]:
 
# Report the dataset dimensions, formatted with thousands separators.
row_count, column_count = df.shape
print(f"Dataset Shape: {row_count:,} rows and {column_count:,} columns")


#### Step 3. Preview the Data

Purpose: Visually inspect the first few rows to confirm schema and basic sanity.

In [None]:
# Render the leading rows of the frame as a rich notebook table.
first_rows = df.head()
display(first_rows)


#### Step 4. Dataset Structure and Data Types

Purpose: Examine column data types and identify potential type issues early.
- E.g., numeric values stored as strings or unexpected nulls.

In [None]:
# Print the per-column summary (dtype and non-null count) for the dataset.
# verbose/show_counts are forced on because, with default arguments, pandas
# drops the per-column listing and the non-null counts once the frame exceeds
# the display.max_info_columns / display.max_info_rows thresholds. That would
# hide exactly the dtype and null problems this step is meant to surface.
df.info(verbose=True, show_counts=True)


#### Step 5. Statistical Summary

Purpose: Review central tendency and spread of numerical variables.
- This helps detect outliers, skewness, and invalid ranges.

In [None]:
# Summary statistics produced by DataFrame.describe() with default settings.
summary_stats = df.describe()
display(summary_stats)


#### Step 6. Duplicate Row Analysis

Purpose: Identify whether duplicated records exist, which may bias modeling results.

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence of each
# record is not counted as a duplicate).
duplicate_count = df.duplicated().sum()
print(f"Number of duplicated rows: {duplicate_count:,}")


#### Step 7. Missing Value Analysis (Tabular)

Purpose: Quantify missing values per column before visual inspection.

In [None]:
# Per-column count of missing entries. The Series is left as the cell's final
# expression so the notebook renders it.
missing_per_column = df.isna().sum()
missing_per_column


#### Step 8. Missing Value Visualization

Purpose: Visualize missingness patterns to detect structural gaps.
- E.g., entirely empty columns or fields that tend to be missing together.

In [None]:
# Draw the boolean missingness mask as a heatmap: one heatmap row per record,
# one heatmap column per feature, coloured by whether the value is missing.
missing_mask = df.isnull()
_, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(missing_mask, cbar=False, ax=ax)
ax.set_title("Missing Values Heatmap - Insurance Dataset")
plt.show()


#### Step 9. Categorical Feature Distribution

Purpose: Analyze class balance for categorical variables.
- Percent-based plots make imbalance immediately visible.

In [None]:
# Plot, for every object-dtype column, the percentage share of each category.
categorical_cols = df.select_dtypes(include=["object"]).columns

for col in categorical_cols:
    # Bars are ordered from the most to the least frequent category.
    category_order = df[col].value_counts().index

    _, ax = plt.subplots(figsize=(6, 5))
    sns.countplot(data=df, x=col, order=category_order, stat="percent", ax=ax)

    ax.set_title(f"Distribution of {col} (%)")
    ax.set_ylabel("Percentage")
    plt.setp(ax.get_xticklabels(), rotation=45)

    # Write each bar's percentage just above its top edge (3 pt offset).
    for bar in ax.patches:
        pct = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f"{pct:.1f}%",
            xy=(bar_center, pct),
            xytext=(0, 3),
            textcoords="offset points",
            ha="center",
            va="bottom",
            fontsize=9,
        )

    plt.show()


#### Step 10. Numerical Feature Distribution

Purpose: Inspect distribution shape, skewness, potential outliers, and correlation
for all numerical variables.

In [None]:
# Select columns by *including* numeric dtypes rather than excluding object
# and datetime: an exclusion list lets bool, category, timedelta and
# string-typed columns slip through, and those can break or distort the
# histogram/KDE, the boxplot and the correlation matrix below.
numerical_cols = df.select_dtypes(include="number").columns

for col in numerical_cols:
    # Histogram with a KDE overlay to show distribution shape and skewness.
    plt.figure(figsize=(10, 4))
    sns.histplot(data=df, x=col, kde=True)
    plt.title(f"Distribution of {col}")
    plt.show()

    # Boxplot to make potential outliers visible.
    plt.figure(figsize=(6, 5))
    sns.boxplot(data=df, y=col)
    plt.title(f"Boxplot of {col}")
    plt.show()

# Pairwise correlation needs at least two numeric columns; with none the
# correlation matrix is empty and the heatmap call fails.
if len(numerical_cols) >= 2:
    corr_matrix = df[numerical_cols].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm")
    plt.title("Correlation Matrix of Numerical Features")
    plt.show()
else:
    print("Correlation matrix skipped: fewer than two numerical columns.")