In [None]:

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
                         
# Configure plot settings
%matplotlib inline
sns.set_style('whitegrid')  # Changed from plt.style.use('seaborn')
plt.rcParams['figure.figsize'] = [10, 6]

# Load the dataset.
# The CSV location can be overridden with the INSURANCE_DATA_PATH environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset, the original local path is used.
data_path = os.environ.get(
    'INSURANCE_DATA_PATH',
    'C:/Users/Denny/insurance-risk-modeling/data/processed/cleaned_insurance_data.csv',
)
df = pd.read_csv(data_path)

# Convert categorical columns to category type
categorical_columns = ['Gender', 'Driving_Experience', 'Vehicle_Type', 'Region', 'Accident']

# Fail early with a readable message if the file does not have the expected
# columns, instead of a bare KeyError from the conversion below.
missing_columns = [name for name in categorical_columns if name not in df.columns]
if missing_columns:
    raise KeyError(
        f'Expected categorical columns missing from {data_path}: {missing_columns}'
    )

# One astype call with a column -> dtype mapping converts all columns at once.
df = df.astype({name: 'category' for name in categorical_columns})

# Display first few rows
df.head()


## Relationship Analysis with Target Variable (Accident)

In [None]:

# Column every feature is compared against
target_var = 'Accident'

# Numeric features to plot (the target is categorical, so it is not listed here)
numerical_columns = ['Age', 'Previous_Accidents', 'Annual_Mileage', 'Premium']

# One figure per numeric feature: boxplot on the left, violin plot on the
# right, both split by the target variable.
for feature in numerical_columns:
    fig, (box_ax, violin_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: quartiles and outliers per target class
    sns.boxplot(x=target_var, y=feature, data=df, ax=box_ax)
    box_ax.set_title(f'Boxplot of {feature} by {target_var}')
    box_ax.set_xlabel(target_var)
    box_ax.set_ylabel(feature)

    # Right panel: full distribution shape per target class
    sns.violinplot(x=target_var, y=feature, data=df, ax=violin_ax)
    violin_ax.set_title(f'Violin Plot of {feature} by {target_var}')
    violin_ax.set_xlabel(target_var)
    violin_ax.set_ylabel(feature)

    fig.tight_layout()
    plt.show()


## Bar Plots for Categorical Variables by Target Variable

In [None]:

# Categorical features to compare against the target; the target itself is
# left out of this list. Both names are (re)assigned here and are read again
# by the crosstab cell that follows.
categorical_columns = ['Gender', 'Driving_Experience', 'Vehicle_Type', 'Region']
target_var = 'Accident'

# One grouped count plot per categorical feature, with bars split by target class.
for feature in categorical_columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(data=df, x=feature, hue=target_var, ax=ax)

    # Tilt the category names so long labels do not overlap
    plt.xticks(rotation=45)

    # Title and axis labels
    ax.set_title(f'Distribution of {feature} by {target_var}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Count')

    # Annotate every bar with its count (one container per hue level)
    for bars in ax.containers:
        ax.bar_label(bars)

    fig.tight_layout()
    plt.show()


## Crosstabulation Analysis for Categorical Variables

In [None]:

# Crosstabs for each categorical variable with the target variable.
# For every feature two tables are printed: raw counts, and row-wise
# percentages (each feature level sums to 100 across the target classes).
#
# The target is skipped explicitly: `categorical_columns` is also defined in
# the data-loading cell with 'Accident' included, so running cells out of order
# would otherwise tabulate the target against itself.
for col in categorical_columns:
    if col == target_var:
        continue

    print(f"\nCrosstab for {col} vs {target_var}:\n")

    # Raw counts
    ct_counts = pd.crosstab(df[col], df[target_var])
    print("Counts:")
    print(ct_counts)
    print("\n")

    # Percentages: normalize='index' divides each row by its row total
    ct_pct = pd.crosstab(df[col], df[target_var], normalize='index') * 100
    print("Percentages (%):")
    print(ct_pct.round(2))
    print("\n" + "="*50)
