# In [None]:
import pandas as pd 
import numpy as np
import  matplotlib.pyplot as plt
import seaborn as sns


# Load the pipe-delimited data file. The relative path is resolved against
# the current working directory, not against this file's location.
df = pd.read_csv("../data/MachineLearningRating_v3.txt", sep="|")

# Data Understanding

# DataFrame.info() writes its report to stdout itself (it returns None).
df.info()

# describe() only *returns* a summary frame; as a bare mid-script expression
# its result was silently discarded, so print it explicitly.
print(df.describe(include='all'))

# Converting date

# Parse 'Date' into datetimes only when the column is present, so the rest of
# the script still runs on extracts that lack it.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])

# Checking missing values

# isnull().sum() only returns the per-column counts; print them so the result
# is visible instead of being discarded.
print(df.isnull().sum())

# Univariate Analysis

# Column groups reused by the plots further down the script.
num_cols = ['TotalPremium', 'TotalClaims', 'CustomValueEstimate']
cat_cols = ['Province', 'VehicleType', 'Gender']

# Histograms for numeric features: one 30-bin histogram with a KDE overlay
# per numeric column, each on its own figure.
for numeric_col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(df[numeric_col], kde=True, bins=30, ax=ax)
    ax.set_title(f"Distribution of {numeric_col}")
    fig.tight_layout()
    plt.show()


# Bar charts for categorical features: frequency of each category value,
# one figure per categorical column.
for category_col in cat_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    category_counts = df[category_col].value_counts()
    category_counts.plot(kind='bar', ax=ax)
    ax.set_title(f"Distribution of {category_col}")
    fig.tight_layout()
    plt.show()


# Bivariate / Multivariate Analysis

# Loss ratio (TotalClaims / TotalPremium), used below to gauge the efficiency
# and profitability of the insurance company.
# A zero premium has no defined ratio: plain division would yield +/-inf,
# which then dominates every group mean and correlation computed from this
# column. Mapping zero premiums to NaN makes those rows drop out of the
# NaN-skipping aggregations instead.
df['Loss_Ratio'] = df['TotalClaims'] / df['TotalPremium'].replace(0, np.nan)

# Group-wise loss ratios

# For each grouping column, report and plot the mean of the row-level
# Loss_Ratio values, sorted from highest to lowest.
group_cols = ['Province', 'VehicleType', 'Gender']
for col in group_cols:
    ratio_by_group = df.groupby(col)['Loss_Ratio'].mean().sort_values(ascending=False)
    # f-string so the actual column name is shown (the original printed the
    # literal text "{col}").
    print(f"Average Loss Ratio by {col}:\n", ratio_by_group)

    # Give every chart its own figure/axes: Series.plot otherwise draws onto
    # the current axes whenever a figure is still open, which would overlay
    # this chart on an earlier one.
    fig, ax = plt.subplots(figsize=(6, 4))
    ratio_by_group.plot(kind='bar', ax=ax, title=f'Avg Loss Ratio by {col}')
    fig.tight_layout()
    plt.show()

# Correlation matrix

# Pairwise correlations between the numeric columns and the derived
# Loss_Ratio, rendered as an annotated heatmap.
corr_cols = num_cols + ['Loss_Ratio']
corr_matrix = df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
fig.tight_layout()
plt.show()

# Trends over time: monthly totals of claims and premiums

# Only possible when a 'Date' column exists. Adds a 'Month' period column to
# df and sums claims and premiums per month.
if 'Date' in df.columns:
    month_period = df['Date'].dt.to_period('M')
    df['Month'] = month_period
    monthly = df.groupby('Month')[['TotalClaims', 'TotalPremium']].sum()

    # DataFrame.plot opens its own figure and returns the axes it drew on.
    trend_ax = monthly.plot(figsize=(10, 5), title='Monthly Claims vs Premiums')
    trend_ax.set_ylabel('Amount')
    trend_ax.figure.tight_layout()
    plt.show()

# Vehicle Make/Model Analysis

# Average claim amount per vehicle make, sorted from highest to lowest.
# Skipped entirely when the data has no 'VehicleMake' column.
if 'VehicleMake' in df.columns:
    make_claims = df.groupby('VehicleMake')['TotalClaims'].mean().sort_values(ascending=False)
    print("Top 5 Vehicle Makes by Avg Claims:\n", make_claims.head())
    print("Bottom 5 Vehicle Makes by Avg Claims:\n", make_claims.tail())

    # Draw on a dedicated figure: Series.plot otherwise reuses the current
    # axes whenever a figure is still open, which would put these bars on top
    # of the previous chart.
    fig, ax = plt.subplots(figsize=(8, 5))
    make_claims.head(10).plot(kind='barh', ax=ax, title='Top 10 Vehicle Makes - Avg Claims')
    fig.tight_layout()
    plt.show()

# Outlier Detection
    
# One horizontal box plot per numeric column, to expose extreme values.
for numeric_col in num_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[numeric_col], ax=ax)
    ax.set_title(f"Outlier Detection: {numeric_col}")
    fig.tight_layout()
    plt.show()

# Visualizations

# Loss ratio by province and gender combined: one box per (Province, Gender)
# pair, with gender encoded as hue.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="Province", y="Loss_Ratio", hue="Gender", ax=ax)
ax.set_title("Loss Ratio Distribution by Province and Gender")
# Rotate the province labels so they stay legible.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()



# Heatmap of claims by vehicle type and province

# Mean TotalClaims for every (VehicleType, Province) combination.
pivot = df.pivot_table(values='TotalClaims', index='VehicleType', columns='Province', aggfunc='mean')

# Create the figure explicitly, as the other plots in this script do. Without
# it sns.heatmap falls back to the current axes, so it could be drawn over a
# previous figure that is still open, and the annotated cells would be
# squeezed into the default figure size.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot, annot=True, fmt=".0f", cmap="YlGnBu", ax=ax)
ax.set_title("Avg Claim Amount by Vehicle Type and Province")
fig.tight_layout()
plt.show()


# Temporal trend line plot

# Monthly loss ratio computed from the monthly totals (sum of claims divided
# by sum of premiums), plotted against calendar time.
if 'Date' in df.columns:
    df['Month'] = df['Date'].dt.to_period('M')
    monthly = df.groupby('Month')[['TotalClaims', 'TotalPremium']].sum()
    monthly['Loss_Ratio'] = monthly['TotalClaims'].div(monthly['TotalPremium'])
    # Convert the period index to timestamps before plotting.
    monthly.index = monthly.index.to_timestamp()

    fig, ax = plt.subplots(figsize=(10, 4))
    monthly['Loss_Ratio'].plot(marker='o', ax=ax)
    ax.set_title("Monthly Loss Ratio Over Time")
    ax.set_ylabel("Loss Ratio")
    ax.set_xlabel("Month")
    ax.grid(True)
    fig.tight_layout()
    plt.show()

      
