In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from uuid import uuid4

In [3]:
# Set plot style
# Apply one Matplotlib style sheet globally so every figure below shares a look.
# NOTE(review): the 'seaborn-v0_8' name presumably requires Matplotlib >= 3.6
# (older releases shipped it as 'seaborn') -- confirm against the pinned version.
plot_style = 'seaborn-v0_8'
plt.style.use(plot_style)

In [5]:
# Load data
# Read the sample CSV into `df`, then replace TransactionMonth with its
# datetime-parsed version using an explicit year-month format.
# NOTE(review): the missing-value listing recorded later in this notebook reports
# TransactionMonth as null for all 100 rows -- confirm the source column is
# populated and that '%Y-%m' matches its layout.
csv_path = '../data/MachineLearningRating_v3_sample.csv'
df = pd.read_csv(csv_path)
transaction_month = pd.to_datetime(df['TransactionMonth'], format='%Y-%m')
df['TransactionMonth'] = transaction_month

In [6]:
# Data Summarization
# Print count/mean/std/min/quartiles/max for the four monetary columns.
summary_cols = ['TotalPremium', 'TotalClaims', 'SumInsured', 'CustomValueEstimate']
summary_stats = df[summary_cols].describe()
# A single print with a newline separator emits the heading and the table on
# separate lines, exactly as two consecutive prints would.
print("Descriptive Statistics:", summary_stats, sep="\n")

Descriptive Statistics:
       TotalPremium  TotalClaims    SumInsured  CustomValueEstimate
count    100.000000        100.0  1.000000e+02            57.000000
mean      47.317669          0.0  7.777350e+05        127029.824561
std      130.093275          0.0  1.718078e+06         30008.034430
min        0.000000          0.0  1.000000e-02         52700.000000
25%        0.000000          0.0  3.500000e+03        119300.000000
50%        1.108860          0.0  7.500000e+03        119300.000000
75%       25.280447          0.0  1.729500e+05        146800.000000
max      709.980000          0.0  5.000000e+06        161000.000000


In [7]:
# Data Quality Assessment
# Count null entries per column and print them under a heading.
null_counts = df.isna().sum()
print("\nMissing Values:", null_counts, sep="\n")


Missing Values:
UnderwrittenCoverID           0
PolicyID                      0
TransactionMonth            100
IsVATRegistered               0
Citizenship                   0
LegalType                     0
Title                         0
Language                      0
Bank                          0
AccountType                   0
MaritalStatus                 0
Gender                        0
Country                       0
Province                      0
PostalCode                    0
MainCrestaZone                0
SubCrestaZone                 0
ItemType                      0
mmcode                        0
VehicleType                   0
RegistrationYear              0
make                          0
Model                         0
Cylinders                     0
cubiccapacity                 0
kilowatts                     0
bodytype                      0
NumberOfDoors                 0
VehicleIntroDate              0
CustomValueEstimate          43
AlarmImmobiliser       

In [8]:
# Remove or impute missing values
# Here: keep only rows in which every one of the critical fields is present.
required_fields = ['TotalPremium', 'TotalClaims', 'Province', 'Gender']
df = df.dropna(subset=required_fields)

In [10]:
# Univariate Analysis
# 50-bin histogram of TotalClaims with a KDE overlay, saved as a PNG and then
# closed so the figure is not kept open after the cell finishes.
claims = df['TotalClaims']
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(claims, bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Total Claims')
ax.set_xlabel('Total Claims (Rand)')
fig.savefig('../reports/total_claims_histogram.png')
plt.close(fig)

In [11]:
# Bivariate Analysis: Loss Ratio by Province
# LossRatio is TotalClaims / TotalPremium, computed per row and stored on `df`.
# Rows whose premium is exactly zero are masked to NaN before dividing:
# otherwise claims / 0 produces +/-inf and 0 / 0 produces NaN, and a single inf
# makes any average taken over the column meaningless.
premium = df['TotalPremium']
df['LossRatio'] = df['TotalClaims'] / premium.where(premium != 0)

# Bar chart of LossRatio per Province, saved as a PNG and then closed.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Province', y='LossRatio', data=df, ax=ax)
ax.set_title('Loss Ratio by Province')
# Tilt the province names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig('../reports/loss_ratio_province.png')
plt.close(fig)

In [12]:
# Claim Frequency by Vehicle Type
# Claim frequency = fraction of rows in each VehicleType with TotalClaims > 0.
# The boolean flag is averaged per group directly instead of through a
# per-group lambda. The result keeps the 'TotalClaims' column name so any
# existing reference to claim_freq['TotalClaims'] continues to work.
has_claim = df['TotalClaims'].gt(0)
claim_freq = has_claim.groupby(df['VehicleType']).mean().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='VehicleType', y='TotalClaims', data=claim_freq, ax=ax)
ax.set_title('Claim Frequency by Vehicle Type')
# The plotted column is a proportion, not a claim amount; without an explicit
# label the axis would read 'TotalClaims'.
ax.set_ylabel('Claim Frequency (share of rows with TotalClaims > 0)')
# Tilt the vehicle-type names so long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.savefig('../reports/claim_freq_vehicle_type.png')
plt.close(fig)

In [13]:
# Temporal Trend of Total Claims
# Sum TotalClaims within each TransactionMonth, then draw the totals as a line.
claims_by_month = df.groupby('TransactionMonth')['TotalClaims']
monthly_claims = claims_by_month.sum().reset_index()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x='TransactionMonth', y='TotalClaims', data=monthly_claims, ax=ax)
ax.set_title('Total Claims Over Time')
fig.savefig('../reports/total_claims_trend.png')
plt.close(fig)

In [14]:
# Outlier Detection
# Horizontal box plot of TotalClaims, saved as a PNG and then closed.
claims = df['TotalClaims']
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=claims, ax=ax)
ax.set_title('Box Plot of Total Claims')
fig.savefig('../reports/total_claims_boxplot.png')
plt.close(fig)

In [15]:
# Correlation Matrix
# Pairwise correlations between the four monetary columns, drawn as an
# annotated heatmap on a diverging colour map.
corr_cols = ['TotalPremium', 'TotalClaims', 'SumInsured', 'CustomValueEstimate']
corr = df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
fig.savefig('../reports/correlation_matrix.png')
plt.close(fig)