# Full Insurance Claims Case Study
Complete EDA, Visualisations & Hypothesis Testing

In [None]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from scipy import stats

# Set the default figure size for every plot, then apply seaborn's default theme.
plt.rcParams.update({'figure.figsize': (10, 5)})
sns.set()


## Load Data

In [None]:

# Read the claims and customer-demographics tables, then attach the
# demographics to each claim. The left join keeps every claim row, even
# when no matching customer record exists.
claims = pd.read_csv('/mnt/data/claims.csv')
cust = pd.read_csv('/mnt/data/cust_demographics.csv')
data = pd.merge(claims, cust, how='left', on='customer_id')
data.head()


## Data Audit

In [None]:
# Data audit: display the dtype pandas inferred for each column of the merged frame.
data.dtypes

## Clean claim_amount

In [None]:

# Strip currency symbols and thousands separators, then convert to float.
# The pattern is a raw string: '\$' in a normal string literal is an invalid
# escape sequence and triggers a SyntaxWarning on recent Python versions.
data['claim_amount'] = (
    data['claim_amount']
    .replace(r'[\$,]', '', regex=True)
    .astype(float)
)
data['claim_amount'].head()


## Injury Unreported Flag

In [None]:

# Flag claims that involve an injury but have no police report (1 = flagged).
# NOTE(review): assumes injury_claim and police_report are 0/1 encoded -
# confirm against the raw data; text-coded columns would make every flag 0.
has_injury = data['injury_claim'].eq(1)
no_police_report = data['police_report'].eq(0)
data['injury_unreported_flag'] = (has_injury & no_police_report).astype(int)
data['injury_unreported_flag'].value_counts()


## Remove Duplicate Customers

In [None]:

# Keep only the most recent claim per customer.
data['claim_date'] = pd.to_datetime(data['claim_date'])
data = (
    data
    # Missing dates go first so that keep='last' never prefers an undated
    # claim over a dated one; a stable sort keeps same-day ties in their
    # original row order, making the result reproducible.
    .sort_values('claim_date', na_position='first', kind='mergesort')
    .drop_duplicates('customer_id', keep='last')
)
data.shape


## Missing Value Imputation

In [None]:

# Fill missing values column by column: the most frequent value for
# object (text) columns, the mean for everything else.
for col in data.columns:
    column = data[col]
    if column.dtype == 'object':
        modes = column.mode()
        if modes.empty:
            # Column is entirely missing: there is no mode to fill with.
            continue
        fill_value = modes.iloc[0]
    else:
        fill_value = column.mean()
    # Assign the result back instead of fillna(inplace=True) on data[col]:
    # the in-place form acts on a temporary object, warns on pandas 2.x and
    # does nothing once Copy-on-Write is enabled.
    data[col] = column.fillna(fill_value)
data.isna().sum()


## Age & Age Groups

In [None]:

# Age in completed years as of the 2018-10-01 reference date.
# Calendar arithmetic is used instead of days // 365, which ignores leap
# days and rounds people up a year shortly before their birthday.
# NOTE(review): if dob strings use two-digit years, pandas may parse some
# of them into the future and produce negative ages - confirm on the raw data.
data['dob'] = pd.to_datetime(data['dob'])
ref_date = pd.to_datetime('2018-10-01')
dob_month = data['dob'].dt.month
dob_day = data['dob'].dt.day
# True where the birthday falls after the reference date within the year.
birthday_pending = (dob_month > ref_date.month) | (
    (dob_month == ref_date.month) & (dob_day > ref_date.day)
)
data['age'] = ref_date.year - data['dob'].dt.year - birthday_pending.astype(int)

def age_cat(a):
    """Map an age in years to an age-group label.

    Returns 'Children' for ages below 18, 'Youth' for 18-29, 'Adult' for
    30-59 and 'Senior' for 60 and above. A missing age (None/NaN) yields
    NaN rather than falling through the comparisons and being labelled
    'Senior'.
    """
    if pd.isna(a):
        return np.nan
    if a < 18:
        return 'Children'
    if a < 30:
        return 'Youth'
    if a < 60:
        return 'Adult'
    return 'Senior'

# Label every customer with an age group and show how many fall in each.
data['age_group'] = data['age'].map(age_cat)
data['age_group'].value_counts()


## Q8 - Avg Amount by Segment

In [None]:
# Q8: mean claim amount within each customer segment.
by_segment = data.groupby('segment')
by_segment['claim_amount'].mean()

## Q9 - Total Claim Amount by Incident Cause for claims made ≥20 days before 1 Oct 2018

In [None]:

# Q9: total claim amount per incident cause, restricted to claims dated
# on or before the cutoff (20 days ahead of 2018-10-01).
reference_date = pd.to_datetime('2018-10-01')
cutoff = reference_date - pd.Timedelta(days=20)
early_claims = data.loc[data['claim_date'].le(cutoff)]
early_claims.groupby('incident_cause')['claim_amount'].sum()


## Q10 - Adults from TX, DE, AK with driver related issues

In [None]:

# Q10: number of adults from TX, DE or AK whose incident cause mentions "driver".
# na=False makes a missing incident_cause count as "not driver related";
# without it the mask contains NaN and boolean indexing raises.
driver_mask = data['incident_cause'].str.contains('Driver', case=False, na=False)
is_adult = data['age_group'] == 'Adult'
in_target_state = data['state'].isin(['TX', 'DE', 'AK'])
subset = data[is_adult & in_target_state & driver_mask]
subset.shape[0]


## Q11 Pie Chart - Claim Amount by Gender & Segment

In [None]:

# Q11: share of the total claim amount for each (gender, segment) pair.
pie_data = data.groupby(['gender', 'segment'])['claim_amount'].sum()
pie_data.plot.pie(autopct='%1.1f%%')
plt.ylabel('')  # drop the default y-axis label
plt.show()


## Q12 Gender with Most Driver Related Claims

In [None]:

# Q12: total claim amount of driver-related claims, split by gender.
# NOTE(review): this compares summed amounts, not the number of claims -
# confirm which metric the question intends.
driver_claims = data.loc[driver_mask]
drv = driver_claims.groupby('gender')['claim_amount'].sum()
sns.barplot(x=drv.index, y=drv.values)
plt.show()
drv


## Q13 Age group with max fraudulent claims

In [None]:

# Q13: total claim amount of fraudulent claims, split by age group.
# NOTE(review): assumes `fraudulent` is 0/1 encoded - confirm against the raw data.
fraud_rows = data.loc[data['fraudulent'].eq(1)]
fraud = fraud_rows.groupby('age_group')['claim_amount'].sum()
sns.barplot(x=fraud.index, y=fraud.values)
plt.show()
fraud


## Q14 Monthly Trend of Total Claim Amount

In [None]:

# Q14: total claim amount per calendar month, drawn as a line chart.
data['month'] = data['claim_date'].dt.to_period('M')
month_sum = data.groupby('month')['claim_amount'].sum()
month_sum.plot.line(marker='o')
plt.show()


## Q15 Facet Bar Chart: Avg Claim by Gender & Age Group (Fraud vs Non-Fraud)

In [None]:

import seaborn as sns

# Q15: bar heights are seaborn's default estimate of claim_amount per gender,
# coloured by age group, with one panel per value of `fraudulent`.
facet_options = dict(
    kind='bar',
    x='gender',
    y='claim_amount',
    hue='age_group',
    col='fraudulent',
)
g = sns.catplot(data=data, **facet_options)
plt.show()


# Hypothesis Testing

## Q16 Similarity in claim amounts between genders (t-test)

In [None]:

# Q16: two-sample t-test on claim amounts of rows labelled 'M' vs 'F';
# equal_var=False selects Welch's test (no equal-variance assumption).
m = data.loc[data['gender'].eq('M'), 'claim_amount']
f = data.loc[data['gender'].eq('F'), 'claim_amount']
stats.ttest_ind(m, f, equal_var=False)


## Q17 Relationship between age category and segment (Chi-square)

In [None]:

# Q17: chi-square test of independence on the age-group x segment count table.
ct = pd.crosstab(index=data['age_group'], columns=data['segment'])
stats.chi2_contingency(ct)


## Q18 Rise in current year claim amount vs 10000 baseline

In [None]:

# Q18: test whether current-year claim amounts are higher than the 10000 baseline.
# Only current-year claims are tested, and the alternative is one-sided
# ('greater') because the question asks about a rise, not any difference.
# NOTE(review): "current year" is taken as the calendar year of the
# 2018-10-01 reference date used elsewhere in this notebook - confirm
# whether a fiscal year is intended. `alternative` requires SciPy >= 1.6.
current_year = pd.to_datetime('2018-10-01').year
current_claims = data.loc[data['claim_date'].dt.year == current_year, 'claim_amount']
stats.ttest_1samp(current_claims, 10000, alternative='greater')


## Q19 Difference between age groups claim amounts (ANOVA)

In [None]:

# Q19: one-way ANOVA across age groups - one array of claim amounts per group.
groups = [
    members['claim_amount'].values
    for _label, members in data.groupby('age_group')
]
stats.f_oneway(*groups)


## Q20 Relationship between number of policies and claimed amount

In [None]:

# Q20: Pearson correlation between total_policy_claims and claim_amount.
policy_claims = data['total_policy_claims']
claimed_amount = data['claim_amount']
stats.pearsonr(policy_claims, claimed_amount)
