In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,mean_squared_error,mean_absolute_error,r2_score
import mlflow 
from mlflow.models import infer_signature
import os
import s3fs


# Apply seaborn's default theme, then switch to the "paper" plotting context.
sns.set_theme()
sns.set_context("paper")

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices — narrow the filter if any of those matter.
warnings.filterwarnings("ignore")

In [92]:
# Load the raw claims CSV from S3 through an s3fs filesystem handle.
# No keys are passed to S3FileSystem(), so credentials come from the environment.
s3= s3fs.S3FileSystem()

file_path= "s3://insurance-claims-deploy/raw-data/Insurance claims data.csv"

# Open the object in binary mode and let pandas parse it as CSV.
with s3.open (file_path,"rb") as f:
    data= pd.read_csv(f)
# Preview the first rows.
data.head()

Unnamed: 0,policy_id,subscription_length,vehicle_age,customer_age,region_code,region_density,segment,model,fuel_type,max_torque,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,claim_status
0,POL045360,9.3,1.2,41,C8,8794,C2,M4,Diesel,250Nm@2750rpm,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,3,0
1,POL016745,8.2,1.8,35,C2,27003,C1,M9,Diesel,200Nm@1750rpm,...,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,4,0
2,POL007194,9.5,0.2,44,C8,8794,C2,M4,Diesel,250Nm@2750rpm,...,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,3,0
3,POL018146,5.2,0.4,44,C10,73430,A,M1,CNG,60Nm@3500rpm,...,No,No,No,Yes,No,No,No,Yes,0,0
4,POL049011,10.1,1.0,56,C13,5410,B2,M5,Diesel,200Nm@3000rpm,...,No,Yes,Yes,Yes,No,No,Yes,Yes,5,0


In [93]:
# Summary statistics for the numeric columns of the raw frame.
data.describe()

Unnamed: 0,subscription_length,vehicle_age,customer_age,region_density,airbags,displacement,cylinder,turning_radius,length,width,gross_weight,ncap_rating,claim_status
count,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0,58592.0
mean,6.111688,1.388473,44.823935,18826.858667,3.137066,1162.355851,3.626963,4.852893,3850.476891,1672.233667,1385.276813,1.75995,0.063968
std,4.14279,1.134413,6.935604,17660.174792,1.832641,266.304786,0.483616,0.228061,311.457119,112.089135,212.423085,1.389576,0.244698
min,0.0,0.0,35.0,290.0,1.0,796.0,3.0,4.5,3445.0,1475.0,1051.0,0.0,0.0
25%,2.1,0.4,39.0,6112.0,2.0,796.0,3.0,4.6,3445.0,1515.0,1185.0,0.0,0.0
50%,5.7,1.2,44.0,8794.0,2.0,1197.0,4.0,4.8,3845.0,1735.0,1335.0,2.0,0.0
75%,10.4,2.2,49.0,27003.0,6.0,1493.0,4.0,5.0,3995.0,1755.0,1510.0,3.0,0.0
max,14.0,20.0,75.0,73430.0,6.0,1498.0,4.0,5.2,4300.0,1811.0,1720.0,5.0,1.0


#### Data Cleaning 

In [97]:
# Add a human-readable copy of the claim_status label: 1 -> 'Yes', 0 -> 'No'.
data['claim_status_text']= data["claim_status"].map({1:'Yes', 0:'No'})

In [99]:
# Function to encode a categorical Yes/No variable as an integer.
def car(x):
    """Encode a single flag value as 0 or 1.

    Args:
        x: One cell value, normally the string "Yes" or "No".

    Returns:
        0 when ``x`` is the string "No" or is already the encoded value 0;
        1 for every other value.
    """
    if isinstance(x, str):
        return 0 if x == "No" else 1
    # Non-string input means the value was already encoded (or is missing).
    # Keep an encoded 0 as 0 so that applying the function twice to the same
    # column does not turn every 0 into 1.
    return 0 if x == 0 else 1

In [101]:
# Flag columns to convert to 0/1 integers with car().
# Each column appears exactly once: listing a column twice would feed its
# already-encoded integers back through car() on the second pass.
# NOTE(review): "rear_brakes_type" does not follow the is_* naming of the other
# flags — if it holds values other than Yes/No, car() will not distinguish
# them. Confirm its values and encode it separately if needed.
category= ["is_esc","is_adjustable_steering","is_power_door_locks","is_parking_sensors",
           "is_parking_camera","rear_brakes_type","is_front_fog_lights","is_rear_window_wiper","is_rear_window_washer","is_tpms",
           'is_rear_window_defogger', 'is_brake_assist',"is_central_locking","is_power_steering","is_driver_seat_height_adjustable",
           "is_day_night_rear_view_mirror","is_ecw","is_speed_alert"]

for flag_col in category:
    # Skip columns that are already numeric so that re-running this cell is a
    # no-op instead of re-encoding values that were converted earlier.
    if pd.api.types.is_numeric_dtype(data[flag_col]):
        continue
    data[flag_col] = data[flag_col].apply(car)

#### Feature Engineering

In [105]:
# One-hot encode the remaining categorical columns. get_dummies() handles
# every name in `column` in a single call, so no loop over the list is needed.
column=["segment","model","fuel_type","max_torque","max_power","engine_type","transmission_type","steering_type"]
data_encoded = pd.get_dummies(data, columns=column, drop_first=False)

# Newer pandas returns the dummy columns as bool; store them as 0/1 integers.
bool_columns = data_encoded.select_dtypes(include="bool").columns
data_encoded = data_encoded.astype({name: int for name in bool_columns})


In [107]:
# Inspect the one-hot encoded frame.
data_encoded

Unnamed: 0,policy_id,subscription_length,vehicle_age,customer_age,region_code,region_density,airbags,is_esc,is_adjustable_steering,is_tpms,...,engine_type_F8D Petrol Engine,engine_type_G12B,engine_type_K Series Dual jet,engine_type_K10C,engine_type_i-DTEC,transmission_type_Automatic,transmission_type_Manual,steering_type_Electric,steering_type_Manual,steering_type_Power
0,POL045360,9.3,1.2,41,C8,8794,6,1,1,1,...,0,0,0,0,0,1,0,0,0,1
1,POL016745,8.2,1.8,35,C2,27003,2,0,1,0,...,0,0,0,0,1,0,1,1,0,0
2,POL007194,9.5,0.2,44,C8,8794,6,1,1,1,...,0,0,0,0,0,1,0,0,0,1
3,POL018146,5.2,0.4,44,C10,73430,2,0,0,0,...,1,0,0,0,0,0,1,0,0,1
4,POL049011,10.1,1.0,56,C13,5410,2,0,1,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58587,POL019269,10.6,2.6,48,C5,34738,2,0,1,0,...,0,0,1,0,0,0,1,1,0,0
58588,POL001254,2.3,2.2,37,C3,4076,6,1,1,1,...,0,0,0,0,0,1,0,0,0,1
58589,POL019859,6.6,2.2,35,C8,8794,2,0,1,0,...,0,0,1,0,0,0,1,1,0,0
58590,POL014061,4.1,3.6,44,C8,8794,2,0,1,0,...,0,0,1,0,0,0,1,1,0,0


In [109]:
## Drop columns not needed for machine learning
dropped_columns=['policy_id', 'region_code',"claim_status_text"]

# Rebind instead of dropping in place, and ignore names that are already gone,
# so that re-running this cell does not raise KeyError.
data_encoded = data_encoded.drop(columns=dropped_columns, errors="ignore")

In [115]:
# Inspect the encoded frame as used for the training export.
data_encoded

Unnamed: 0,subscription_length,vehicle_age,customer_age,region_density,airbags,is_esc,is_adjustable_steering,is_tpms,is_parking_sensors,is_parking_camera,...,engine_type_F8D Petrol Engine,engine_type_G12B,engine_type_K Series Dual jet,engine_type_K10C,engine_type_i-DTEC,transmission_type_Automatic,transmission_type_Manual,steering_type_Electric,steering_type_Manual,steering_type_Power
0,9.3,1.2,41,8794,6,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
1,8.2,1.8,35,27003,2,0,1,0,1,1,...,0,0,0,0,1,0,1,1,0,0
2,9.5,0.2,44,8794,6,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
3,5.2,0.4,44,73430,2,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,1
4,10.1,1.0,56,5410,2,0,1,0,1,0,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58587,10.6,2.6,48,34738,2,0,1,0,1,0,...,0,0,1,0,0,0,1,1,0,0
58588,2.3,2.2,37,4076,6,1,1,1,1,1,...,0,0,0,0,0,1,0,0,0,1
58589,6.6,2.2,35,8794,2,0,1,0,1,0,...,0,0,1,0,0,0,1,1,0,0
58590,4.1,3.6,44,8794,2,0,1,0,1,0,...,0,0,1,0,0,0,1,1,0,0


In [111]:
import boto3
from io import StringIO
import csv

In [137]:
# Upload the encoded training frame to S3 as a CSV object.
# Credentials are deliberately NOT written in the notebook: boto3 resolves them
# from its default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables, the shared ~/.aws/credentials file, or an attached IAM role).
# NOTE(review): the key pair that was previously hardcoded in this cell should
# be treated as leaked and rotated.
s3 = boto3.resource(
    "s3",
    region_name="us-east-1",
)

# Serialise the frame to CSV in memory (no index column).
csv_buffer = StringIO()
data_encoded.to_csv(csv_buffer, index=False)

# Destination bucket and object key for the training data.
BUCKET = "insurance-claims-deploy"
s3_key = "training_data/train.csv"

# Upload the CSV to S3.
s3.Object(BUCKET, s3_key).put(Body=csv_buffer.getvalue())

print(f"CSV data uploaded to S3 bucket '{BUCKET}' with key '{s3_key}'.")

CSV data uploaded to S3 bucket 'insurance-claims-deploy' with key 'training_data/train.csv'.
