In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
pd.pandas.set_option("display.max_columns", None)
# Create Dataframe
df = pd.read_csv(r"C:\Users\Mahesh\Downloads\insuranceFraud_Dataset.csv")
# Print shape of dataset
print(df.shape)

(1000, 39)


In [2]:
df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,insured_sex,insured_education_level,insured_occupation,insured_hobbies,insured_relationship,capital-gains,capital-loss,incident_date,incident_type,collision_type,incident_severity,authorities_contacted,incident_state,incident_city,incident_location,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,10/17/2014,OH,250/500,1000,1406.91,0,466132,MALE,MD,craft-repair,sleeping,husband,53300,0,1/25/2015,Single Vehicle Collision,Side Collision,Major Damage,Police,SC,Columbus,9935 4th Drive,5,1,YES,1,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,6/27/2006,IN,250/500,2000,1197.22,5000000,468176,MALE,MD,machine-op-inspct,reading,other-relative,0,0,1/21/2015,Vehicle Theft,?,Minor Damage,Police,VA,Riverwood,6608 MLK Hwy,8,1,?,0,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,9/6/2000,OH,100/300,2000,1413.14,5000000,430632,FEMALE,PhD,sales,board-games,own-child,35100,0,2/22/2015,Multi-vehicle Collision,Rear Collision,Minor Damage,Police,NY,Columbus,7121 Francis Lane,7,3,NO,2,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,5/25/1990,IL,250/500,2000,1415.74,6000000,608117,FEMALE,PhD,armed-forces,board-games,unmarried,48900,-62400,1/10/2015,Single Vehicle Collision,Front Collision,Major Damage,Police,OH,Arlington,6956 Maple Drive,5,1,?,1,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,6/6/2014,IL,500/1000,1000,1583.91,6000000,610706,MALE,Associate,sales,board-games,unmarried,66000,-46000,2/17/2015,Vehicle Theft,?,Minor Damage,,NY,Arlington,3041 3rd Ave,20,1,NO,0,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [3]:
# Checking data types
print(df.dtypes)

months_as_customer               int64
age                              int64
policy_number                    int64
policy_bind_date                object
policy_state                    object
policy_csl                      object
policy_deductable                int64
policy_annual_premium          float64
umbrella_limit                   int64
insured_zip                      int64
insured_sex                     object
insured_education_level         object
insured_occupation              object
insured_hobbies                 object
insured_relationship            object
capital-gains                    int64
capital-loss                     int64
incident_date                   object
incident_type                   object
collision_type                  object
incident_severity               object
authorities_contacted           object
incident_state                  object
incident_city                   object
incident_location               object
incident_hour_of_the_day 

In [4]:
# Checking Duplicates
print("Number of duplicate rows:", df.duplicated().sum())

Number of duplicate rows: 0


In [5]:
# Removing duplicates
df.drop_duplicates(inplace=True)

In [6]:
# Remove policy_number from the dataset as it cannot used in Model Training
df.drop(columns=['policy_number'], inplace=True)

In [7]:
# Check for null values
nulls = df.columns[df.isnull().sum() > 0].tolist()
nulls

['authorities_contacted']

In [8]:
# Prcentage of null values in each column
pd.DataFrame(df[nulls].isnull().mean()*100)

Unnamed: 0,0
authorities_contacted,9.1


<!-- # Type of Features -->

In [9]:
# Numeric Features
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print("Numeric Features:", numeric_features)
print("Number of Numeric Features:", len(numeric_features))

Numeric Features: ['months_as_customer', 'age', 'policy_deductable', 'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'capital-gains', 'capital-loss', 'incident_hour_of_the_day', 'number_of_vehicles_involved', 'bodily_injuries', 'witnesses', 'total_claim_amount', 'injury_claim', 'property_claim', 'vehicle_claim', 'auto_year']
Number of Numeric Features: 17


In [10]:
# Categorical Features
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print("Categorical Features:", categorical_features)
print("Number of Categorical Features:", len(categorical_features))

Categorical Features: ['policy_bind_date', 'policy_state', 'policy_csl', 'insured_sex', 'insured_education_level', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_date', 'incident_type', 'collision_type', 'incident_severity', 'authorities_contacted', 'incident_state', 'incident_city', 'incident_location', 'property_damage', 'police_report_available', 'auto_make', 'auto_model', 'fraud_reported']
Number of Categorical Features: 21


In [11]:
X = df.drop('fraud_reported', axis=1)
y = df['fraud_reported']

In [12]:
# Manual encoding target column
y= np.where(y=='Y', 1,0)

In [13]:
# Check Skewness
X[numeric_features].skew(axis=0, skipna=True)

months_as_customer             0.362177
age                            0.478988
policy_deductable              0.477887
policy_annual_premium          0.004402
umbrella_limit                 1.806712
insured_zip                    0.816554
capital-gains                  0.478850
capital-loss                  -0.391472
incident_hour_of_the_day      -0.035584
number_of_vehicles_involved    0.502664
bodily_injuries                0.014777
witnesses                      0.019636
total_claim_amount            -0.594582
injury_claim                   0.264811
property_claim                 0.378169
vehicle_claim                 -0.621098
auto_year                     -0.048289
dtype: float64

In [14]:
skew_values = X[numeric_features].skew()
skewed_cols = skew_values[skew_values.abs() > 0.5]
skewed_cols

umbrella_limit                 1.806712
insured_zip                    0.816554
number_of_vehicles_involved    0.502664
total_claim_amount            -0.594582
vehicle_claim                 -0.621098
dtype: float64

In [15]:
# Positive skew: apply log1p
X['umbrella_limit'] = np.sqrt(X['umbrella_limit'])

In [16]:
X['number_of_vehicles_involved'] = np.sqrt(X['number_of_vehicles_involved'])

In [17]:
X['vehicle_claim'] = np.power(X['vehicle_claim'], 2)

In [18]:
X['policy_bind_date'] = pd.to_datetime(X['policy_bind_date'])
X['incident_date'] = pd.to_datetime(X['incident_date'])

In [33]:

policy_csl_order = ['100/300', '250/500', '500/1000']
insured_education_level_order = ["High School", "Associate", "College", "Masters", "JD", "MD", "PhD"]
incident_severity_order = ["Trivial Damage", "Minor Damage", "Major Damage", "Total Loss"]


or_cols = ['policy_csl', 'insured_education_level', 'incident_severity']
transform_columns = ['umbrella_limit', 'number_of_vehicles_involved', 'vehicle_claim']
oh_columns = ['policy_state', 'insured_sex', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_type', 'collision_type', 'authorities_contacted', 'incident_state', 'incident_city', 'incident_location', 'property_damage', 'police_report_available', 'auto_make', 'auto_model', 'fraud_reported']

In [21]:
# All categorical columns you provided
cat_cols = [
    'policy_state', 'policy_csl', 'insured_sex', 'insured_education_level',
    'insured_occupation', 'insured_hobbies', 'insured_relationship', 
    'incident_type', 'collision_type', 'incident_severity', 'authorities_contacted', 
    'incident_state', 'incident_city', 'incident_location', 'property_damage', 
    'police_report_available', 'auto_make', 'auto_model'
]

# Ordinal columns
or_columns = ['policy_csl', 'insured_education_level', 'incident_severity']

# Separate nominal columns by excluding ordinal columns
oh_columns = [col for col in cat_cols if col not in or_cols]

print("Ordinal Columns:", or_columns)
print("Nominal Columns:", oh_columns)


Ordinal Columns: ['policy_csl', 'insured_education_level', 'incident_severity']
Nominal Columns: ['policy_state', 'insured_sex', 'insured_occupation', 'insured_hobbies', 'insured_relationship', 'incident_type', 'collision_type', 'authorities_contacted', 'incident_state', 'incident_city', 'incident_location', 'property_damage', 'police_report_available', 'auto_make', 'auto_model']


In [32]:
from sklearn.impute import SimpleImputer  # handling missing values
from sklearn.preprocessing import StandardScaler  # handling featuer scaling
from sklearn.preprocessing import OrdinalEncoder   # ordinal encoding
from sklearn.pipeline import Pipeline    # pipeline
from sklearn.compose import ColumnTransformer

In [35]:
# Numerical Pipeline

num_pipeline = Pipeline(
    steps=[
        ('SimpleImputer',SimpleImputer(strategy='median')),
        ('StandardScaler',StandardScaler())
    ]
)

# Categorical Pipeline

cat_pipeline = Pipeline(
    steps=[
        ('SimpleImputer',SimpleImputer(strategy='most_frequent')),
        ('OrdinalEncoder',OrdinalEncoder(categories=[or_cols]))
    ]
)

# combine num_pipeline and cat_pipeline

preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,numeric_features),
    ('categorical_pipeline',cat_pipeline,categorical_features)
])

In [38]:
preprocessor.fit(df)

ValueError: Shape mismatch: if categories is an array, it has to be of shape (n_features,).