In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Load the raw insurance-fraud table from the CSV next to the notebook.
data = pd.read_csv(filepath_or_buffer='insuranceFraud.csv')

In [3]:
# Preview the first five rows of the loaded table.
data.head(5)

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported
0,328,48,521585,10/17/2014,OH,250/500,1000,1406.91,0,466132,...,2,YES,71610,6510,13020,52080,Saab,92x,2004,Y
1,228,42,342868,6/27/2006,IN,250/500,2000,1197.22,5000000,468176,...,0,?,5070,780,780,3510,Mercedes,E400,2007,Y
2,134,29,687698,9/6/2000,OH,100/300,2000,1413.14,5000000,430632,...,3,NO,34650,7700,3850,23100,Dodge,RAM,2007,N
3,256,41,227811,5/25/1990,IL,250/500,2000,1415.74,6000000,608117,...,2,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y
4,228,44,367455,6/6/2014,IL,500/1000,1000,1583.91,6000000,610706,...,1,NO,6500,1300,650,4550,Accura,RSX,2009,N


In [4]:
# The file marks missing values with a literal '?'; turn those into NaN.
data = data.replace(to_replace='?', value=np.nan)

In [5]:
# List the column names of the table.
data.columns


Index(['months_as_customer', 'age', 'policy_number', 'policy_bind_date',
       'policy_state', 'policy_csl', 'policy_deductable',
       'policy_annual_premium', 'umbrella_limit', 'insured_zip', 'insured_sex',
       'insured_education_level', 'insured_occupation', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_date', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_location', 'incident_hour_of_the_day',
       'number_of_vehicles_involved', 'property_damage', 'bodily_injuries',
       'witnesses', 'police_report_available', 'total_claim_amount',
       'injury_claim', 'property_claim', 'vehicle_claim', 'auto_make',
       'auto_model', 'auto_year', 'fraud_reported'],
      dtype='object')

In [6]:
# Count distinct values in 'policy_state'.
data['policy_state'].nunique()

3

In [7]:
# Count distinct values in 'auto_model'.
data['auto_model'].nunique()

39

In [8]:
# Count distinct values in 'insured_relationship'.
data['insured_relationship'].nunique()

6

In [9]:
# Frequency of each occupation category.
data['insured_occupation'].value_counts()

insured_occupation
machine-op-inspct    93
prof-specialty       85
tech-support         78
sales                76
exec-managerial      76
craft-repair         74
transport-moving     72
other-service        71
priv-house-serv      71
armed-forces         69
adm-clerical         65
protective-serv      63
handlers-cleaners    54
farming-fishing      53
Name: count, dtype: int64

In [10]:
# Count distinct values in 'insured_education_level'.
data['insured_education_level'].nunique()

7

In [11]:
# Chi-square test of independence between a categorical feature and the fraud label.
# The feature name is kept in one variable so the printed conclusion always names
# the column that was actually tested (the message previously said "insured occupation").
feature = 'insured_zip'

# Create the contingency table
contingency_table = pd.crosstab(data[feature], data['fraud_reported'])

print(contingency_table)

# Perform the Chi-square test
# NOTE(review): the recorded table has 995 rows for 1000 records, i.e. almost one
# record per ZIP code; with cell counts this small the chi-square approximation is
# unlikely to be meaningful -- treat the result with caution.
from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(contingency_table)

alpha = 0.05
if p < alpha:
    print(f"There is a significant association between {feature} and fraud status (reject H0)")
else:
    print(f"There is no significant association between {feature} and fraud status (fail to reject H0)")

fraud_reported  N  Y
insured_zip         
430104          1  0
430141          0  1
430232          1  0
430380          1  0
430567          1  0
...            .. ..
620737          1  0
620757          1  0
620819          0  1
620869          1  0
620962          1  0

[995 rows x 2 columns]
There is no significant association between insured occupation and fraud status (fail to reject H0)


In [12]:
# Chi-square test of independence between the insured's relationship and the fraud label.
# The feature name is kept in one variable so the printed conclusion names the column
# that was actually tested (the message previously said "insured occupation").
feature = 'insured_relationship'

contingency_table = pd.crosstab(data[feature], data['fraud_reported'])

print(contingency_table)

# Perform the Chi-square test
from scipy.stats import chi2_contingency
chi2, p, dof, expected = chi2_contingency(contingency_table)

alpha = 0.05
if p < alpha:
    print(f"There is a significant association between {feature} and fraud status (reject H0)")
else:
    print(f"There is no significant association between {feature} and fraud status (fail to reject H0)")

fraud_reported          N   Y
insured_relationship         
husband               135  35
not-in-family         129  45
other-relative        125  52
own-child             144  39
unmarried             107  34
wife                  113  42
There is no significant association between insured occupation and fraud status (fail to reject H0)


In [14]:
# Absolute gap between the policy bind date and the incident date (a timedelta).
# Both columns are parsed with the month/day/year layout.
data['active_days'] = abs(
    pd.to_datetime(data['policy_bind_date'], format='%m/%d/%Y')
    - pd.to_datetime(data['incident_date'], format='%m/%d/%Y')
)

In [15]:
# Inspect the derived timedelta column.
data['active_days']

0      100 days
1     3130 days
2     5282 days
3     8996 days
4      256 days
         ...   
995   8622 days
996    384 days
997   4358 days
998   1196 days
999   6681 days
Name: active_days, Length: 1000, dtype: timedelta64[ns]

In [None]:
# Columns removed before modelling (identifiers, raw dates, location and
# other categorical columns the notebook does not encode).
col_to_delete = [
    'policy_number',
    'incident_date',
    'policy_bind_date',
    'insured_zip',
    'insured_hobbies',
    'policy_state',
    'incident_location',
    'incident_state',
    'incident_city',
    'auto_make',
    'auto_model',
    'insured_education_level',
    'insured_occupation',
]

In [None]:
# Drop the unused columns. Reassigning (instead of inplace=True) and ignoring
# already-missing labels keeps the cell re-runnable: a second execution no longer
# raises KeyError because the columns are gone.
data = data.drop(columns=col_to_delete, errors='ignore')

In [None]:
# Display the table after the column drop.
data

In [None]:
# Show only the columns whose dtype is 'object'.
data.dtypes[data.dtypes=='object']

In [None]:
# Number of missing values per column.
data.isna().sum()

In [None]:
# Fill missing collision types with the placeholder category 'Other'.
# The result is assigned back rather than calling fillna(inplace=True) on a column
# selection, which operates on an intermediate object and is not guaranteed to
# update `data` (pandas warns about this pattern).
data['collision_type'] = data['collision_type'].fillna(value='Other')

In [None]:
# Inspect the 'authorities_contacted' column.
data['authorities_contacted']

In [None]:
# Fill missing values with the placeholder category 'Other'.
# Assigned back instead of fillna(inplace=True) on a column selection, which is
# not guaranteed to update `data`.
data['authorities_contacted'] = data['authorities_contacted'].fillna(value='Other')

In [None]:
# Fill missing values with the placeholder category 'Other'.
# Assigned back instead of fillna(inplace=True) on a column selection, which is
# not guaranteed to update `data`.
data['property_damage'] = data['property_damage'].fillna(value='Other')

In [None]:
# Fill missing values with the placeholder category 'Other'.
# Assigned back instead of fillna(inplace=True) on a column selection, which is
# not guaranteed to update `data`.
data['police_report_available'] = data['police_report_available'].fillna(value='Other')

In [None]:
from sklearn.impute import SimpleImputer
# Constant-value imputer: `fill_value` is only honoured with strategy='constant';
# without it the imputer falls back to its default strategy and ignores 'Other'.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='Other')

In [None]:
# Replace entries equal to False with NaN. A single .loc assignment is used
# instead of chained indexing (data[col][mask] = ...), which may write to a
# temporary copy and leave `data` unchanged.
data.loc[data['collision_type'].eq(False), 'collision_type'] = np.nan

In [None]:
from sklearn.impute import SimpleImputer

# Constant imputer that writes 'Other' wherever a value is missing.
imp_mean = SimpleImputer(missing_values=np.nan, fill_value='Other', strategy='constant')

# Re-fit and apply the imputer one column at a time, in the same order as before.
for column in (
    'collision_type',
    'authorities_contacted',
    'property_damage',
    'police_report_available',
):
    data[[column]] = imp_mean.fit_transform(data[[column]])


In [None]:
# Extract the object-dtype (categorical) columns into an independent copy,
# then list which columns were selected.
cat_df = data.select_dtypes(include=['object']).copy()
cat_df.columns

In [None]:
def _csl_total(csl):
    """Add the two '/'-separated integers of a policy_csl string such as '250/500'."""
    parts = csl.split('/')
    return int(parts[0]) + int(parts[1])


# Preview the summed limits; the result is displayed, not stored.
cat_df['policy_csl'].apply(_csl_total)

In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Toy example on a hand-written list (not the insurance data) to try out OrdinalEncoder.
enc = OrdinalEncoder()
X = [['Male', 1], ['Female', 3], ['Female', 2]]
enc.fit(X)
# Bare expression in the middle of the cell: evaluated but not displayed.
enc.categories_
# Only this last expression is rendered as the cell output.
enc.transform([['Female', 3], ['Male', 1]])

In [None]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()
ohe_categorical_cols=['insured_relationship', 'incident_type',
       'collision_type', 'authorities_contacted',
       'property_damage', 'police_report_available']
# One-hot-encode the categorical columns.
# fit_transform returns a SciPy sparse matrix by default; it has to be densified
# with .toarray() before building a DataFrame, otherwise pandas does not produce
# one numeric column per category. Column labels come from the fitted encoder.
array_hot_encoded = ohe.fit_transform(cat_df[ohe_categorical_cols])
pd.DataFrame(array_hot_encoded.toarray(), columns=ohe.get_feature_names_out(), index=cat_df.index)

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Assuming cat_df is your DataFrame containing categorical columns

ohe = OneHotEncoder()
ohe_categorical_cols = [
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'property_damage',
    'police_report_available',
]

# Encode, densify the sparse result, and label the columns with the encoder's feature names.
array_hot_encoded = ohe.fit_transform(cat_df[ohe_categorical_cols])
hot_encoded_df = pd.DataFrame(
    data=array_hot_encoded.toarray(),
    index=cat_df.index,
    columns=ohe.get_feature_names_out(),
)

# Joining the encoded columns back onto cat_df is left disabled; enabling the line
# below would produce cat_df_encoded with the one-hot columns replacing the originals.
#cat_df_encoded = pd.concat([cat_df.drop(ohe_categorical_cols, axis=1), hot_encoded_df], axis=1)


In [None]:
from sklearn.preprocessing import OrdinalEncoder
# Encoder used below to map category labels to integer codes.
ordenc = OrdinalEncoder()

In [None]:
# Columns that get integer codes.
# NOTE(review): 'incident_severity' is also in the one-hot list, so it ends up encoded twice.
ord_categorical_cols = [
    'insured_sex',
    'fraud_reported',
    'incident_severity',
]
array_ord_encoded = ordenc.fit_transform(cat_df[ord_categorical_cols])
ord_encoded_df = pd.DataFrame(
    data=array_ord_encoded,
    index=cat_df.index,
    columns=ordenc.get_feature_names_out(),
)

In [None]:
# Display the ordinal-encoded columns.
ord_encoded_df

In [None]:
# Display the one-hot-encoded columns.
hot_encoded_df

In [None]:
# Independent copy of every non-object column (includes the timedelta 'active_days').
num_df = data.select_dtypes(exclude=['object']).copy()


In [None]:
# Convert the timedelta column to a whole number of days.
# Guarded on the dtype kind ('m' = timedelta) so that re-running the cell after the
# column is already numeric does not raise on the `.dt` accessor.
if num_df['active_days'].dtype.kind == 'm':
    num_df['active_days'] = num_df['active_days'].dt.days

In [None]:
# Display the numeric table.
num_df

In [None]:
# num_df 
# ord_encoded_df
# hot_encoded_df

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise every numeric column, keeping the original labels and index.
scaler = StandardScaler()
num_df_scaled = pd.DataFrame(
    data=scaler.fit_transform(num_df),
    index=num_df.index,
    columns=num_df.columns,
)


In [None]:
# Display the standardised numeric table.
num_df_scaled

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector, make_column_transformer


In [None]:
# Categorical columns whose missing values are filled with a constant.
impute_cols = [
    'collision_type',
    'authorities_contacted',
    'property_damage',
    'police_report_available',
]

In [None]:
# Constant imputer that writes 'Other' wherever a value is missing.
imp_mean = SimpleImputer(missing_values=np.nan, fill_value='Other', strategy='constant')

# Re-fit and apply the imputer one column at a time, in the same order as before.
for column in (
    'collision_type',
    'authorities_contacted',
    'property_damage',
    'police_report_available',
):
    data[[column]] = imp_mean.fit_transform(data[[column]])

In [None]:
# Fresh (unfitted) constant imputer that fills missing values with 'Other'.
imp_mean = SimpleImputer(missing_values=np.nan, fill_value='Other', strategy='constant')


In [None]:
def _csl_total(csl):
    """Add the two '/'-separated integers of a policy_csl string such as '250/500'."""
    parts = csl.split('/')
    return int(parts[0]) + int(parts[1])


# Preview the summed limits; the result is displayed, not stored.
cat_df['policy_csl'].apply(_csl_total)

In [None]:
# Encoders and the column groups they apply to.
ohe = OneHotEncoder()
ohe_categorical_cols=['insured_relationship', 'incident_type',
       'collision_type', 'incident_severity', 'authorities_contacted',
       'property_damage', 'police_report_available']
ord_categorical_cols =['insured_sex','fraud_reported', 'incident_severity' ]
# Create the encoder BEFORE using it: previously fit_transform ran on whatever
# `ordenc` an earlier cell had left in the kernel and the fresh encoder was created
# afterwards, so the cell depended on hidden state.
ordenc = OrdinalEncoder()
array_ord_encoded = ordenc.fit_transform(cat_df[ord_categorical_cols])

In [None]:
# Single-step pipeline wrapping the ordinal encoder.
ord_cat_cols_pipe = Pipeline(
    steps=[
        ("ordenc", OrdinalEncoder()),
    ]
)

In [None]:
# Single-step pipeline wrapping the one-hot encoder.
ohe_cat_cols_pipe = Pipeline(
    steps=[
        ("ohenc", OneHotEncoder()),
    ]
)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class ConvertDaysInt(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces the 'active_days' timedelta column with whole days."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return a copy of X whose 'active_days' column holds `.dt.days` values.

        X must be indexable by the column label 'active_days' and that column must
        support the `.dt` accessor.
        """
        # Work on a copy so the caller's frame is not mutated; mutating it made a
        # second transform() of the same frame fail because the column was no
        # longer a timedelta.
        X = X.copy()
        X['active_days'] = X['active_days'].dt.days
        return X

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CSLSum(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces 'policy_csl' strings such as '250/500' with the sum of both numbers."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    @staticmethod
    def _sum_limits(csl):
        """Add the first two '/'-separated integers of one policy_csl string."""
        parts = csl.split('/')
        return int(parts[0]) + int(parts[1])

    def transform(self, X):
        """Return a copy of X whose 'policy_csl' column holds the summed limits."""
        # Work on a copy so the caller's frame is not mutated; mutating it made a
        # second transform() of the same frame fail because the values were no
        # longer strings.
        X = X.copy()
        X['policy_csl'] = X['policy_csl'].apply(self._sum_limits)
        return X

In [None]:
# Column Impute 
# Columns whose missing values are filled with the constant 'Other'.
impute_cols = [
    'collision_type',
    'authorities_contacted',
    'property_damage',
    'police_report_available',
]
impute_cols_pipe = Pipeline(
    steps=[
        ("impute", SimpleImputer(missing_values=np.nan, fill_value='Other', strategy='constant')),
    ]
)

# Columns mapped to integer codes.
ord_categorical_cols = [
    'insured_sex',
    'fraud_reported',
    'incident_severity',
]
ord_cat_cols_pipe = Pipeline(steps=[("ordenc", OrdinalEncoder())])

# Columns expanded into one indicator column per category.
ohe_categorical_cols = [
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'property_damage',
    'police_report_available',
]
ohe_cat_cols_pipe = Pipeline(steps=[("ohenc", OneHotEncoder())])

from sklearn.base import BaseEstimator, TransformerMixin
class CSLSum(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces 'policy_csl' strings such as '250/500' with the sum of both numbers."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    @staticmethod
    def _sum_limits(csl):
        """Add the first two '/'-separated integers of one policy_csl string."""
        parts = csl.split('/')
        return int(parts[0]) + int(parts[1])

    def transform(self, X):
        """Return a copy of X whose 'policy_csl' column holds the summed limits."""
        # Work on a copy so the caller's frame is not mutated; mutating it made a
        # second transform() of the same frame fail because the values were no
        # longer strings.
        X = X.copy()
        X['policy_csl'] = X['policy_csl'].apply(self._sum_limits)
        return X

    def get_feature_names_out(self, input_features=None):
        """Report the single output column name; `input_features` is ignored."""
        return ['policy_csl']



class ConvertDaysInt(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces the 'active_days' timedelta column with whole days."""

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X):
        """Return a copy of X whose 'active_days' column holds `.dt.days` values."""
        # Work on a copy so the caller's frame is not mutated; mutating it made a
        # second transform() of the same frame fail because the column was no
        # longer a timedelta.
        X = X.copy()
        X['active_days'] = X['active_days'].dt.days
        return X

    def get_feature_names_out(self, input_features=None):
        """Report the single output column name; `input_features` is ignored."""
        return ['active_days']

In [None]:
# Columns mapped to integer codes and then standardised.
# NOTE(review): 'fraud_reported' is in this list, so it is encoded and scaled
# together with the features -- confirm that is intended.
ord_categorical_cols = [
    'insured_sex',
    'fraud_reported',
    'incident_severity',
]
ord_cat_cols_pipe = Pipeline(
    steps=[
        ("ordenc", OrdinalEncoder()),
        ("standscale", StandardScaler()),
    ]
)

In [None]:
# Columns expanded into indicator columns (dense output) and then standardised.
# NOTE(review): 'incident_severity' is also in ord_categorical_cols, so it is encoded twice.
ohe_categorical_cols = [
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'property_damage',
    'police_report_available',
]
ohe_cat_cols_pipe = Pipeline(
    steps=[
        ("ohenc", OneHotEncoder(sparse_output=False)),
        ("standscale", StandardScaler()),
    ]
)

In [None]:
# Define the ColumnTransformer.
# ColumnTransformer entries run side by side on the original columns, so a separate
# imputer entry never feeds the one-hot encoder and only adds the imputed columns to
# the output as raw strings. The imputer is therefore chained in front of the one-hot
# steps; every column in impute_cols is also listed in ohe_categorical_cols, so all of
# them are still imputed and the output no longer carries unencoded string columns.
ohe_with_impute_pipe = Pipeline(impute_cols_pipe.steps + ohe_cat_cols_pipe.steps)

preprocessing = ColumnTransformer([
    ("ord_cat_cols_pipe", ord_cat_cols_pipe, ord_categorical_cols),
    ("ohe_cat_cols_pipe", ohe_with_impute_pipe, ohe_categorical_cols),
    ("date_format", ConvertDaysInt(), ['active_days']),  # Apply ConvertDaysInt to 'active_days' column
    ("csl_sum", CSLSum(), ['policy_csl'])  # Apply CSLSum to 'policy_csl' column
], remainder='passthrough', verbose_feature_names_out=True)

In [None]:
# Fit the preprocessing on `data` and wrap the transformed output in a DataFrame
# (no column labels are passed here).
temp_df = pd.DataFrame(preprocessing.fit_transform(data))

In [None]:
# Output column names produced by the fitted ColumnTransformer.
preprocessing.get_feature_names_out()


In [None]:
# Label the transformed columns with the transformer's feature names.
temp_df.columns=preprocessing.get_feature_names_out()

In [None]:
# Display the preprocessed table.
temp_df