In [1]:
import sklearn
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Seed NumPy's global random number generator (presumably so that the random
# steps further down give the same results on a fresh top-to-bottom run).
np.random.seed(1)


## Building a baseline model

In [2]:
# Keep the raw CSV contents untouched in `original_data`; they are used again
# at the end of the notebook when the predictions are attached.
original_data = pd.read_csv('../data/generated_data_numpy.csv')

# Modelling frame: drop the policy identifier/date columns and replace the
# target with readable class labels (1 -> 'Accident', anything else -> 'No Accident').
fake_data = (
    original_data
    .drop(columns=['Policy_Id', 'Policy_Year', 'Policy_Month'])
    .assign(Accident_Reported=lambda frame: np.where(
        frame['Accident_Reported'] == 1, 'Accident', 'No Accident'))
)


In [3]:
# Hold out 20% of the rows for evaluation. `train_test_split` is imported by
# name in the imports cell instead of being reached through the bare `sklearn`
# package, which only works if some other import happens to load the
# `sklearn.model_selection` submodule first.
# NOTE(review): no `random_state` is passed, so the shuffle presumably relies on
# the NumPy seed set in the first cell; re-running this cell on its own will
# likely produce a different split.
train, test, labels_train, labels_test = train_test_split(
    fake_data.drop(columns='Accident_Reported'),
    fake_data['Accident_Reported'],
    train_size=0.80)

# Work on explicit copies: the label-encoding cells below assign columns on
# these frames in place.
train = train.copy()
test = test.copy()

In [4]:
# Text-valued feature columns that need encoding before modelling.
categorical_features = ['Make', 'Body_Style', 'Model_Color', 'Driver_Hair_Color']

# Every other feature column is treated as numeric. The list is derived from
# `train` (which holds features only) rather than `fake_data`, so the target
# column 'Accident_Reported' is not mistakenly listed as a numeric feature.
numeric_features = [x for x in train.columns if x not in categorical_features]


In [5]:
# Positional indices (within `train`) of the categorical columns
categorical_mask = train.columns.isin(categorical_features)
categorical_features_idx = list(np.flatnonzero(categorical_mask))

# Positional indices (within `train`) of the numeric columns
numeric_mask = train.columns.isin(numeric_features)
numeric_features_idx = list(np.flatnonzero(numeric_mask))


Unfortunately, the LIME explainer only works with numeric input, so we have to convert the text (categorical) columns to integer codes first.

In [6]:
# One LabelEncoder per categorical column, keyed by column name
label_encoders = {feature: LabelEncoder() for feature in categorical_features}

# Fit each encoder on the training column and overwrite that column with its integer codes
for feature, encoder in label_encoders.items():
    train[feature] = encoder.fit_transform(train[feature])
    

In [7]:
# Apply the encoders fitted on the training data to the test columns
# (transform only, so the train and test integer codes agree)
for feature, encoder in label_encoders.items():
    test[feature] = encoder.transform(test[feature])


Let's store the original category names behind each label-encoded column, keyed by the column's position, so that the LIME explainer can show the proper names instead of integer codes.

In [8]:
# Map each categorical column's position in `train` to the list of original
# category names, in the order of the encoder's integer codes
categorical_values = {
    train.columns.get_loc(feature): list(label_encoders[feature].classes_)
    for feature in categorical_features
}

categorical_values


{0: ['Honda', 'Subaru', 'Toyota'],
 1: ['sedan', 'suv', 'truck'],
 3: ['Blue', 'Red', 'White'],
 5: ['Black', 'Brown']}

We build a quick pipeline model to use as the baseline for evaluating the explanation method.

In [9]:
# Baseline classifier: a random forest with 100 trees
clf = RandomForestClassifier(n_estimators=100)

# Numeric columns: fill gaps with the median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps, verbose=True)

# Categorical columns: fill gaps with a constant, then one-hot encode
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps, verbose=True)

# Route each group of columns (selected by position) to its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features_idx),
    ('cat', categorical_transformer, categorical_features_idx),
])

# Full model: preprocessing followed by the classifier
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', clf),
])


In [10]:
# Train the preprocessing steps and the classifier together. The pipeline
# selects columns by position, so it is fed the plain NumPy values.
train_matrix = train.to_numpy()
pipe.fit(train_matrix, labels_train)


[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s


In [11]:
# Display the training features; the categorical columns now hold the integer
# codes produced by the label encoders above.
train

Unnamed: 0,Make,Body_Style,Model_Year,Model_Color,Miles_Driven,Driver_Hair_Color,Years_Customer
2694,2,1,2017,0,17139,0,17
5140,1,2,2022,1,4442,0,4
2568,0,0,2020,2,15083,0,6
3671,2,0,2016,1,7027,1,19
7427,0,0,2009,2,22724,1,15
...,...,...,...,...,...,...,...
2895,0,2,2001,0,8149,0,17
7813,1,0,2017,2,6078,1,6
905,1,2,2000,1,17544,0,12
5192,0,1,2023,0,14011,0,9


In [12]:
# ROC AUC on the held-out rows.
# NOTE(review): column 1 of `predict_proba` is the second entry of
# `pipe.classes_` - presumably 'No Accident' given alphabetical class ordering;
# confirm which class is being scored.
test_probabilities = pipe.predict_proba(test.to_numpy())
roc_auc_score(labels_test, test_probabilities[:, 1])


0.7934813140540994

In [13]:
# Accuracy of the fitted pipeline on the held-out rows
test_matrix = test.to_numpy()
pipe.score(test_matrix, labels_test)


0.788

In [14]:
# Predicted class label for every held-out row, in the row order of `test`
test_matrix = test.to_numpy()
predictions = pipe.predict(test_matrix)

In [15]:
# Pick the untouched source rows that ended up in the test split, keeping
# their original order. `.copy()` makes this an independent frame, so adding a
# column below no longer triggers pandas' SettingWithCopyWarning.
original_test_data = original_data.loc[original_data.index.isin(test.index)].copy()

# `predictions` follows the row order of `test`, so assign through
# `test.index` to attach each prediction to its matching source row.
original_test_data.loc[test.index, 'Predictions'] = predictions
original_test_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  original_test_data.loc[test.index, 'Predictions'] = predictions


Unnamed: 0,Policy_Id,Policy_Year,Policy_Month,Make,Body_Style,Model_Year,Model_Color,Miles_Driven,Driver_Hair_Color,Years_Customer,Accident_Reported,Predictions
6,7,2005,Oct,Toyota,suv,2010,Blue,17354,Brown,10,0,Accident
7,8,2018,Jul,Toyota,suv,2007,Red,15849,Black,6,1,No Accident
21,22,2021,May,Toyota,suv,2020,Red,20971,Black,4,0,No Accident
28,29,2021,Sep,Subaru,suv,2006,Red,16560,Brown,11,0,No Accident
32,33,2009,Aug,Toyota,suv,2010,Blue,13133,Brown,8,1,Accident
...,...,...,...,...,...,...,...,...,...,...,...,...
9979,9980,2004,Oct,Subaru,suv,2020,Red,23911,Brown,9,0,No Accident
9981,9982,2007,Mar,Subaru,truck,2005,Red,11658,Black,1,0,No Accident
9982,9983,2007,Nov,Toyota,sedan,2009,White,6430,Black,3,1,Accident
9984,9985,2018,Jul,Subaru,sedan,2020,White,9913,Brown,9,0,Accident


In [16]:
# Write the test rows together with their predictions, leaving out the DataFrame index
predictions_path = '../data/predictions.csv'
original_test_data.to_csv(predictions_path, index=False)