In [1]:
import sklearn
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

# Seed NumPy's global RNG once, up front, so that any later stochastic step
# that is not given its own seed draws from a repeatable stream.
SEED = 1
np.random.seed(SEED)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Building a baseline model

In [19]:
# Keep the raw frame (identifier columns and the 0/1 target included) untouched;
# it is used again at the end to attach predictions to the original rows.
original_data = pd.read_csv('../data/generated_data_numpy.csv')

# Modelling frame: identifier columns removed and the target recoded so that
# 1 becomes 'Accident' and every other value becomes 'No Accident'.
fake_data = original_data.drop(columns=['Policy_Id', 'Policy_Year'])
fake_data['Accident_Reported'] = np.where(
    fake_data['Accident_Reported'].eq(1), 'Accident', 'No Accident')


In [20]:
# Hold out 20% of the rows for evaluation.
# - train_test_split is imported explicitly instead of being reached through
#   `sklearn.model_selection`, which only works if some other import happened
#   to load that submodule.
# - random_state pins the shuffle (same seed value as np.random.seed(1) above),
#   so re-running this cell always yields the same partition instead of
#   depending on how much of the global RNG stream has been consumed.
train, test, labels_train, labels_test = train_test_split(
    fake_data.drop(columns='Accident_Reported'),
    fake_data['Accident_Reported'],
    train_size=0.80,
    random_state=1)

# Work on independent copies: later cells overwrite the categorical columns
# of both frames in place.
train = train.copy()
test = test.copy()

In [21]:
# get a list of categorical features
categorical_features = ['Make', 'Body_Style', 'Model_Color', 'Driver_Hair_Color']

# get a list of numeric features
numeric_features = [x for x in fake_data.columns if x not in categorical_features]


In [22]:
# get categorical features idx
categorical_features_idx = list(np.where(np.isin(train.columns, categorical_features))[0])

# get numeric features idx
numeric_features_idx = list(np.where(np.isin(train.columns, numeric_features))[0])


Unfortunately, the LIME explainer only works with numeric inputs, so we first have to convert the text categories to numbers.

In [23]:
# Encode categorical features using LabelEncoder
# Fit one LabelEncoder per categorical column on the training split and keep
# each fitted encoder by column name so the same mapping can be reused later.
# Note: the text columns of `train` are overwritten with integer codes, so
# re-running this cell would fit the encoders on codes rather than on text.
label_encoders = {}
for column_name in categorical_features:
    encoder = LabelEncoder()
    train[column_name] = encoder.fit_transform(train[column_name])
    label_encoders[column_name] = encoder
    

In [24]:
# use the label encoders to encode the test data
# Apply the encoders fitted on the training split to the test split, so both
# splits share the same text-to-code mapping. The test columns are overwritten in place.
for column_name, encoder in label_encoders.items():
    test[column_name] = encoder.transform(test[column_name])


Next, we store each encoder's original category names, keyed by column position, so that the LIME explainer can show the proper names instead of the integer codes.

In [25]:
# Map each categorical column's position in `train` to the list of original
# category names; a name's position in the list is its integer code.
categorical_values = {
    train.columns.get_loc(column_name): list(label_encoders[column_name].classes_)
    for column_name in categorical_features
}

categorical_values


{0: ['Honda', 'Subaru', 'Toyota'],
 1: ['sedan', 'suv', 'truck'],
 3: ['Blue', 'Red', 'White'],
 5: ['Black', 'Brown']}

We build a quick pipeline model to use for evaluating the method.

In [27]:
# build a numeric transformer 
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())], 
    verbose=True)

# build a categorical transformer with simple imputer and one hot encoder
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))], 
    verbose=True)

# build a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features_idx),
        ('cat', categorical_transformer, categorical_features_idx)])

# build a classifier
clf = RandomForestClassifier(n_estimators=100)

# build a pipeline to apply the preprocessing and the classifier
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('classifier', clf)])


In [28]:
# fit the pipeline
# Train the preprocessing + classifier pipeline on the encoded training rows.
# The frame is passed as a plain array because the ColumnTransformer was
# configured with column positions rather than column names.
pipe.fit(X=train.to_numpy(), y=labels_train)


[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing scaler, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing imputer, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing onehot, total=   0.0s


In [12]:
# Inspect the training features after label encoding: the categorical columns
# now hold integer codes instead of text.
train

Unnamed: 0,Make,Body_Style,Model_Year,Model_Color,Miles_Driven,Driver_Hair_Color,Years_Customer
2694,2,0,2006,1,14148,0,3
5140,1,0,2009,1,19374,1,20
2568,0,0,2011,0,24674,1,19
3671,2,0,2011,1,5900,1,7
7427,2,2,2018,1,10529,1,15
...,...,...,...,...,...,...,...
2895,2,0,2005,0,19125,0,6
7813,0,0,2015,2,14695,0,20
905,1,2,2004,1,20047,0,7
5192,2,1,2012,1,2759,1,2


In [13]:
# evaluate the pipeline on the test set
# Column 1 of predict_proba corresponds to pipe.classes_[1]; with these string
# labels that should be 'No Accident' (classes are sorted) - TODO confirm.
test_scores = pipe.predict_proba(test.to_numpy())[:, 1]

# ROC AUC of those scores against the held-out labels.
roc_auc_score(y_true=labels_test, y_score=test_scores)


0.8122584845430108

In [14]:
# evaluate accuracy of the pipeline on the test set
# Score of the fitted pipeline on the held-out rows (the classifier's default
# metric; the recorded output is a fraction between 0 and 1).
pipe.score(X=test.to_numpy(), y=labels_test)


0.791

In [30]:
# Predicted class label for every held-out row, in the row order of `test`.
predictions = pipe.predict(X=test.to_numpy())

In [34]:
# Select the held-out rows from the untouched original data. The explicit
# .copy() makes this an independent frame, so adding a column below does not
# write into a slice of original_data (which triggers SettingWithCopyWarning).
original_test_data = original_data.loc[original_data.index.isin(test.index)].copy()

# `predictions` follows the row order of `test`, while the selection above
# keeps the order of original_data. Wrapping the predictions in a Series
# indexed by test.index lets pandas align each prediction with its own row.
original_test_data['Predictions'] = pd.Series(predictions, index=test.index)

original_test_data

Unnamed: 0,Policy_Id,Policy_Year,Make,Body_Style,Model_Year,Model_Color,Miles_Driven,Driver_Hair_Color,Years_Customer,Accident_Reported,Predictions
2,3,2013,Toyota,suv,2013,Red,22169,Black,16,0,No Accident
4,5,2008,Honda,suv,2016,White,24575,Black,4,0,No Accident
6,7,2022,Toyota,sedan,2016,Red,3671,Brown,20,1,No Accident
7,8,2008,Toyota,sedan,2016,Blue,6737,Brown,17,0,No Accident
9,10,2010,Honda,suv,2007,Blue,2957,Black,18,0,No Accident
...,...,...,...,...,...,...,...,...,...,...,...
9973,9974,2008,Honda,sedan,2014,White,2546,Black,7,0,No Accident
9984,9985,2000,Toyota,sedan,2014,Red,23901,Brown,16,0,No Accident
9985,9986,2011,Subaru,suv,2012,Blue,6646,Brown,10,0,No Accident
9996,9997,2012,Toyota,suv,2013,Red,24482,Brown,4,1,No Accident


In [None]:
# Persist the held-out rows together with their predictions. The DataFrame
# index is not written to the file.
original_test_data.to_csv(path_or_buf='../data/predictions.csv', index=False)