## Testing out LIME

In [None]:
import lime
import numpy as np
import pandas as pd
import sklearn

from lime import lime_tabular
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

# Seed NumPy's global RNG up front; the estimators below are created without an explicit random_state
np.random.seed(1)

In [None]:
# Load the generated policy data and discard the identifier/year columns
fake_data = (
    pd.read_csv('../data/generated_data_numpy.csv')
    .drop(columns=['Policy_Id', 'Policy_Year'])
)

# Recode the target as strings: 1 -> 'Reported', anything else -> 'Not Reported'
reported_mask = fake_data['Accident_Reported'] == 1
fake_data['Accident_Reported'] = np.where(reported_mask, 'Reported', 'Not Reported')


In [None]:
# Hold out 20% of the rows for testing. The target column is separated from the features,
# and random_state pins the partition so re-running this cell always yields the same split
# (without it the split draws from NumPy's global RNG and changes on every re-run).
train, test, labels_train, labels_test = train_test_split(
    fake_data.drop(columns='Accident_Reported'),
    fake_data['Accident_Reported'],
    train_size=0.80,
    random_state=1)

# Independent copies: the next cell overwrites the categorical columns in place
train = train.copy()
test = test.copy()


## Using Label Encoding for Categorical Features

In [None]:
# Column groups: numeric columns are imputed and scaled, categorical columns are integer-coded
numeric_features = ['Model_Year', 'Miles_Driven', 'Years_Customer']
categorical_features = ['Make', 'Body_Style', 'Model_Color', 'Driver_Hair_Color']

# Integer-encode each categorical column. Every encoder is fitted on the training split only
# and stored under its column name so the codes can be mapped back to labels later.
label_encoders = {}
for feature in categorical_features:
    le = LabelEncoder().fit(train[feature])
    train[feature] = le.transform(train[feature])
    test[feature] = le.transform(test[feature])
    label_encoders[feature] = le

# Numeric columns: fill gaps with the median, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns (already integer-coded above): constant-value imputation only
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
])

# Route each column group to its transformer; output columns are numeric first, then categorical
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training split, then apply the fitted transformation to the test split
train, test = preprocessor.fit_transform(train), preprocessor.transform(test)


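We create a one-hot encoder for the classifier only, not for the explainer! LIME works on the label-encoded data, so the one-hot step lives inside the prediction pipeline that LIME calls.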

## Random Forest Example

In [None]:
# Wrap the preprocessed training matrix in a DataFrame so columns can be looked up by name
train_df = pd.DataFrame(train, columns=numeric_features + categorical_features)

# Positions of the categorical columns within the preprocessed matrix
categorical_features_idx = list(np.flatnonzero(train_df.columns.isin(categorical_features)))

# One-hot encode only the categorical positions; every other column is passed through unchanged
ohe = OneHotEncoder()
preprocessor_ohe = ColumnTransformer(
    [('cat', ohe, categorical_features_idx)],
    remainder='passthrough',
)

# Random forest that consumes the one-hot encoded matrix
clf = RandomForestClassifier(n_estimators=100)

# Chain encoding and classification so callers can pass label-encoded rows directly
pipe = Pipeline([
    ('preprocessor', preprocessor_ohe),
    ('classifier', clf),
])


In [None]:
# Fit the full pipeline (one-hot encoding step followed by the random forest) on the
# preprocessed training matrix and the string class labels
pipe.fit(train, labels_train)


In [None]:
# Predicted class probabilities for the first test row, with each column labelled by the
# class it refers to (the column order is the pipeline's own class order)
pd.DataFrame(pipe.predict_proba(test[0:1]), columns=pipe.classes_)


In [None]:
# Map each categorical column's position in the preprocessed matrix to the list of original
# labels recovered from its fitted label encoder (list position == integer code).
# The position is looked up in the column-name list directly, rather than rebuilding a
# DataFrame over the whole training matrix on every iteration.
feature_names = numeric_features + categorical_features
categorical_values = {
    feature_names.index(feature): list(label_encoders[feature].classes_)
    for feature in categorical_features
}

categorical_values


In [None]:
# Configure LIME on the label-encoded training matrix: categorical columns are identified
# by position and given their original label names; numeric columns are discretised
explainer_options = dict(
    mode='classification',
    feature_names=numeric_features + categorical_features,
    class_names=['Not Reported', 'Reported'],
    categorical_features=categorical_features_idx,
    categorical_names=categorical_values,
    discretize_continuous=True,
)
explainer = lime_tabular.LimeTabularExplainer(train, **explainer_options)


In [None]:
# Explain the first instance in the test set, i.e. the same row that was scored with
# predict_proba earlier (the original code explained train[0], a row the model was fitted on)
instance = test[0]
exp = explainer.explain_instance(instance, pipe.predict_proba)

# Render the explanation inline, including the feature-value table
exp.show_in_notebook(show_table=True, show_all=False)
