This project is about using LIME to explain individual predictions in a Machine Learning application.
Instructor: Dr. E. Kapetanios (PhD, ETH Zurich).

## 1. Data exploration and Understanding

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as px

In [2]:
# Load the red-wine quality dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer="winequality-red.csv")
df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,5.636023
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.807569
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,3.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,5.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,6.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,6.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,8.0


In [4]:
# Report the dataset dimensions as a (rows, columns) tuple.
print(f"Rows, columns: {df.shape}")

Rows, columns: (1599, 12)


In [5]:
# Count the missing values in each column.
print(df.isnull().sum())

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


## 2. Transformation into a classification problem and dataset

In [6]:
# Names of the float64 columns; these are the model's input features.
# `np.float` was removed in NumPy 1.24, so select by the concrete dtype instead.
X_feature_names = df.select_dtypes(include="float64").columns.tolist()
print(X_feature_names)

['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']


In [7]:
# Names of the int64 columns (the raw quality score).
# Stored under its own name so the float feature list in X_feature_names is not
# overwritten; the LIME explainer needs one name per input feature.
y_feature_names = df.select_dtypes(include="int64").columns.tolist()
print(y_feature_names)

['quality']


In [8]:
# Binary target: 1 for wines with a quality score of 7 or higher, 0 otherwise.
df['goodquality'] = df['quality'].ge(7).astype('int64')

In [9]:
# Target vector: the binary good-quality label.
y = df['goodquality']
# Feature matrix: every column except the raw score and the label derived from it.
X = df.drop(columns=['quality', 'goodquality'])

In [10]:
# Class balance of the binary target.
df['goodquality'].value_counts() # 1 = good-quality red wine, 0 = everything else

0    1382
1     217
Name: goodquality, dtype: int64

## 3. Preparing the data for training and validation purposes

In [11]:
#Normalize feature variables
from sklearn.preprocessing import StandardScaler
# Rebuild the raw feature frame from df instead of aliasing X: X is replaced by a
# NumPy array on the next line, so re-running this cell with `X_features = X`
# would turn X_features into the scaled array and lose the column names.
X_features = df.drop(columns=['quality', 'goodquality'])
# Standardise every feature to zero mean and unit variance.
X = StandardScaler().fit_transform(X_features)

In [12]:
#Splitting the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw (unscaled) features first, 70% train / 30% test with a fixed seed,
# so the held-out rows never influence the scaling statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_features, y, test_size=.30, random_state=0)

# Learn mean/std from the training rows only, then apply the same transform to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [13]:
# Class balance of the held-out labels.
print(y_test.value_counts())

0    430
1     50
Name: goodquality, dtype: int64


## 4. Train, validate, estimate and contrast the performance of 3 tree-based classifiers

In [14]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

In [15]:
# Model 1: a single decision tree with a fixed seed, fitted on the training split.
model1 = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
# Hard class predictions for the held-out rows.
y_pred1 = model1.predict(X_test)

In [16]:
# Per-class precision, recall and F1 of the decision tree on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred1))

              precision    recall  f1-score   support

           0       0.96      0.92      0.94       430
           1       0.49      0.68      0.57        50

    accuracy                           0.89       480
   macro avg       0.72      0.80      0.75       480
weighted avg       0.91      0.89      0.90       480



In [17]:
from sklearn.ensemble import RandomForestClassifier
# Model 2: a random forest with a fixed seed, fitted on the training split.
model2 = RandomForestClassifier(random_state=1).fit(X_train, y_train)
# Hard class predictions for the held-out rows.
y_pred2 = model2.predict(X_test)

In [18]:
# Per-class precision, recall and F1 of the random forest on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred2))

              precision    recall  f1-score   support

           0       0.95      0.96      0.96       430
           1       0.64      0.58      0.61        50

    accuracy                           0.92       480
   macro avg       0.80      0.77      0.78       480
weighted avg       0.92      0.92      0.92       480



In [19]:
from sklearn.ensemble import AdaBoostClassifier
# Model 3: AdaBoost with a fixed seed, fitted on the training split.
model3 = AdaBoostClassifier(random_state=1).fit(X_train, y_train)
# Hard class predictions for the held-out rows.
y_pred3 = model3.predict(X_test)

In [20]:
# Per-class precision, recall and F1 of AdaBoost on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred3))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94       430
           1       0.47      0.48      0.48        50

    accuracy                           0.89       480
   macro avg       0.70      0.71      0.71       480
weighted avg       0.89      0.89      0.89       480



## 5. Prepare and train the LIME explainer

In [27]:
# The `lime` package is required from here on; if it is missing, install it once with: pip install lime

In [28]:
import lime
import lime.lime_tabular

In [None]:
# Build the LIME tabular explainer from the training split.
# feature_names must hold one name per column of X_train, so take the names from
# the raw feature frame; X_feature_names was reassigned to ['quality'] earlier in
# the notebook and no longer matches the 11 input features.
# mode='regression' is kept because the prediction functions handed to
# explain_instance below return a 1-D array of scores.
# random_state fixes LIME's perturbation sampling so explanations are repeatable.
explainer = lime.lime_tabular.LimeTabularExplainer(np.array(X_train),
                                                   feature_names=list(X_features.columns),
                                                   class_names=['goodquality'],
                                                   verbose=True, mode='regression',
                                                   random_state=0)

In [30]:
# Held-out labels; the index values are the original row positions in df.
print(y_test)

1109    0
1032    0
1002    1
487     0
979     0
       ..
801     0
61      0
431     0
1210    0
713     0
Name: goodquality, Length: 480, dtype: int64


In [31]:
# True labels of the first three test rows, selected by position.
print(y_test.iloc[:3])

1109    0
1032    0
1002    1
Name: goodquality, dtype: int64


In [32]:
# Decision-tree predictions for the same first three test rows.
print(model1.predict(X_test[:3]))

[0 0 1]


In [33]:
# Random-forest predictions for test rows 20-24; row 20 is explained with LIME below.
print(model2.predict(X_test[20:25]))

[0 0 0 0 0]


In [None]:
# Explain the random forest's output for test row 20 (predicted class 0 above).
# The explainer runs in regression mode, so give it a continuous score: the
# predicted probability of the positive class (column 1 of predict_proba).
# Hard 0/1 labels from model2.predict leave the local linear surrogate with an
# almost constant target and therefore little to explain.
exp = explainer.explain_instance(
    X_test[20],
    lambda rows: model2.predict_proba(rows)[:, 1],
)

In [None]:
# Bar chart of the per-feature contributions in the current explanation `exp`.
exp.as_pyplot_figure()

In [None]:
# Explain the random forest's output for test row 2 (true label 1 in y_test).
# The explainer runs in regression mode, so give it a continuous score: the
# predicted probability of the positive class (column 1 of predict_proba).
# Hard 0/1 labels from model2.predict leave the local linear surrogate with an
# almost constant target and therefore little to explain.
exp = explainer.explain_instance(
    X_test[2],
    lambda rows: model2.predict_proba(rows)[:, 1],
)

In [None]:
# Bar chart of the per-feature contributions in the current explanation `exp`.
exp.as_pyplot_figure()