# Mixed-Integer Linear Program (MILP) for Local Interpretable Model-agnostic Explanations (LIME)

This study aims to formulate (and test) LIME with MILP optimization.

TODO:
- Pre-process text
- Train the SVM on more data (it is currently fitted on only the first 500 training samples)

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Setup" data-toc-modified-id="Setup-1">Setup</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-2">Model</a></span><ul class="toc-item"><li><span><a href="#Data" data-toc-modified-id="Data-2.1">Data</a></span></li><li><span><a href="#Classifier" data-toc-modified-id="Classifier-2.2">Classifier</a></span></li></ul></li><li><span><a href="#Optimization" data-toc-modified-id="Optimization-3">Optimization</a></span><ul class="toc-item"><li><span><a href="#LIME" data-toc-modified-id="LIME-3.1">LIME</a></span></li><li><span><a href="#Calculation-of-parameters" data-toc-modified-id="Calculation-of-parameters-3.2">Calculation of parameters</a></span></li><li><span><a href="#Linear-optimization" data-toc-modified-id="Linear-optimization-3.3">Linear optimization</a></span></li></ul></li><li><span><a href="#References" data-toc-modified-id="References-4">References</a></span></li></ul></div>

## Setup

In [None]:
from IPython.display import HTML
from pulp import LpVariable, LpProblem, value, LpStatus, LpMinimize
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import numpy as np
import pandas as pd

In [None]:
%config Completer.use_jedi = False

## Model

### Data

In [None]:
# Load the IMDB reviews CSV from a path relative to the notebook's directory.
# The cells below read its `review` and `sentiment` columns.
df = pd.read_csv('../data/IMDB Dataset.csv')
# Bare expression: shows the frame as the cell output.
df

In [None]:
# Replace HTML line breaks with a single space rather than nothing, so the
# words on either side of a break are not fused into one token
# (e.g. "good<br />The" must become "good The", not "goodThe").
# `regex=False` makes the pattern a literal string.
df['review'] = df['review'].str.replace('<br />', ' ', regex=False)

In [None]:
# Features are the raw review texts; targets are their sentiment labels.
# Both are materialised as plain Python lists.
X, y = df['review'].tolist(), df['sentiment'].tolist()

In [None]:
# Hold out 20% of the reviews as a test set. The split is stratified on the
# labels and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

### Classifier

In [None]:
# TF-IDF vectorizer over lowercased text, with smoothed IDF and no accent
# stripping.
tf_idf = TfidfVectorizer(
    strip_accents=None,
    lowercase=True,
    smooth_idf=True,
)
# NOTE: X_train is rebound from the raw texts to their TF-IDF matrix here;
# X_test is left as raw text.
X_train = tf_idf.fit_transform(X_train)
# Bare expression: shows the matrix shape as the cell output.
X_train.shape

In [None]:
# Reduce the TF-IDF features to 50 components with a seeded truncated SVD.
t_svd = TruncatedSVD(n_components=50, random_state=42)
# X_train is rebound again, now to the reduced representation.
X_train = t_svd.fit_transform(X_train)
# Bare expression: shows the reduced shape as the cell output.
X_train.shape

In [None]:
# Standardise the SVD components; the scaler is fitted on the training data
# and X_train is rebound to the scaled values.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

In [None]:
# Base support-vector classifier. `probability=True` is required because the
# explanation code later calls `predict_proba` on the model.
svc = SVC(
    shrinking=True,
    probability=True,
    tol=0.001,
    cache_size=200,
    verbose=True,
    max_iter=-1,
    decision_function_shape='ovr',
    random_state=42,
)

# 5-fold grid search over C in {1, 10} with a linear kernel, using 4 jobs.
svm = GridSearchCV(
    svc,
    param_grid={'C': [1, 10], 'kernel': ['linear']},
    n_jobs=4,
    cv=5,
    verbose=3
)

In [None]:
%%time
# Only the first 500 training rows are used for the grid search.
# (This comment must stay below `%%time`: a cell magic has to be the first
# line of its cell.)
svm.fit(X_train[:500], y_train[:500])

In [None]:
# Text -> vector pipeline built from the TF-IDF and SVD objects created above.
# It deliberately stops before the scaler and classifier; `pi` uses it to
# embed texts for the cosine-based locality weight.
vector = Pipeline([
    ('tf_idf', tf_idf),
    ('t_svd', t_svd)
])

In [None]:
# End-to-end classifier taking raw text: TF-IDF -> SVD -> scaling -> grid-searched SVM.
# It is assembled from the objects created in the cells above; no
# `model.fit(...)` call appears in this notebook section.
model = Pipeline([
    ('tf_idf', tf_idf),
    ('t_svd', t_svd),
    ('scaler', scaler),
    ('svm', svm)
])

In [None]:
# Score the full pipeline on the held-out raw test texts (cell output).
model.score(X_test, y_test)

## Optimization

### LIME

### Calculation of parameters

In [None]:
# Short hand-written review used as the running example to explain.
example = 'This movie is awful, I regret seing it, it is a bad movie.'
# Bare expression: echoes the example as the cell output.
example

In [None]:
# Predicted label for the example (the model expects a list of texts).
model.predict([example])

In [None]:
# Per-class probabilities for the example (one row per input text).
model.predict_proba([example])

In [None]:
# Show the class labels known to the model; `which_class` in the functions
# below is an index into a probability vector returned by `f`.
model.classes_

In [None]:
def f(text):
    '''Return the class-probability vector of a single text.

    The text is wrapped in a one-element batch for ``model.predict_proba``
    and the first (only) row of the result is returned.
    '''
    batch = [text]
    probabilities = model.predict_proba(batch)
    return probabilities[0]

In [None]:
def pi(x, z, sigma=0.5):
    '''Locality weight of the text ``z`` relative to the text ``x``.

    Both texts are embedded with the ``vector`` pipeline, the angle between
    the two embeddings is turned into a distance ``D``, and the weight is the
    kernel ``exp(-D**2 / sigma**2)``.

    Parameters:
        x: reference text.
        z: perturbed text.
        sigma: kernel width; smaller values penalise distance more strongly.

    Returns:
        The weight, or 0 when either embedding is the null vector (the angle
        is undefined in that case).
    '''
    x = vector.transform([x])[0]
    z = vector.transform([z])[0]
    norm_x = np.linalg.norm(x)
    norm_z = np.linalg.norm(z)
    # A null vector has no direction; dividing by its norm would yield NaN
    if norm_x == 0 or norm_z == 0:
        return 0
    # Cosine of the angle between the vectors. Floating-point round-off can
    # push it slightly outside [-1, 1] (e.g. 1.00008), which would make
    # arccos return NaN, so clamp on both sides.
    cos = np.clip(np.dot(x, z)/(norm_x*norm_z), -1, 1)
    # Angle scaled so that orthogonal vectors are at distance 1; vectors
    # pointing in opposite directions reach the maximum distance of 2.
    D = np.arccos(cos)*2/np.pi
    return np.exp(-D**2/sigma**2)

In [None]:
def parameters(split, which_class, M, N, K):
    '''Return the parameters for LIME optimization.

    Draws ``N`` random perturbations of the text by removing words, and for
    each one records the classifier probability and the locality weight with
    respect to the unperturbed text.

    Parameters:
        split: list of the words of the text being explained.
        which_class: index into the probability vector returned by ``f``.
        M: number of words, i.e. ``len(split)``.
        N: number of perturbations to draw.
        K: unused here; kept so the signature matches ``optimization``.

    Returns:
        ``(z_line, f_z, pi_x)``: the 0/1 word masks (arrays of length ``M``),
        the probabilities of ``which_class`` and the locality weights, each a
        list of length ``N``.

    Raises:
        ValueError: if perturbations are requested for a text with fewer
            than two words, since no proper subset of words can be removed.
    '''
    if N > 0 and M < 2:
        raise ValueError(
            'At least two words are needed to build perturbations, got M={}'.format(M)
        )

    # Locality is measured against the text being explained (rebuilt from its
    # words), not against any notebook-level example.
    original = ' '.join(split)

    # Perturbations
    z_line = []
    # Probabilities
    f_z = []
    # Weights
    pi_x = []

    for _ in range(N):
        # Choose a random number of words to remove, between 1 and (M - 1)
        n = np.random.choice(range(1, M))
        # Remove n random words
        indices = np.random.choice(range(M), size=n, replace=False)

        # The perturbation: 1 keeps the word, 0 removes it
        perturbation = np.ones(M)
        perturbation[indices] = 0
        z_line.append(perturbation)

        # The probability and weight
        text = ' '.join(word for j, word in enumerate(split) if perturbation[j])
        f_z.append(f(text)[which_class])
        pi_x.append(pi(original, text))

    return z_line, f_z, pi_x

### Linear optimization

In [None]:
def optimization(z_line, f_z, pi_x, M, N, K):
    '''Build and solve the MILP that fits a sparse linear explanation.

    The surrogate ``g`` is linear in the word masks, its absolute error on
    each perturbation is bounded by an ``epsilon`` variable, and the
    objective is the weighted sum of those bounds. At most ``K`` word
    coefficients may be non-zero.

    Parameters:
        z_line: ``N`` word masks of length ``M``.
        f_z: ``N`` classifier probabilities, one per mask.
        pi_x: ``N`` locality weights, one per mask.
        M: number of words.
        N: number of perturbations.
        K: maximum number of words allowed a non-zero coefficient.

    Returns:
        ``(prob, status, x)``: the PuLP problem, the value returned by
        ``prob.solve()`` and the list of ``M`` coefficient variables.
    '''
    # Big-M constant linking each coefficient to its on/off indicator
    big_m = 100000

    milp = LpProblem("LIME", LpMinimize)

    # Decision variables: total loss, per-sample error bounds, per-sample
    # surrogate outputs, word coefficients and word-selection indicators
    loss = LpVariable('L')
    errors = [LpVariable('epsilon_{}'.format(s)) for s in range(N)]
    fitted = [LpVariable("g(z'_{})".format(s)) for s in range(N)]
    x = [LpVariable('x_{}'.format(w)) for w in range(M)]
    chosen = [LpVariable('y_{}'.format(w), 0, 1, cat='Integer') for w in range(M)]

    # Objective: minimise the total loss ...
    milp += loss
    # ... defined as the locality-weighted sum of the error bounds
    milp += loss == sum(pi_x[s]*errors[s] for s in range(N))

    for s in range(N):
        # |f(z_s) - g(z'_s)| <= epsilon_s, written as two linear inequalities
        milp += -errors[s] <= f_z[s] - fitted[s]
        milp += errors[s] >= f_z[s] - fitted[s]
        # The surrogate output is the sum of the coefficients of kept words
        milp += fitted[s] == sum(z_line[s][w]*x[w] for w in range(M))

    for w in range(M):
        # A coefficient can be non-zero only when its indicator is 1
        milp += -big_m*chosen[w] <= x[w]
        milp += big_m*chosen[w] >= x[w]

    # Sparsity: at most K selected words
    milp += sum(chosen) <= K

    print('Solving MILP...')
    status = milp.solve()
    print('Done.')

    return milp, status, x

In [None]:
def visualize(split, importances):
    '''Visualize the importance of each word in the classification.

    Each word is rendered with a background that fades from white (zero
    importance) to full green (largest positive importance) or full red
    (largest negative importance), scaled by the largest absolute importance.

    Parameters:
        split: list of the words of the text.
        importances: one numeric importance per word, in the same order.

    Returns:
        An ``IPython.display.HTML`` object with one coloured span per word.
    '''
    # Local import so the notebook's import cell does not need to change
    from html import escape

    importances = np.asarray(importances, dtype=float)
    # An empty text has no maximum; treat it as 0 instead of raising
    max_abs_importance = np.max(np.abs(importances)) if importances.size else 0.0
    # Green
    positive = np.array([0, 255, 0])
    white = np.array([255, 255, 255])
    # Red
    negative = np.array([255, 0, 0])
    spans = []
    for i, word in enumerate(split):
        importance = importances[i]
        # Colour intensity in [0, 1]; stays 0 when every importance is zero,
        # which avoids dividing by zero
        if max_abs_importance > 0:
            intensity = abs(importance)/max_abs_importance
        else:
            intensity = 0.0
        target = positive if importance >= 0 else negative
        # Linear blend between white and the target colour. The same scaling
        # is used for both signs so every channel stays within 0..255.
        color = white + (target - white)*intensity
        R, G, B = (int(round(channel)) for channel in color)
        spans.append(
            '<span style="background-color: RGB({R}, {G}, {B})">{word}</span>'.format(
                # Words come from arbitrary review text; escape them so that
                # characters such as < and & cannot break the markup
                word=escape(word),
                R=R,
                G=G,
                B=B
            )
        )

    markup = ' '.join(spans)
    return HTML(markup)

In [None]:
def lime(text, which_class, N=None, K=None):
    '''Explain the classifier's output on ``text`` and render the result.

    Parameters:
        text: the text to explain; it is split into words on whitespace.
        which_class: index into the probability vector returned by ``f``.
        N: number of perturbations; defaults to three per word.
        K: maximum number of words with non-zero importance; defaults to the
            number of words, capped at 20.

    Returns:
        The object produced by ``visualize`` for the words and their
        importances. The word -> importance mapping is also printed.
    '''
    words = text.split()
    M = len(words)
    N = 3*M if N is None else N
    K = min([M, 20]) if K is None else K

    # Sample perturbations, then fit the sparse linear surrogate to them
    z_line, f_z, pi_x = parameters(words, which_class, M, N, K)
    _prob, _status, x = optimization(z_line, f_z, pi_x, M, N, K)

    # Solved value of each word coefficient
    importances = list(map(value, x))
    # A word occurring more than once appears a single time in this dict
    print(dict(zip(words, importances)))

    return visualize(words, importances)

In [None]:
# Explain probability index 0 for the example. K=1000 is far above the
# example's word count, so the sparsity cap is effectively not binding.
lime(example, 0, K=1000)

In [None]:
# Explain probability index 0 for one held-out review (X_test holds raw
# texts), using 30 perturbations and the default K.
lime(X_test[5], 0, N=30)

## References

- https://arxiv.org/pdf/1602.04938.pdf
- https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
- https://vanderbei.princeton.edu/tex/talks/MOPTA14/L1_reg.pdf