<a href="https://colab.research.google.com/github/mnassar/py2vpy3xai/blob/main/py2vpy3XaiLemnaLikeScript.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

@author: mnassar

This notebook generates the results for the LIME text explainer at the character level: the explanation returned for a line is the window of characters surrounding the single most influential position that LIME finds. Because the approach is inspired by the LEMNA paper, we call it the LEMNA-like explainer.

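The results correspond to Table 10: results for `LimeTextExplainer` at the character level
with a custom fused-LASSO regressor. Note: as written, `explain` calls `explain_instance` without a `model_regressor` argument, so LIME's default regressor is used; pass `model_regressor=FusedLassoRegressor()` to run the fused-LASSO variant.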

In [None]:
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283835 sha256=207d172cea2a1fe2439efed93330e2cdb4849c3dbb2f57a141dcf77295296b06
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
%%time

# -*- coding: utf-8 -*-

# !pip install lime


import lime
from lime.lime_text import LimeTextExplainer
from keras.utils import pad_sequences
from keras import models
import numpy as np
import pandas as pd
from sklearn.linear_model._base import LinearModel,RegressorMixin
from sklearn.linear_model import Ridge, LinearRegression
import cvxpy as cp
import re


# longest common substring (longest contiguous run shared by both inputs), despite the name `lcs`
def lcs(X, Y):
    """Return the length of the longest common *substring* of X and Y.

    Despite the name, this is not the longest common subsequence: the running
    length is reset to 0 whenever two elements differ, so only contiguous
    matching runs are counted.

    X, Y: sequences that support len() and iteration (strings in this notebook).
    Returns 0 when either input is empty or when nothing matches.
    """
    longest = 0
    # prev_row[j] holds the length of the matching run that ends at the
    # previous element of X and at Y[j-1]; column 0 is the empty-prefix border.
    prev_row = [0] * (len(Y) + 1)
    for x_item in X:
        cur_row = [0]
        for j, y_item in enumerate(Y, start=1):
            # extend the diagonal run on a match, otherwise start over
            run = prev_row[j - 1] + 1 if x_item == y_item else 0
            cur_row.append(run)
            if run > longest:
                longest = run
        prev_row = cur_row
    return longest

# Matches a u-prefixed string literal: the letter u at a word boundary,
# immediately followed by a single or double quote, optional whitespace and
# at least one non-space character. Inside [...] a '|' is a literal character,
# not alternation, so it must not appear in the set (it made "u|x" match).
UNICODE_LITERAL_RE = re.compile(r'''\bu['"]\s*\S+''')

def eval_expl(r, col):
  """Decide whether the generated explanation in r[col] counts as correct.

  r:   row-like object indexable by "explanation" (the annotated ground truth)
       and by `col` (the explanation text produced by the explainer).
  col: name of the column holding the generated explanation.

  Returns True when r[col] contains a u-prefixed string literal
  (presumably because the prefix itself is the discriminating feature -
  confirm), or when r[col] and r["explanation"] share a contiguous run of at
  least EXPL_EVAL_THR characters as measured by lcs().
  Relies on the module-level names lcs and EXPL_EVAL_THR.
  """
  if UNICODE_LITERAL_RE.search(r[col]):
    return True
  return lcs(r["explanation"], r[col]) >= EXPL_EVAL_THR



def predict (lines, shape = 1):
  """Score a batch of code lines with the loaded Keras model `m`.

  lines: iterable of str; each line is encoded as its UTF-8 bytes and then
         padded / truncated at the end to MAX_LEN values.
  shape: with 2, return two columns [1 - score, score] (the layout handed to
         LIME in explain()); with any other value return the raw output of
         m.predict unchanged.

  The rest of the notebook treats a score above 0.5 as class 3 (py3) and
  anything else as class 2 (py2).
  Relies on the module-level names m, MAX_LEN and pad_sequences.
  """
  encoded = [bytearray(text, encoding="utf-8") for text in lines]
  batch = pad_sequences(encoded, maxlen=MAX_LEN, padding='post', truncating='post')
  scores = m.predict(batch)
  if shape != 2:
    return scores
  # assumes scores is 2-D with one column (axis=1 concatenation) - TODO confirm
  return np.concatenate([1 - scores, scores], axis=1)


def explain(line):
  """Return the slice of `line` around the position LIME ranks highest.

  Runs the module-level `explainer` on `line` with a single feature and
  10*32 perturbed samples (more samples make the run very slow), reads the
  index of the top feature for label 1, and cuts a window around it.

  The window starts WIN_SIZE//2 + 1 characters before the index and stops
  WIN_SIZE//2 characters after it, clipped to the bounds of `line`, so it
  reaches one character further to the left than to the right.

  NOTE(review): no model_regressor is passed, so LIME's default regressor is
  used. To use the custom regressor defined in this notebook, add
  model_regressor=FusedLassoRegressor() to the explain_instance call.
  """
  def two_class_scores(texts):
    return predict(texts, shape=2)

  exp = explainer.explain_instance(line, two_class_scores, num_features=1,
                                   num_samples=10*32)
  # as_map()[1] is the (feature index, weight) list for label 1; take the first index
  top_idx = exp.as_map()[1][0][0]
  half = WIN_SIZE // 2
  start = max(0, top_idx - half - 1)
  stop = min(top_idx + half + 1, len(line))
  return line[start:stop]


class FusedLassoRegressor(LinearModel, RegressorMixin):
  """Linear model without intercept whose successive coefficients are fused.

  Minimises  sum_i w_i * (x_i . beta - y_i)**2  +  penalty * ||D beta||_2,
  where D beta is the vector of differences between neighbouring coefficients.
  It exposes coef_ / intercept_ and accepts sample_weight in fit(), which is
  what LIME requires of a custom model_regressor.

  NOTE(review): a fused LASSO conventionally penalises the L1 norm of the
  differences; this class uses the (unsquared) L2 norm - confirm that this is
  intended before relying on the name.
  """

  def __init__(self, penalty=10**4):
    # penalty: weight of the difference term (default is the value that was
    # previously hard-coded in fit()).
    self.penalty = penalty

  def fit(self, X, y, sample_weight=None):
    """Fit the coefficients with cvxpy.

    X: 2-D array (n_samples, n_features).
    y: 1-D target vector with one entry per row of X.
    sample_weight: optional non-negative per-sample weights. Previously this
        argument was accepted but ignored, which discarded the locality
        weights LIME passes in; it now weights the squared residuals.

    Returns self. Raises ValueError when the solver yields no solution.
    """
    _, n_feat = X.shape
    beta = cp.Variable(n_feat)
    residual = X @ beta - y
    if sample_weight is not None:
      # sum_i w_i * r_i**2 == || sqrt(w) * r ||_2**2
      root_w = np.sqrt(np.asarray(sample_weight, dtype=float))
      residual = cp.multiply(root_w, residual)
    loss = cp.sum_squares(residual)
    if n_feat > 1:
      # with a single feature there are no neighbouring pairs to fuse, so the
      # penalty is skipped instead of building an empty difference vector
      forwardDiff = beta[1:] - beta[0:-1]
      loss = loss + self.penalty * cp.norm(forwardDiff,2)
    prob = cp.Problem(cp.Minimize(loss))
    prob.solve()
    if beta.value is None:
      # fail loudly here rather than leaving coef_ = None for a later crash
      raise ValueError(f"FusedLassoRegressor: solver returned no solution (status: {prob.status})")
    self.coef_ = beta.value
    self.intercept_ = 0
    return self


# ---------------------------------------------------------------------------
# Driver: load one model, explain every correctly classified annotated line,
# and report how many explanations are judged correct by eval_expl().
# ---------------------------------------------------------------------------

# Model file to explain. The dataset name and the window size are parsed out
# of this file name below (second and third "_"-separated fields).
model_str = 'MM_dataset10k_Win5_NF32.keras'
# Alternative model files - uncomment exactly one to explain a different model.
# model_str = 'MM_dataset10k_Win5_NF64.keras'
# model_str = 'MM_dataset10k_Win5_NF128.keras'
# model_str = 'MM_dataset10k_Win7_NF32.keras'
# model_str = 'MM_dataset10k_Win7_NF64.keras'
# model_str = 'MM_dataset10k_Win7_NF128.keras'
# model_str = 'MM_dataset10k_Win10_NF32.keras'
# model_str = 'MM_dataset10k_Win10_NF64.keras'
# model_str = 'MM_dataset10k_Win10_NF128.keras'

# The names m, MAX_LEN, WIN_SIZE and EXPL_EVAL_THR are module-level globals
# read by predict(), explain() and eval_expl(); they must be set before those
# functions are called.
m = models.load_model(model_str)
# predict() pads / truncates every encoded line to this length
MAX_LEN = 100
# third field with its first three characters dropped, e.g. "Win5" -> 5
WIN_SIZE = int (model_str.split("_")[2][3:])
# NOTE(review): EXPL_WIN is not referenced anywhere else in the visible code
EXPL_WIN = WIN_SIZE + 3
# minimum shared contiguous run length for eval_expl() to accept an explanation
EXPL_EVAL_THR = 3
# second field plus ".csv", e.g. "dataset10k" -> "dataset10k.csv"
dataset = model_str.split("_")[1]+".csv"
m.summary()

print(WIN_SIZE)
print(dataset)


# Explain for dataset

# Load the data once. The '__future__' and 'xrange' columns are read as
# object dtype (presumably to avoid mixed-type inference - confirm).
df = pd.read_csv(dataset, dtype={
    '__future__': 'object',
    'xrange': 'object'})
# drop rows that have no code line
df = df.drop ( df[df["lines of code"].isnull()].index )

# consider only lines with annotated explanations
df_expl = df[df["explanation"].notnull()]
# add the model score as column "pred" (predict() output flattened to 1-D and
# aligned on df_expl's index)
df_expl = df_expl.join( pd.Series(name="pred", data = predict(df_expl["lines of code"]).reshape(-1), index=df_expl.index) )
# keep only correct predictions: a score above 0.5 maps to class 3, anything
# else to class 2, and the result must equal the "class" column
df_expl["pred_class"] = df_expl["pred"].map(lambda x: 3 if x>0.5 else 2)
df_expl = df_expl.loc[df_expl["pred_class"] == df_expl["class"]]

# Character-level LIME explainer; explain() uses the feature index it returns
# as a position in the line. No random_state is set, so the sampled
# perturbations (and hence the results) can differ between runs.
explainer = LimeTextExplainer(class_names=["py2", "py3"], char_level=True, bow=False)


# work on a copy so df_expl itself is left without the result columns
df_small = df_expl.copy()
# one explanation window per code line (one LIME run per row - slow)
df_small['lemna'] = df_small['lines of code'].map(explain)

# row-wise verdict: is the window an acceptable explanation?
df_small["acc_expl_lemna"] = df_small.apply(eval_expl, col="lemna", axis=1)
# number of accepted explanations (True counts as 1)
acc = sum(df_small["acc_expl_lemna"])
# report as "accepted/total" and as a percentage with two decimals;
# NOTE(review): the percentage divides by the row count and fails if it is 0
print (f"{acc}/{df_small.shape[0]}")
print (f"{acc * 100 / df_small.shape[0]:.2f}")


In [None]:
# -*- coding: utf-8 -*-
"""py2vpy3XaiLemna.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/14c9sxygiZNsY6OpPvlxvyrgj5TazVQDU

Shap
"""

# !pip install lime


import lime
from lime.lime_text import LimeTextExplainer
from keras.utils import pad_sequences
from keras import models
import numpy as np
import pandas as pd
from sklearn.linear_model._base import LinearModel,RegressorMixin
from sklearn.linear_model import Ridge, LinearRegression
import cvxpy as cp
import re


# longest common substring (longest contiguous run shared by both inputs), despite the name `lcs`
def lcs(X, Y):
    """Return the length of the longest common *substring* of X and Y.

    Despite the name, this is not the longest common subsequence: the running
    length is reset to 0 whenever two elements differ, so only contiguous
    matching runs are counted.

    X, Y: sequences that support len() and iteration (strings in this notebook).
    Returns 0 when either input is empty or when nothing matches.
    """
    longest = 0
    # prev_row[j] holds the length of the matching run that ends at the
    # previous element of X and at Y[j-1]; column 0 is the empty-prefix border.
    prev_row = [0] * (len(Y) + 1)
    for x_item in X:
        cur_row = [0]
        for j, y_item in enumerate(Y, start=1):
            # extend the diagonal run on a match, otherwise start over
            run = prev_row[j - 1] + 1 if x_item == y_item else 0
            cur_row.append(run)
            if run > longest:
                longest = run
        prev_row = cur_row
    return longest

# Matches a u-prefixed string literal: the letter u at a word boundary,
# immediately followed by a single or double quote, optional whitespace and
# at least one non-space character. Inside [...] a '|' is a literal character,
# not alternation, so it must not appear in the set (it made "u|x" match).
UNICODE_LITERAL_RE = re.compile(r'''\bu['"]\s*\S+''')

def eval_expl(r, col):
  """Decide whether the generated explanation in r[col] counts as correct.

  r:   row-like object indexable by "explanation" (the annotated ground truth)
       and by `col` (the explanation text produced by the explainer).
  col: name of the column holding the generated explanation.

  Returns True when r[col] contains a u-prefixed string literal
  (presumably because the prefix itself is the discriminating feature -
  confirm), or when r[col] and r["explanation"] share a contiguous run of at
  least EXPL_EVAL_THR characters as measured by lcs().
  Relies on the module-level names lcs and EXPL_EVAL_THR.
  """
  if UNICODE_LITERAL_RE.search(r[col]):
    return True
  return lcs(r["explanation"], r[col]) >= EXPL_EVAL_THR



def predict (lines, shape = 1):
  """Score a batch of code lines with the loaded Keras model `m`.

  lines: iterable of str; each line is encoded as its UTF-8 bytes and then
         padded / truncated at the end to MAX_LEN values.
  shape: with 2, return two columns [1 - score, score] (the layout handed to
         LIME in explain()); with any other value return the raw output of
         m.predict unchanged.

  The rest of the notebook treats a score above 0.5 as class 3 (py3) and
  anything else as class 2 (py2).
  Relies on the module-level names m, MAX_LEN and pad_sequences.
  """
  encoded = [bytearray(text, encoding="utf-8") for text in lines]
  batch = pad_sequences(encoded, maxlen=MAX_LEN, padding='post', truncating='post')
  scores = m.predict(batch)
  if shape != 2:
    return scores
  # assumes scores is 2-D with one column (axis=1 concatenation) - TODO confirm
  return np.concatenate([1 - scores, scores], axis=1)


def explain(line):
  """Return the slice of `line` around the position LIME ranks highest.

  Runs the module-level `explainer` on `line` with a single feature and
  10*32 perturbed samples (more samples make the run very slow), reads the
  index of the top feature for label 1, and cuts a window around it.

  The window starts WIN_SIZE//2 + 1 characters before the index and stops
  WIN_SIZE//2 characters after it, clipped to the bounds of `line`, so it
  reaches one character further to the left than to the right.

  NOTE(review): no model_regressor is passed, so LIME's default regressor is
  used. To use the custom regressor defined in this notebook, add
  model_regressor=FusedLassoRegressor() to the explain_instance call.
  """
  def two_class_scores(texts):
    return predict(texts, shape=2)

  exp = explainer.explain_instance(line, two_class_scores, num_features=1, num_samples=10*32)
  # as_map()[1] is the (feature index, weight) list for label 1; take the first index
  top_idx = exp.as_map()[1][0][0]
  half = WIN_SIZE // 2
  start = max(0, top_idx - half - 1)
  stop = min(top_idx + half + 1, len(line))
  return line[start:stop]


class FusedLassoRegressor(LinearModel, RegressorMixin):
  """Linear model without intercept whose successive coefficients are fused.

  Minimises  sum_i w_i * (x_i . beta - y_i)**2  +  penalty * ||D beta||_2,
  where D beta is the vector of differences between neighbouring coefficients.
  It exposes coef_ / intercept_ and accepts sample_weight in fit(), which is
  what LIME requires of a custom model_regressor.

  NOTE(review): a fused LASSO conventionally penalises the L1 norm of the
  differences; this class uses the (unsquared) L2 norm - confirm that this is
  intended before relying on the name.
  """

  def __init__(self, penalty=10**4):
    # penalty: weight of the difference term (default is the value that was
    # previously hard-coded in fit()).
    self.penalty = penalty

  def fit(self, X, y, sample_weight=None):
    """Fit the coefficients with cvxpy.

    X: 2-D array (n_samples, n_features).
    y: 1-D target vector with one entry per row of X.
    sample_weight: optional non-negative per-sample weights. Previously this
        argument was accepted but ignored, which discarded the locality
        weights LIME passes in; it now weights the squared residuals.

    Returns self. Raises ValueError when the solver yields no solution.
    """
    _, n_feat = X.shape
    beta = cp.Variable(n_feat)
    residual = X @ beta - y
    if sample_weight is not None:
      # sum_i w_i * r_i**2 == || sqrt(w) * r ||_2**2
      root_w = np.sqrt(np.asarray(sample_weight, dtype=float))
      residual = cp.multiply(root_w, residual)
    loss = cp.sum_squares(residual)
    if n_feat > 1:
      # with a single feature there are no neighbouring pairs to fuse, so the
      # penalty is skipped instead of building an empty difference vector
      forwardDiff = beta[1:] - beta[0:-1]
      loss = loss + self.penalty * cp.norm(forwardDiff,2)
    prob = cp.Problem(cp.Minimize(loss))
    prob.solve()
    if beta.value is None:
      # fail loudly here rather than leaving coef_ = None for a later crash
      raise ValueError(f"FusedLassoRegressor: solver returned no solution (status: {prob.status})")
    self.coef_ = beta.value
    self.intercept_ = 0
    return self

# ---------------------------------------------------------------------------
# Driver: repeat the explain-and-evaluate run for several models and write one
# accuracy percentage per model (one line each, in list order) to output.txt.
# The file is opened in 'w' mode, so a previous output.txt is overwritten.
# ---------------------------------------------------------------------------
with open("output.txt", 'w') as f:

  # Models to evaluate; the dataset name and window size are parsed from each
  # file name (second and third "_"-separated fields).
  for model_str in ['MM_dataset10k_Win10_NF32.keras',
                    'MM_dataset10k_Win10_NF64.keras',
                    'MM_dataset10k_Win10_NF128.keras']:
                    # Left-over entries from an earlier version of the list
                    # (inactive):
                    # ,
                    # 'MM_dataset10k_Win5_NF128.keras',
                    # 'MM_dataset10k_Win7_NF32.keras',
                    # 'MM_dataset10k_Win7_NF64.keras',
                    # 'MM_dataset10k_Win7_NF128.keras',
                    # 'MM_dataset10k_Win10_NF32.keras',
                    # 'MM_dataset10k_Win10_NF64.keras']:

    # The names m, MAX_LEN, WIN_SIZE and EXPL_EVAL_THR are module-level
    # globals read by predict(), explain() and eval_expl(); they are rebound
    # on every iteration before those functions are called.
    m = models.load_model(model_str)
    # predict() pads / truncates every encoded line to this length
    MAX_LEN = 100
    # third field with its first three characters dropped, e.g. "Win10" -> 10
    WIN_SIZE = int (model_str.split("_")[2][3:])
    # NOTE(review): EXPL_WIN is not referenced anywhere else in the visible code
    EXPL_WIN = WIN_SIZE + 3
    # minimum shared contiguous run length for eval_expl() to accept an explanation
    EXPL_EVAL_THR = 3
    # second field plus ".csv", e.g. "dataset10k" -> "dataset10k.csv"
    dataset = model_str.split("_")[1]+".csv"
    m.summary()

    print(WIN_SIZE)
    print(dataset)


    # Explain for dataset

    # Load the data (re-read on every iteration). The '__future__' and
    # 'xrange' columns are read as object dtype (presumably to avoid
    # mixed-type inference - confirm).
    df = pd.read_csv(dataset, dtype={
        '__future__': 'object',
        'xrange': 'object'})
    # drop rows that have no code line
    df = df.drop ( df[df["lines of code"].isnull()].index )

    # consider only lines with annotated explanations
    df_expl = df[df["explanation"].notnull()]
    # add the model score as column "pred" (predict() output flattened to 1-D
    # and aligned on df_expl's index)
    df_expl = df_expl.join( pd.Series(name="pred", data = predict(df_expl["lines of code"]).reshape(-1), index=df_expl.index) )
    # keep only correct predictions: a score above 0.5 maps to class 3,
    # anything else to class 2, and the result must equal the "class" column
    df_expl["pred_class"] = df_expl["pred"].map(lambda x: 3 if x>0.5 else 2)
    df_expl = df_expl.loc[df_expl["pred_class"] == df_expl["class"]]

    # Character-level LIME explainer; explain() uses the feature index it
    # returns as a position in the line. No random_state is set, so the
    # sampled perturbations (and hence the results) can differ between runs.
    explainer = LimeTextExplainer(class_names=["py2", "py3"], char_level=True, bow=False)


    # work on a copy so df_expl itself is left without the result columns
    df_small = df_expl.copy()
    # one explanation window per code line (one LIME run per row - slow)
    df_small['lemna'] = df_small['lines of code'].map(explain)

    # row-wise verdict: is the window an acceptable explanation?
    df_small["acc_expl_lemna"] = df_small.apply(eval_expl, col="lemna", axis=1)
    # number of accepted explanations (True counts as 1)
    acc = sum(df_small["acc_expl_lemna"])
    # report as "accepted/total" and as a percentage with two decimals;
    # NOTE(review): the percentage divides by the row count and fails if it is 0
    print (f"{acc}/{df_small.shape[0]}")
    print (f"{acc * 100 / df_small.shape[0]:.2f}")
    # only the percentage is written; the model name is not recorded in the file
    f.write(f"{acc * 100 / df_small.shape[0]:.2f}\n")
