In [1]:
#%pip install bertviz
#%pip install ipywidgets
#%pip install lime

In [2]:
import pandas as pd
import numpy as np
import pickle
import os
from bertviz import model_view, head_view
from transformers import AutoTokenizer, utils, AutoModelForSeq2SeqLM
utils.logging.set_verbosity_error()  # Suppress standard warnings
import matplotlib.pyplot as plt

from lime import lime_text

from sklearn.feature_extraction.text import TfidfVectorizer
from utils.int_training import vectorize_input_df, train_classifier, classifier_inference


from lime.lime_text import LimeTextExplainer

# Set CUDA_VISIBLE_DEVICES to "3" for this kernel process (hard-coded, machine-specific value).
# NOTE(review): this assignment runs after the bertviz/transformers imports above; presumably it
# must take effect before CUDA is first initialised — confirm it is actually honoured here.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"


# Name of the results sub-directory; interpolated into the
# ../ensemble_learning/reports/results/<postfix>/ paths when the result pickles are loaded.
ANALYSIS_POSTFIX = "mined_sudden_2024-08-16"

In [3]:
def _read_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    with open(path, "rb") as stream:
        return pickle.load(stream)


# NOTE: pickle.load can execute arbitrary code, so these paths must only point at trusted files.
# Each result set lives in the directory selected by ANALYSIS_POSTFIX.
cv_predictions = _read_pickle(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/cv_results.pickle")
test_predictions = _read_pickle(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/test_results.pickle")
s2_predictions = _read_pickle(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/s2_model_results.pickle")


In [None]:
# Checkpoint and code snippet used for the attention visualisation example.
model_name = "Salesforce/codet5-base-multi-sum"
input_text = "sum(d * 10 ** i for i, d in enumerate(x[::-1]))"
# output_attentions=True configures the model to return attention values from its forward pass.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# PREPARE AN EXAMPLE
# Tokenise the snippet once and reuse the ids for both generation and the forward pass below
# (the original tokenised the same text twice with equivalent settings).
encoder_input_ids = tokenizer(input_text, return_tensors="pt", add_special_tokens=True).input_ids
input_ids = encoder_input_ids  # alias kept so any cell that still refers to `input_ids` keeps working
generated_ids = model.generate(encoder_input_ids, max_length=20)
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output_text)

# Tokenise the generated summary as the decoder-side input. `text_target=` is the supported
# replacement for the deprecated `tokenizer.as_target_tokenizer()` context manager.
decoder_input_ids = tokenizer(text_target=output_text, return_tensors="pt", add_special_tokens=True).input_ids
# Forward pass over (input, generated summary); its encoder/decoder/cross attentions are visualised later.
outputs = model(input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids)

# Token strings used as axis labels by the attention view.
encoder_text = tokenizer.convert_ids_to_tokens(encoder_input_ids[0])
decoder_text = tokenizer.convert_ids_to_tokens(decoder_input_ids[0])
print(decoder_text)

In [None]:
# Collect the three attention families from the forward pass together with the token
# labels for each side, then pass everything to bertviz in a single call.
attention_view_kwargs = dict(
    encoder_attention=outputs.encoder_attentions,
    decoder_attention=outputs.decoder_attentions,
    cross_attention=outputs.cross_attentions,
    encoder_tokens=encoder_text,
    decoder_tokens=decoder_text,
    display_mode="light",
)
model_view(**attention_view_kwargs)

In [6]:

# Fixed seed so that LIME's random perturbation sampling is repeatable across kernel restarts.
LIME_RANDOM_STATE = 42

explainer = LimeTextExplainer(
    class_names=['non-acceptable', 'acceptable'],
    random_state=LIME_RANDOM_STATE,
)
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the cross-validation predictions. The helper returns a dict holding
# the feature matrix ("X"), the labels ("y") and the vectorizer ("vectorizer").
# NOTE(review): acc_rouge=0.15 is presumably the ROUGE cut-off for the "acceptable" label — confirm
# against utils.int_training.vectorize_input_df.
res = vectorize_input_df(df=cv_predictions, vectorizer=vectorizer, fit=True, acc_rouge=0.15)
X, y, vectorizer = res["X"], res["y"], res["vectorizer"]

classifier = train_classifier(X, y)

# Human-readable feature labels: the unique model_set values followed by the TF-IDF vocabulary.
# NOTE(review): this assumes the columns of X are ordered the same way — verify against
# vectorize_input_df, otherwise the importance plot labels are misaligned.
features = list(cv_predictions.model_set.unique()) + list(vectorizer.get_feature_names_out())

In [None]:

# Horizontal bar chart of the classifier's feature importances, smallest at the bottom.
feature_importance = classifier.feature_importances_
sorted_idx = np.argsort(feature_importance)  # ascending order of importance
bar_positions = range(len(sorted_idx))

fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(bar_positions, feature_importance[sorted_idx], align='center')
# Label each bar with the feature name that belongs to its (sorted) importance value.
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(features)[sorted_idx])
ax.set_title('Feature Importance')


In [12]:
def vectorize_obs(input_texts, cv_predictions):
    """Vectorize raw text(s) with the already-fitted module-level ``vectorizer``.

    The texts are appended to a small template frame (the first row of every
    ``model_set`` group in ``cv_predictions``) before being passed through
    ``vectorize_input_df`` with ``fit=False``; only the rows belonging to the
    texts are returned.

    Parameters
    ----------
    input_texts : str or sequence of str
        A single text or a collection of texts. A bare string is treated as a
        one-element list.
    cv_predictions : pandas.DataFrame
        Frame providing the template rows; must contain the columns
        ``input_sequence``, ``model_set`` and ``catboost_perf_hat``.

    Returns
    -------
    The trailing ``len(input_texts)`` rows of the ``"X"`` entry returned by
    ``vectorize_input_df`` (zero rows when ``input_texts`` is empty).

    Notes
    -----
    Reads the module-level ``vectorizer``; it must have been fitted beforehand.
    """
    if isinstance(input_texts, str):
        input_texts = [input_texts]
    n_texts = len(input_texts)

    # One representative row per model_set.
    # NOTE(review): presumably kept so the model_set encoding inside vectorize_input_df
    # sees every category — confirm against that helper.
    template_df = (
        cv_predictions.groupby("model_set")
        .head(1)
        .reset_index(drop=True)[["input_sequence", "model_set", "catboost_perf_hat"]]
    )

    # Build all new rows in one step instead of concatenating inside a loop (which is quadratic).
    # Each new row is a copy of the last template row with its text replaced and model_set set to 0.
    new_rows = template_df.iloc[[-1] * n_texts].copy()
    new_rows["input_sequence"] = list(input_texts)
    new_rows["model_set"] = 0
    dummy_df = pd.concat([template_df, new_rows], axis=0)

    vectorized = vectorize_input_df(df=dummy_df, vectorizer=vectorizer, fit=False, acc_rouge=0.15)["X"]

    # `vectorized[-0:]` would return every row (template rows included), so handle
    # the empty case explicitly.
    if n_texts == 0:
        return vectorized[:0]
    return vectorized[-n_texts:]


In [13]:
def lime_classifier(txt):
    """Prediction function handed to LIME: raw text(s) in, class probabilities out.

    Parameters
    ----------
    txt : str or sequence of str
        Text(s) to score; forwarded to ``vectorize_obs``.

    Returns
    -------
    The result of ``classifier.predict_proba`` on the vectorized texts.

    Notes
    -----
    Reads the module-level ``classifier`` and ``cv_predictions``. No ``global``
    statement is needed because neither name is rebound here (the original
    declaration also misspelled ``classifier`` as ``classfier``).
    """
    vectorized_txt = vectorize_obs(input_texts=txt, cv_predictions=cv_predictions)
    return classifier.predict_proba(vectorized_txt)

In [None]:
# Text to explain: the literal token "model_0" followed by a Seq2SeqTrainer construction snippet.
snippet_to_explain = """model_0 trainer = Seq2SeqTrainer(
                model=model,
                args=training_args,
                data_collator=data_collator,
                train_dataset=fold_train,
                eval_dataset=fold_val,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics,
            )"""

# Explain the classifier's prediction for the snippet, passing num_samples=500 to LIME.
expl = explainer.explain_instance(
    snippet_to_explain,
    lime_classifier,
    num_samples=500,
)

In [None]:
# Render the LIME explanation inline in the notebook.
# NOTE(review): text=False presumably omits the highlighted raw-text panel — confirm against the LIME docs.
expl.show_in_notebook(text=False)