In [231]:
import time
import itertools
import re
import os
import pickle
import numpy as np

from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from lime.lime_text import LimeTextExplainer

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from collections import defaultdict




import tensorflow as tf

# PART 0: DATA LOADING AND PREPROCESSING

In [232]:

def clean_text(text):
    cleaned = re.sub(r'<.*?>', '', text).lower()
    return cleaned

def load_imdb_subset(
    num_samples=5000, 
    min_df=1, 
    max_features=15, 
    stopwords_option=True,
    stop_words = 'english'
):
    
    data = load_files(
        r"C:/Users/migue/Downloads/aclImdb_v1/aclImdb/train",
        categories=['pos','neg'], 
        encoding="utf-8", 
        decode_error="replace"                  
    )
    X_text_all, y_all = data.data, data.target


    X_text_all = [clean_text(txt) for txt in X_text_all]

    # Shuffle & truncate to num_samples
    full_idx = np.arange(len(X_text_all))
    #np.random.shuffle(full_idx)
    subset_idx = full_idx[:num_samples]
    global X_text 
    X_text = [X_text_all[i] for i in subset_idx]
    y = y_all[subset_idx]

    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_text, y, test_size=0.2, random_state=0
    )

    # Vectorizer: presence/absence
    if stopwords_option:
        vectorizer = CountVectorizer(
            binary=True, stop_words=stop_words, 
            min_df=min_df, max_features=max_features
        )
    else:
        vectorizer = CountVectorizer(
            binary=True, stop_words='english', 
            min_df=min_df, max_features=max_features
        )

    vectorizer.fit(X_train)
    return X_train, X_test, y_train, y_test, vectorizer



def train_NN_classifier(X_train, y_train, X_test, y_test, vectorizer):
    """
    Trains a neural network on the binary presence/absence of words.
    Returns the fitted model.
    """
    X_train_bow = vectorizer.transform(X_train)
    X_valid_bow = vectorizer.transform(X_test)
    input_dim = X_train_bow.shape[1]

    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),  # First hidden layer
        Dropout(0.3),  # Dropout with 30% probability
        Dense(32, activation='relu'),  # Second hidden layer
        Dropout(0.2),  # Dropout with 20% probability
        Dense(1, activation='sigmoid')  # Output layer for binary classification
    ])

    # Compile the model
    model.compile(optimizer=Adam(learning_rate = 0.0001), loss='binary_crossentropy', metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    model.fit(X_train_bow, y_train, epochs=50, batch_size=20, validation_data=(X_valid_bow, y_test), verbose=1, callbacks=[early_stopping])
    return model

def get_cached_NN(X_train, y_train, vectorizer, num_samples, max_features, stop_words, X_valid, y_valid):
    """
    Checks if a classifier trained with the given parameters exists.
    If so, load it; otherwise, train it and save it.
    """
    filename = f"cached_classifier_ns{num_samples}_mf{max_features}_sw{stop_words}_NN_classifier_seed0.pkl"
    if os.path.exists(filename):
        print("Loading cached logistic from", filename)
        with open(filename, 'rb') as f:
            clNN = pickle.load(f)
    else:
        print("No cached classifier found. Training a new one...")
        clNN = train_NN_classifier(X_train, y_train, X_valid, y_valid, vectorizer)
        with open(filename, 'wb') as f:
            pickle.dump(clNN, f)
        print("Cached classifier saved as", filename)
    return clNN



# CLASSICAL LIME

In [None]:
def run_classical_lime(
    text_sample, clNN, vectorizer,  
    k_features=20, num_samples=50   
):
    
    """
    Runs classical LIME on a single text instance.
    Returns the top (word, weight) pairs.
    """
    class_names = ["negative", "positive"]
    explainer = LimeTextExplainer(class_names=class_names, feature_selection="auto")

    def predict_proba(texts):
        bow = vectorizer.transform(texts) 
        print('shaspe of box', bow.shape, ', text_sample:',text_sample, 'features: ', k_features, 'samples: ', num_samples)
        # print(bow)
        proba = clNN.predict(bow.toarray())
        if proba.ndim == 1:  # If 1D, reshape to (num_samples, 1)
            proba = proba.reshape(-1, 1)
        #print('proba', proba, 'dimension', proba.shape, 'return', np.hstack((1 - proba, proba)))
        return np.column_stack((1 - proba, proba))  # Return probabilities for both classes
        
        

    explanation = explainer.explain_instance(
        text_sample,
        predict_proba,
        num_features=k_features,
        num_samples=num_samples 
    )
    return explanation.as_list() 

# EXPERIMENTAL ROUTINE

In [None]:
def run_experiment( 
    num_samples=500,
    min_df=1,
    max_features=20,
    stopwords_option=True,
    lime_num_samples=300,
    stop_words = 'english',
):

    # A) Load data
    X_train, X_test, y_train, y_test, vectorizer = load_imdb_subset(
        num_samples=num_samples,
        min_df=min_df,
        max_features=max_features,
        stopwords_option=stopwords_option,
        stop_words = stop_words
    )
    # B) Train model

    clNN = get_cached_NN(X_train, y_train, vectorizer, num_samples, max_features, stop_words, X_test, y_test)

    # Evaluate
    X_test_bow = vectorizer.transform(X_test)
    test_acc = accuracy_score(y_test, clNN.predict(X_test_bow.toarray()) > 0.5)

    lime_times = []

    instance_local_accuracies = []
    #5,6,12,11,10, 0, 1, 2, 3, 4 
    #sample_indices = [5,6,12,11,10, 0, 1, 2, 3, 4]

    X_all = X_train + X_test  # assuming they are lists
    y_all = np.concatenate([y_train, y_test])  # assuming y_train and y_test are numpy arrays

    for idx in range(len(X_all)):
        text_sample = X_all[idx]
        y_true = y_all[idx]

        # 1) Classical LIME
        start_lime = time.time()
        explanation_lime = run_classical_lime(
            text_sample, clNN, vectorizer, 
            k_features=max_features, num_samples=lime_num_samples
        )

        bow = vectorizer.transform([text_sample])
        bin_features = bow.toarray()[0]

        y_pred = clNN.predict(bow.toarray())[0].item()
        y_pred_label = 1 if y_pred >= 0.5 else 0

        instance_accuracy = int(y_pred_label == y_true)

        print("instance_accuracy:", instance_accuracy)
        instance_local_accuracies.append(instance_accuracy)

   
        lime_time = time.time() - start_lime
        lime_times.append(lime_time)

        contributions_lime_abs = [(word, abs(score)) for word, score in explanation_lime]
        
        print("idx", idx, "text sample", text_sample)#, "bin_features", bin_features)
     
        word_weights = defaultdict(list)

        for word, weight in contributions_lime_abs:
        
            word_weights[word].append(weight)

        global_avg_weights = {word: sum(weights) / len(weights) for word, weights in word_weights.items()}

        threshold = 0.01
        filtered_words = {word: avg for word, avg in global_avg_weights.items() if avg >= threshold}
        rubish_words = {word: avg for word, avg in global_avg_weights.items() if avg <= threshold}

        for word in filtered_words:
            local_weights = word_weights[word]
            print(f"Word: {word}, local weights: {local_weights}")


    results = {
        "local_accuracy": np.mean(instance_local_accuracies),
        "lime_time_avg": round(np.mean(lime_times), 4),
        "global_acc": np.mean(test_acc)
    }
    return results



# MAIN

In [None]:
import pandas as pd
import sys, os

sys.path.append(os.getcwd())
sys.path.append(os.path.dirname(os.getcwd()))

if __name__ == "__main__":


    # Parameter grid to systematically vary certain settings
    param_grid = {
        "num_samples": [1000],
        "max_features": [500],
        "stopwords_option": [True],
        "lime_num_samples": [300],
        "stop_words": ['english'],
           
    }

    combos = list(itertools.product(*param_grid.values()))
    all_results = []

    for combo in combos:
        (num_samples_, max_features_, stopwords_, lime_samps_, stop_words_) = combo
        
        print("\n==================================")
        print(f"Running experiment with: "
              f"num_samples={num_samples_}, "
              f"max_features={max_features_}, "
              f"stopwords={stopwords_}, "
              f"lime_num_samples={lime_samps_}, "
              f"stop_words={stop_words_},")
        
        res = run_experiment(
            num_samples=num_samples_,
            max_features=max_features_,
            stopwords_option=stopwords_,
            lime_num_samples=lime_samps_,
            stop_words=stop_words_,
        )
        res_row = {
            "num_samples": num_samples_,
            "max_features": max_features_,
            "stopwords": stopwords_,
            "lime_num_samples": lime_samps_,
            "local_accuracy": res["local_accuracy"],
            "lime_time_avg": res["lime_time_avg"],
            "global_acc": res["global_acc"]
        }
        print("Results =>", res_row)
        all_results.append(res_row)

    # Save results to CSV
    df = pd.DataFrame(all_results)
    df.to_csv("results_expanded_flips.csv", index=False)
    print("\nAll done! Saved results to 'results_expanded_flips.csv'.")


Running experiment with: num_samples=10, max_features=50, stopwords=True, lime_num_samples=300, stop_words=english,
Loading cached logistic from cached_classifier_ns10_mf50_swenglish_NN_classifier_seed0.pkl
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
shaspe of box (300, 50) , text_sample: i've just had the evidence that confirmed my suspicions. a bunch of kids, 14 to 22 put on the dvd of "titanic" on a fantastic state of the art mega screen home entertainment type deal. only two of them had actually seen it before. but they all had seen the moment of kate, leo and celine dion so many times that most of them felt they had seen the whole movie. shortly after the epic started, they started to get restless, some of them left asking the others -- "call us when the iceberg appears" over an hour and a half into the movie, only the two girls who had seen the movie before, were still there. they started shouting: iceberg, iceberg. a stampede followed, they all came