<a href="https://colab.research.google.com/github/nidharap/Notebooks/blob/master/IMDB_Classification_RoBERTa_LIME.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Run LIME on a fine-tuned RoBERTa model

In [38]:
!pip install transformers
!pip install eli5



In [39]:
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import RobertaModel, RobertaTokenizer
import torch
import numpy as np
import os
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from collections import defaultdict
from torch import nn, optim
import time
from bs4 import BeautifulSoup
from torch.utils.data import Dataset, DataLoader
import glob
import eli5
from eli5.lime import TextExplainer
%matplotlib inline
# %config InlineBackend.figure_format='retina'
# sns.set(style='whitegrid', palette='muted', font_scale=1.2)
# HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
# sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default size for every matplotlib figure drawn in this notebook.
rcParams['figure.figsize'] = (12, 8)

# One seed shared by numpy and torch (and reused further down) for repeatable runs.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [40]:
# The dataset CSVs and the saved checkpoints are read from Google Drive,
# so Drive has to be mounted into the Colab filesystem first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Dataset
### The dataset can be downloaded from several places; I used this one: https://www.kaggle.com/atulanandjha/imdb-50k-movie-reviews-test-your-bert

In [41]:
# Read the train and test splits from the mounted Drive folder.
df = pd.read_csv("/content/drive/My Drive/Datasets/IMDBMovieReviews/train.csv")
df_test = pd.read_csv("/content/drive/My Drive/Datasets/IMDBMovieReviews/test.csv")

# Position in this list == encoded label: 'neg' -> 0, every other value -> 1.
class_names = ['neg', 'pos']
for frame in (df, df_test):
    frame['label_encoded'] = (frame['sentiment'] != 'neg').astype(int)

# Whitespace-separated token count per training review
# (measured on the raw text, i.e. before the HTML is stripped below).
df['review_len'] = [len(review.split()) for review in df['text']]

# Strip HTML markup and keep only the text content of each review.
# NOTE(review): `.text` joins the pieces without a separator, so sentences that
# were separated only by a tag end up glued together -- confirm this is intended.
for frame in (df, df_test):
    frame['text'] = [BeautifulSoup(raw, "lxml").text for raw in frame['text']]

# Hold out 20% of the training reviews as a validation set.
df_train, df_val = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)

df.shape, df_train.shape, df_val.shape, df_test.shape

((25000, 4), (20000, 4), (5000, 4), (25000, 3))

In [42]:
# --- model / checkpoint ---
# Hugging Face checkpoint name (alternative: 'bert-base-cased').
PRE_TRAINED_MODEL_NAME = 'roberta-base'
# File-name prefix used when looking up saved training checkpoints.
MODEL_SAVE_NAME = "imdb_movie_large_roberta_state"
# Set to True to load a previously trained model from local storage.
LOAD_PREV_MODEL = True

# --- tokenisation ---
# Most reviews are shorter than 500 words, so use the maximum length BERT-style models allow.
MAX_LEN = 512

# --- batching ---
# Batch size is a trade-off: larger batches can run out of GPU memory.
# 16 works with the larger GPUs of a Colab Pro subscription; on the free tier
# a batch size of 8 is the safer choice, at the cost of longer training time.
BATCH_SIZE = 16

In [43]:
# Tokenizer and encoder are both loaded from the checkpoint named in PRE_TRAINED_MODEL_NAME.
tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
# Despite the variable name this is a RobertaModel; it is passed to SentimentClassifier below.
bert_model = RobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [44]:
class SentimentClassifier(nn.Module):
  '''
  Classification head on top of a Hugging Face transformer encoder.

  The encoder's pooled output is passed through dropout and one
  fully-connected layer that produces a raw score (logit) per class.

  Args:
    n_classes: number of output classes.
    bertmodel: encoder module. It must expose `config.hidden_size` and, when
      called with `input_ids` / `attention_mask`, return an output whose
      element at index 1 is the pooled representation.
    dropout_p: dropout probability applied to the pooled output.
  '''
  def __init__(self, n_classes, bertmodel, dropout_p=0.3):
    super().__init__()
    self.bert = bertmodel
    self.dropout = nn.Dropout(p=dropout_p)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    '''
    Run the encoder and the head.

    Returns the un-normalised class scores produced by the final linear
    layer (one row per input sequence, one column per class).
    '''
    encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Index instead of tuple-unpacking: the encoder may return a plain tuple or
    # a mapping-style output object. Unpacking the latter iterates over its
    # field names and hands strings to dropout; integer indexing works for both.
    pooled_output = encoder_output[1]
    return self.out(self.dropout(pooled_output))

In [45]:
# Build the classifier with one output per entry in class_names and move it to the selected device.
model = SentimentClassifier(len(class_names), bertmodel=bert_model).to(device)

In [46]:
# Length of the full training schedule, in epochs.
EPOCHS = 10

# AdamW here is the optimizer implementation shipped with the transformers library:
# https://huggingface.co/transformers/main_classes/optimizer_schedules.html
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
loss_fn = nn.CrossEntropyLoss().to(device)

# `train_data_loader` is not defined in any earlier cell, so reading its length
# raised NameError on a fresh kernel. The number of batches per epoch is derived
# instead from the training-set size and BATCH_SIZE (ceiling division, i.e. the
# length of a DataLoader that keeps the last partial batch).
steps_per_epoch = (len(df_train) + BATCH_SIZE - 1) // BATCH_SIZE
print("len(train_data_loader): ", steps_per_epoch)
total_steps = steps_per_epoch * EPOCHS
print("total_steps: ", total_steps)

# Linear learning-rate schedule over the whole run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

len(train_data_loader):  1250
total_steps:  12500


In [47]:
# Placeholders; both are replaced when a saved checkpoint is loaded further down.
history = best_accuracy = None

In [48]:
# `map_location` argument for torch.load: move every loaded storage onto the GPU
# when CUDA is available, otherwise map everything to the CPU.
map_location = (
    (lambda storage, loc: storage.cuda())
    if torch.cuda.is_available()
    else 'cpu'
)

In [49]:
# Saved checkpoints whose file name starts with MODEL_SAVE_NAME, sorted by name.
# The glob runs once (it used to be evaluated twice) and only when loading is requested.
checkpoint_paths = (
  sorted(glob.glob("/content/drive/My Drive/Datasets/IMDBMovieReviews/{}*".format(MODEL_SAVE_NAME)))
  if LOAD_PREV_MODEL else []
)

if LOAD_PREV_MODEL and checkpoint_paths:
  #Load previous model to continue training
  # The last name in sorted (lexicographic) order is treated as the most recent checkpoint.
  # NOTE(review): with numeric suffixes this breaks past 9 ("_10" sorts before "_3").
  state_file_name = checkpoint_paths[-1]
  print("Loading : {}".format(state_file_name))
  # NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
  state = torch.load(state_file_name, map_location=map_location)
  print(state.keys())

  model.load_state_dict(state['model'])

  optimizer.load_state_dict(state['optimizer'])
  scheduler.load_state_dict(state['scheduler'])
  history = state['history']
  # Bug fix: this used to read state['history'], so best_accuracy held the
  # history object instead of the saved best score.
  best_accuracy = state['best_accuracy']

  # Only the epochs after the saved one remain to be run.
  # NOTE: EPOCHS is rewritten in place, so re-running this cell without first
  # re-running the cell that sets EPOCHS subtracts a second time.
  EPOCHS = EPOCHS - state['epoch']-1
  # Drop the reference so the loaded state dicts can be garbage-collected.
  state = None

Loading : /content/drive/My Drive/Datasets/IMDBMovieReviews/imdb_movie_large_roberta_state_3
dict_keys(['epoch', 'model', 'optimizer', 'scheduler', 'history', 'best_accuracy'])




In [50]:
# Show the label names; the position in the list is the encoded label ('neg' -> 0, 'pos' -> 1).
class_names

['neg', 'pos']

In [51]:
class SentimentDataset(Dataset):
  '''
  PyTorch dataset that tokenizes raw texts one item at a time.

  Meant to be wrapped in a DataLoader, which splits the texts into batches.

  Args:
    all_text: indexable collection of texts; item `i` is read as `all_text[i]`
      and converted with `str()`.
    tokenizer: object exposing a Hugging Face style `encode_plus` method.
    max_len: `max_length` passed to the tokenizer.
    add_special_tokens, truncation, return_attention_mask,
    return_token_type_ids, return_tensors: forwarded unchanged to `encode_plus`.
    pad_to_max_length: when True every item is padded to `max_len`.

  Each item is a dict with the flattened `input_ids` and `attention_mask`
  tensors, so the tokenizer must return both as tensors (the defaults do).
  '''
  def __init__(self, all_text, tokenizer, max_len=512, add_special_tokens=True,
               pad_to_max_length=True, truncation=True,
               return_attention_mask=True, return_token_type_ids=False,
               return_tensors='pt'):
    # Bug fix: this was hard-coded to False, silently ignoring the argument.
    self.return_token_type_ids = return_token_type_ids
    self.pad_to_max_length = pad_to_max_length
    self.add_special_tokens = add_special_tokens
    self.truncation = truncation
    self.return_tensors = return_tensors
    self.return_attention_mask = return_attention_mask

    self.max_len = max_len
    self.text = all_text
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.text)

  def __getitem__(self, item):
    text = str(self.text[item])

    # `padding=` is the replacement for the deprecated `pad_to_max_length` flag;
    # it belongs to the same tokenizer API as the `truncation=` argument already
    # used here, so no newer library version is required.
    encoding = self.tokenizer.encode_plus(
        text,
        max_length=self.max_len,
        add_special_tokens=self.add_special_tokens,
        return_token_type_ids=self.return_token_type_ids,
        padding='max_length' if self.pad_to_max_length else False,
        return_attention_mask=self.return_attention_mask,
        return_tensors=self.return_tensors,
        truncation=self.truncation
    )

    # The tokenizer output carries a leading batch axis of size 1; flatten it
    # away so the DataLoader can stack items into a batch.
    return {
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
    }

In [52]:
class ClassifierWrapper:
  '''
  Adapter exposing a `predict_proba(texts)` method on top of the torch model,
  which is the interface LIME calls.

  The texts are tokenized through SentimentDataset and fed to the model in
  batches, so a large number of perturbed samples (e.g. 5000) does not have to
  fit on the device at once.
  '''
  def __init__(self, input_model, tokenizer,
               batch_size=512, max_len=512, add_special_tokens=True,
               pad_to_max_length=True, truncation=True,
               return_attention_mask=True, return_token_type_ids=False,
               return_tensors='pt', num_workers=4):
    '''
    Args:
      input_model: torch module called as `model(input_ids=..., attention_mask=...)`
        that returns one row of class scores per input.
      tokenizer: tokenizer handed to SentimentDataset.
      batch_size: texts per forward pass. Inference runs under no_grad, so the
        default is 512; lower it if the hardware runs out of memory.
      max_len, add_special_tokens, pad_to_max_length, truncation,
      return_attention_mask, return_token_type_ids, return_tensors:
        forwarded to SentimentDataset.
      num_workers: DataLoader worker processes (previously hard-coded to 4);
        use 0 to load in the main process.
    '''
    self.dataset_params = {
      "return_token_type_ids": return_token_type_ids,
      "pad_to_max_length": pad_to_max_length,
      "add_special_tokens": add_special_tokens,
      "truncation": truncation,
      "max_len": max_len,
      "return_tensors": return_tensors,
      "return_attention_mask": return_attention_mask,
      "tokenizer": tokenizer
    }
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.model = input_model
    self.model.eval()

  def create_data_loader(self, X=None):
    '''
    Build a DataLoader over `X`.

    When `X` is omitted the texts stored by the last `predict_proba` call
    (`self.X`) are used, which keeps the old zero-argument call working.
    '''
    texts = self.X if X is None else X
    ds = SentimentDataset(all_text=texts, **self.dataset_params)

    return DataLoader(
      ds,
      batch_size=self.batch_size,
      num_workers=self.num_workers
    )

  def predict_proba(self, X):
    '''
    Return the class probabilities for every text in `X` as an NxM numpy
    array (N texts, M classes), the format LIME requires.
    '''
    self.X = np.array(X)
    data_loader = self.create_data_loader(self.X)

    # Re-assert eval mode on every call: if the model was switched back to
    # training mode after this wrapper was built, dropout would otherwise make
    # the returned probabilities random.
    self.model.eval()

    batch_probs = []
    with torch.no_grad():
      for d in data_loader:
        # `device` is the notebook-level torch device the model was moved to.
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)

        outputs = self.model(
          input_ids=input_ids,
          attention_mask=attention_mask
        )

        # Softmax across the class axis; each batch goes to the CPU straight
        # away so results do not pile up in device memory.
        batch_probs.append(nn.functional.softmax(outputs, dim=1).cpu())

    # Concatenate the per-batch (batch, classes) blocks along the sample axis.
    return torch.cat(batch_probs).numpy()

In [53]:
# example = 'Hated it with all my being. Worst movie ever. Mentally- scarred. Help me. It was that bad.TRUST ME'

In [54]:
# Wrap the model so LIME can call predict_proba on raw strings; inference runs in batches of BATCH_SIZE.
clf = ClassifierWrapper(model, tokenizer, batch_size=BATCH_SIZE)
# eli5's LIME text explainer, given the notebook seed as its random_state.
te = TextExplainer(random_state=RANDOM_SEED)

In [55]:
# Explain the first two validation reviews.
for review_text in df_val['text'].head(2):
  print(review_text)
  te.fit(review_text, clf.predict_proba)
  # Inside a loop the value returned by show_prediction is thrown away, so
  # nothing was rendered. `display` is the function IPython makes available in
  # notebook kernels; it renders the explanation explicitly.
  display(te.show_prediction(target_names=class_names))

A woman (Sylvia Kristel) seduces a 15 year old boy (Eric Brown). They have sex...but it's all tied into some stupid plot or something.Easily one of the most disturbing sex comedies ever. Does anyone realize this movie is making light of child molestation? I suppose it's OK cause it's a teenage boy--if we had one with a man seducing a teenage girl there would (rightfully) be outrage. Sorry, but having it done to a boy doesn't excuse it. It's still sick. I realize Brown was of age (he was actually 18 when this was made) but he LOOKS 15. I just find it disturbing that some people find this OK.Plot aside the acting sucks (Kristel is beautiful--but can't act; Brown is easily one of the worst child actors I've ever seen) and the constant nudity gets boring and isn't even remotely erotic.I saw this drivel at a theatre back in 1981. I was 19 and with my 14 year old cousin (who could easily pass for 18). HE wanted to see it--I didn't but I decideD what the heck? We got in and I actually bought 

In [57]:
# Fit the explainer on the first validation review; as the last expression of
# the cell, the explanation returned by show_prediction is rendered as output.
first_val_review = df_val['text'].iloc[0]
te.fit(first_val_review, clf.predict_proba)
te.show_prediction(target_names=class_names)

Contribution?,Feature
1.551,boring
1.461,this
1.32,1
0.692,movie
0.679,sex
0.641,a
0.547,worst
0.502,three
0.497,heck
0.489,it
