In [1]:
import pandas as pd
import numpy as np

from transformers import RobertaForMultipleChoice, RobertaTokenizer, RobertaConfig, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from tqdm.notebook import tqdm
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import lime
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

import datetime, time
import random
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

import pickle

%matplotlib inline

In [2]:
df = pd.read_csv("dataset/winogender/all_sentences.tsv", delimiter="\t")
df.head(3)

Unnamed: 0,sentid,sentence
0,technician.customer.1.male.txt,The technician told the customer that he could...
1,technician.customer.1.female.txt,The technician told the customer that she coul...
2,technician.customer.1.neutral.txt,The technician told the customer that they cou...


In [3]:
gender_words= ["he","she","they","his","her","their","him","them"]

In [4]:
gender_arr = []
for _, row in tqdm(df.iterrows()):
    _s = row["sentence"].split(" ")
    for gw in gender_words:
        if gw in _s:
            gender_arr.append(gw)

0it [00:00, ?it/s]

In [5]:
df["gender_word"] = gender_arr

df.head(3)

Unnamed: 0,sentid,sentence,gender_word
0,technician.customer.1.male.txt,The technician told the customer that he could...,he
1,technician.customer.1.female.txt,The technician told the customer that she coul...,she
2,technician.customer.1.neutral.txt,The technician told the customer that they cou...,they


In [6]:
df.gender_word.value_counts()

he       178
she      178
they     178
her       62
his       54
their     54
him        8
them       8
Name: gender_word, dtype: int64

#### DF_MASK

In [None]:
df_mask = df[df['gender_word'].isin(["he", "she"])].copy()

In [None]:
_masked_sentences= []

for _, row in df_mask.iterrows():
    _masked_sentences.append(row["sentence"].replace(" he",  " <MASK>").replace(" she", " <MASK>"))

In [None]:
masked_sentences = list(_masked_sentences)

In [None]:
len(masked_sentences)

In [None]:
df_mask["masked_sentences"] = masked_sentences

In [None]:
df_mask.head(2)

In [None]:
label = []
for index, row in df_mask.iterrows():
    if(row['gender_word'] == 'he'): label.append(1)
    else: label.append(0)

In [None]:
df_mask["label"] = label

In [None]:
df_mask = df_mask.reset_index()

In [None]:
df_mask.head(3)

### SC

In [7]:
config = RobertaConfig.from_pretrained("roberta-base", num_labels=2)
model = RobertaForSequenceClassification.from_pretrained("roberta-base", config=config)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [None]:
# config = RobertaConfig.from_pretrained("models/roberta-base-masked/", num_labels=2)
# model = RobertaForSequenceClassification.from_pretrained("models/roberta-base-masked/", config=config)
# tokenizer = RobertaTokenizer.from_pretrained("models/roberta-base-masked/")

#### Approach 1 - Using sentence

In [None]:
sentence_preds = []
for _, row in df_mask.iterrows():
    sent = row["sentence"]
    inputs = tokenizer(sent, return_tensors="pt")
    
    with torch.no_grad():
        logits = model(**inputs).logits
        
    pred_class_id = logits.argmax(dim=1).item()
    sentence_preds.append(pred_class_id)

In [None]:
set(sentence_preds)

#### Approach 2 - Using Masked Sentence

In [None]:
df_mask.head(2)

In [None]:
masked_predictions = []
for _, row in df_mask.iterrows():
    inputs = tokenizer(row["masked_sentences"], row["gender_word"], return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = logits.argmax(dim=1).item()
    masked_predictions.append(predicted_class_id)

In [None]:
set(masked_predictions)

In [None]:
def predictor(texts):
  outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
  probas = F.softmax(outputs.logits, dim=1).detach().numpy()
  return probas

In [None]:
class_names = ["he", "she"]

In [None]:
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
i = 100
df_mask["masked_sentences"][i], df_mask["gender_word"][i]

In [None]:
exp = explainer.explain_instance(df_mask["masked_sentences"][i], predictor, num_features=2, num_samples=100)
exp.show_in_notebook(text=(df_mask["masked_sentences"][i], df_mask["gender_word"][i]))

#### Approach 3 - Finetuning with Masked Gender Word

In [8]:
masked_sentence = []

for _, row in df.iterrows():
    masked_sentence.append(row.sentence.replace(" " + row.gender_word, " <mask>"))

df["masked_sentence"] = masked_sentence

In [9]:
df.head(5)

Unnamed: 0,sentid,sentence,gender_word,masked_sentence
0,technician.customer.1.male.txt,The technician told the customer that he could...,he,The technician told the customer that <mask> c...
1,technician.customer.1.female.txt,The technician told the customer that she coul...,she,The technician told the customer that <mask> c...
2,technician.customer.1.neutral.txt,The technician told the customer that they cou...,they,The technician told the customer that <mask> c...
3,technician.someone.1.male.txt,The technician told someone that he could pay ...,he,The technician told someone that <mask> could ...
4,technician.someone.1.female.txt,The technician told someone that she could pay...,she,The technician told someone that <mask> could ...


In [10]:
df.shape

(720, 4)

In [11]:
df.gender_word.unique()

array(['he', 'she', 'they', 'his', 'her', 'their', 'him', 'them'],
      dtype=object)

In [12]:
gender_words

['he', 'she', 'they', 'his', 'her', 'their', 'him', 'them']

In [13]:
_masked_sentence, _org_gender_word, _gender_word, _label = [], [], [], []

for _, row in df.iterrows():
    for gw in gender_words:
        if row.gender_word == gw:
            _label.append(True)
        else:
            _label.append(False)
        _masked_sentence.append(row.masked_sentence)
        _org_gender_word.append(row.gender_word)
        _gender_word.append(gw)

In [14]:
len(_masked_sentence)

5760

In [15]:
_label[700]

False

In [16]:
_df = pd.DataFrame(zip(_masked_sentence, _gender_word, _label),
                   columns=['masked_sentence', 'gender_word', 'label'])

_df.head(20)

Unnamed: 0,masked_sentence,gender_word,label
0,The technician told the customer that <mask> c...,he,True
1,The technician told the customer that <mask> c...,she,False
2,The technician told the customer that <mask> c...,they,False
3,The technician told the customer that <mask> c...,his,False
4,The technician told the customer that <mask> c...,her,False
5,The technician told the customer that <mask> c...,their,False
6,The technician told the customer that <mask> c...,him,False
7,The technician told the customer that <mask> c...,them,False
8,The technician told the customer that <mask> c...,he,False
9,The technician told the customer that <mask> c...,she,True


In [17]:
_df.shape

(5760, 3)

In [18]:
mask_df = _df.drop_duplicates()
mask_df.shape

(2960, 3)

In [19]:
# masked_sentence, label, org_gender_word, gender_word = [], [], [], []

# for i in range(len(_masked_sentence)):
#     if _label[i] == True:
#         masked_sentence.append(_masked_sentence[i])
#         org_gender_word.append(_org_gender_word[i])
#         gender_word.append(_gender_word[i])
#         label.append(_label[i])
        
# for i in range(len(_masked_sentence)):
#     if _label[i] == False and _org_gender_word[i] != _gender_word[i]:
#         masked_sentence.append(_masked_sentence[i])
#         org_gender_word.append(_org_gender_word[i])
#         gender_word.append(_gender_word[i])
#         label.append(_label[i])

In [20]:
# len(masked_sentence), len(label), len(gender_word), len(org_gender_word)

In [21]:
# _i = 700
# masked_sentence[_i], org_gender_word[_i], gender_word[_i], label[_i]

In [22]:
mask_df.gender_word.value_counts()

she      462
he       452
they     430
her      348
his      340
their    340
him      294
them     294
Name: gender_word, dtype: int64

In [23]:
mask_df.label.value_counts()

False    2240
True      720
Name: label, dtype: int64

In [24]:
train_df = mask_df.groupby("label").sample(frac=0.6, random_state=2023)
val_test_df = mask_df.loc[mask_df.index.difference(train_df.index)]

val_df = val_test_df.groupby("label").sample(frac=0.6, random_state=2023)
test_df = val_test_df.loc[val_test_df.index.difference(val_df.index)]

print(train_df.shape, val_df.shape, test_df.shape)

(1776, 3) (711, 3) (473, 3)


In [25]:
def encode_input(sentences, gender_word, label, maxlen):

  encoded_input_ids = []
  encoded_input_attn_mask = []

  for i in tqdm(range(0, len(sentences))):
    encoded_input = tokenizer.encode_plus(sentences[i],
                                          gender_word[i],
                                        add_special_tokens=True,
                                        max_length=maxlen,
                                        return_attention_mask=True,
                                        return_tensors="pt",
                                        truncation=True,
                                        padding="max_length")
    
    encoded_input_ids.append(encoded_input.input_ids)
    encoded_input_attn_mask.append(encoded_input.attention_mask)

  encoded_input_ids = torch.cat(encoded_input_ids, dim=0)
  encoded_input_attn_mask = torch.cat(encoded_input_attn_mask, dim=0)
  label = torch.tensor(label, dtype=torch.long)

  return (encoded_input_ids, encoded_input_attn_mask, label)

In [26]:
train_sentences = train_df.masked_sentence.values
val_sentences = val_df.masked_sentence.values
test_sentences = test_df.masked_sentence.values

train_gender_word = train_df.gender_word.values
val_gender_word = val_df.gender_word.values
test_gender_word = test_df.gender_word.values

train_labels = train_df.label.values
val_labels = val_df.label.values
test_labels = test_df.label.values

In [27]:
train_encoded_input_ids, train_encoded_input_attn_mask, train_label = encode_input(sentences = train_sentences, gender_word = train_gender_word, label = train_labels, maxlen = 64)
val_encoded_input_ids, val_encoded_input_attn_mask, val_label = encode_input(sentences = val_sentences,  gender_word = val_gender_word, label = val_labels, maxlen = 64)
test_encoded_input_ids, test_encoded_input_attn_mask, test_label = encode_input(sentences = test_sentences,  gender_word = test_gender_word, label = test_labels, maxlen = 64)

  0%|          | 0/1776 [00:00<?, ?it/s]

  0%|          | 0/711 [00:00<?, ?it/s]

  0%|          | 0/473 [00:00<?, ?it/s]

In [28]:
print(f"Original Sentence: {train_sentences[0]}")
print(f"Token IDs: {train_encoded_input_ids[0]}")
print(f"Attention Mask: {train_encoded_input_attn_mask[0]}")
print(f"Label: {train_labels[0]}")
print(f'Decoded Token IDs: {tokenizer.decode(train_encoded_input_ids[0])}')

Original Sentence: The officer arrested the protester even though <mask> was not on duty.
Token IDs: tensor([    0,   133,  1036,  1128,     5, 26065,   190,   600, 50264,    21,
           45,    15,  4053,     4,     2,     2, 35369,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1])
Attention Mask: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Label: False
Decoded Token IDs: <s>The officer arrested the protester even though<mask> was not on duty.</s></s>them</s><pad><pad><pad><pad><pad><pad><pad><pad><

In [29]:
train_ds = TensorDataset(train_encoded_input_ids, train_encoded_input_attn_mask, train_label)
eval_ds = TensorDataset(val_encoded_input_ids, val_encoded_input_attn_mask, val_label)
test_ds= TensorDataset(test_encoded_input_ids, test_encoded_input_attn_mask, test_label)

In [31]:
config = {
    "eos_token" : "</s>>",
    "batch_size" : 8,
    "random_seed" : 2023
}

In [32]:
train_dataloader = DataLoader(
            train_ds,  
            sampler = RandomSampler(train_ds), 
            batch_size = config["batch_size"]
        )

validation_dataloader = DataLoader(
            eval_ds, 
            sampler = RandomSampler(eval_ds), 
            batch_size = config["batch_size"]
        )

test_dataloader = DataLoader(
            test_ds, 
            sampler = RandomSampler(test_ds), 
            batch_size = config["batch_size"]
        )

In [33]:
EPOCHS = 5
LR = 2e-5
EPS = 1e-8

optimizer = AdamW(model.parameters(), lr = LR, eps = EPS)
total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                           num_warmup_steps = 0, 
                                           num_training_steps = total_steps)



In [34]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [35]:
random.seed(config["random_seed"])
np.random.seed(config["random_seed"])
torch.manual_seed(config["random_seed"])
torch.cuda.manual_seed_all(config["random_seed"])

In [36]:
training_stats = []
epochs = EPOCHS

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))


        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2]

        model.zero_grad()        


        result = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask, 
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2]

        with torch.no_grad():        


            result = model(b_input_ids, 
                           token_type_ids=None, 
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...


KeyboardInterrupt: 

In [None]:
# pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')

df_stats

In [None]:
# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4, 5, 6, 7])

plt.show()

In [None]:
# Prediction on test set

# print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in tqdm(test_dataloader):
  # Add batch to GPU
  batch = tuple(t for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  pred_labels = np.argmax(logits, axis=1)
  label_ids = b_labels.numpy()
  
  # Store predictions and true labels
  predictions.extend(pred_labels.tolist())
  true_labels.extend(label_ids.tolist())

print('DONE.')

In [None]:
result_report= classification_report(true_labels, predictions, digits=3, output_dict=True)
result_report

In [None]:
confusion_matrix(true_labels, predictions)

In [None]:
model.save_pretrained("models/roberta-base-masked/")
tokenizer.save_pretrained("models/roberta-base-masked/")

#### XAI

In [None]:
def predictor1(texts):
    outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
    _tok_op = tokenizer(texts, return_tensors="pt", padding=True)
    # print(_tok_op)
    print(tokenizer.convert_ids_to_tokens(_tok_op.input_ids[0]))
    probas = F.softmax(outputs.logits, dim=1).detach().numpy()
    return probas

In [None]:
class_names = [True, False]

In [None]:
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
test_df = test_df.reset_index()

In [None]:
_str_to_predict, gender_word, gold_label = train_df.masked_sentence[1], train_df.gender_word[1], train_df.label[1]
_str_to_predict, gender_word, gold_label

In [None]:
exp = explainer.explain_instance(str_to_predict + '</s></s>' + gender_word + '</s>', predictor1, num_features=len(str_to_predict.split(" ")), num_samples=10)
# exp = explainer.explain_instance(str_to_predict + '</s></s>' + gender_word + '</s>', predictor1, num_features=3, num_samples=1000)
exp.show_in_notebook(text=str_to_predict)

In [None]:
_op = tokenizer.encode_plus(_str_to_predict.replace("<MASK>", "<mask>"), gender_word, max_length=64, return_tensors="pt", padding="max_length")
_op

In [None]:
tokenizer.convert_ids_to_tokens(_op.input_ids[0])

In [None]:
def predictor2(texts):
    outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
    _tok_op = tokenizer(texts, return_tensors="pt", padding=True)
    print(_tok_op)
    probas = F.softmax(outputs.logits, dim=1).detach().numpy()
    return probas

In [None]:
str_to_predict = _str_to_predict + '|' + gender_word
# exp = explainer.explain_instance(str_to_predict, predictor, num_features=len(str_to_predict.split(" ")), num_samples=100)
exp = explainer.explain_instance(str_to_predict, predictor2, num_features=5, num_samples=100)
exp.show_in_notebook(text=str_to_predict)

In [None]:
def predictor(texts):
  outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
  print(tokenizer.encode(texts, return_tensors="pt", padding=True))
  probas = F.softmax(outputs.logits, dim=1).detach().numpy()
  return probas

In [None]:
class_names = [False, True]

In [None]:
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
_str_to_predict, gender_word, gold_label = test_df.masked_sentence[0], test_df.gender_word[0], test_df.label[0]
_str_to_predict, gender_word, gold_label

In [None]:
_inp = 

In [None]:
tokenizer.decode(
    tokenizer.encode_plus(
    _str_to_predict,
    gender_word,
    return_tensors='pt',
    padding=True
).input_ids[0]
)

In [None]:
tokenizer.encode_plus(str_to_predict + '</s></s>' + gender_word + '</s>', return_tensors="pt", padding=True)

In [None]:
str_to_predict + '</s></s>' + gender_word + '</s>'

In [None]:
def predictor(texts):
    # print(texts)
    sent, gender_word = texts[0].split("|")[0], texts[0].split("|")[1]
    print(sent, gender_word)
    _model_input = tokenizer(sent, gender_word, return_tensors="pt", padding=True)
    print(_model_input)
    outputs = model()
    print(tokenizer.encode(texts, return_tensors="pt", padding=True))
    probas = F.softmax(outputs.logits, dim=1).detach().numpy()
    return probas

In [None]:
str_to_predict = _str_to_predict + '|' + gender_word
# exp = explainer.explain_instance(str_to_predict, predictor, num_features=len(str_to_predict.split(" ")), num_samples=100)
exp = explainer.explain_instance(str_to_predict, predictor, num_features=5, num_samples=100)
exp.show_in_notebook(text=str_to_predict)

#### Approach 4 - Finetuning with Gender Word Replacement

In [None]:
gender_words= ["he","she","they","his","her","their","him","them"]

In [None]:
_sentence, _label, _gender_word = [], [], []

for _, row in df.iterrows():
    for gw in gender_words:
        if row.gender_word == gw:
            _label.append(True)
        else:
            _label.append(False)
        
        _sentence.append(row.sentence.replace(" "+row.gender_word, " "+gw))
        _gender_word.append(row.gender_word)

In [None]:
sentence, label, gender_word = [], [], []

for i in range(len(_sentence)):
    if _label[i] == True:
        sentence.append(_sentence[i])
        gender_word.append(_gender_word[i])
        label.append(_label[i])
        
for i in range(len(_sentence)):
    if _label[i] == False and _sentence[i] not in sentence:
        sentence.append(_sentence[i])
        gender_word.append(_gender_word[i])
        label.append(_label[i])

In [None]:
len(sentence), len(label), len(gender_word)

In [None]:
extract_df = pd.DataFrame(zip(sentence, gender_word, label), columns=["sentence", "gender_word", "label"])
extract_df = extract_df.sample(frac=1, random_state=2023).reset_index(drop=True)
extract_df.head(5)

In [None]:
extract_df.sentence[0], extract_df.gender_word[0], extract_df.label[0]

In [None]:
extract_df.label.value_counts()

In [None]:
train_df = extract_df.groupby("label").sample(frac=0.6, random_state=2023)
val_test_df = extract_df.loc[extract_df.index.difference(train_df.index)]

val_df = val_test_df.groupby("label").sample(frac=0.6, random_state=2023)
test_df = val_test_df.loc[val_test_df.index.difference(val_df.index)]

print(train_df.shape, val_df.shape, test_df.shape)

In [None]:
train_df.label.value_counts()

In [None]:
val_df.label.value_counts()

In [None]:
test_df.label.value_counts()

In [None]:
def encode_input(sentences, gender_word, label, maxlen):

  encoded_input_ids = []
  encoded_input_attn_mask = []

  for i in tqdm(range(0, len(sentences))):
    encoded_input = tokenizer.encode_plus(sentences[i],
                                          gender_word[i],
                                        add_special_tokens=True,
                                        max_length=maxlen,
                                        return_attention_mask=True,
                                        return_tensors="pt",
                                        truncation=True,
                                        padding="max_length")
    
    encoded_input_ids.append(encoded_input.input_ids)
    encoded_input_attn_mask.append(encoded_input.attention_mask)

  encoded_input_ids = torch.cat(encoded_input_ids, dim=0)
  encoded_input_attn_mask = torch.cat(encoded_input_attn_mask, dim=0)
  label = torch.tensor(label, dtype=torch.long)

  return (encoded_input_ids, encoded_input_attn_mask, label)

In [None]:
train_sentences = train_df.sentence.values
val_sentences = val_df.sentence.values
test_sentences = test_df.sentence.values

train_gender_word = train_df.gender_word.values
val_gender_word = val_df.gender_word.values
test_gender_word = test_df.gender_word.values

train_labels = train_df.label.values
val_labels = val_df.label.values
test_labels = test_df.label.values

In [None]:
train_encoded_input_ids, train_encoded_input_attn_mask, train_label = encode_input(sentences = train_sentences, gender_word = train_gender_word, label = train_labels, maxlen = 64)
val_encoded_input_ids, val_encoded_input_attn_mask, val_label = encode_input(sentences = val_sentences,  gender_word = val_gender_word, label = val_labels, maxlen = 64)
test_encoded_input_ids, test_encoded_input_attn_mask, test_label = encode_input(sentences = test_sentences,  gender_word = test_gender_word, label = test_labels, maxlen = 64)

In [None]:
print(f"Original Sentence: {train_sentences[0]}")
print(f"Token IDs: {train_encoded_input_ids[0]}")
print(f"Attention Mask: {train_encoded_input_attn_mask[0]}")
print(f"Label: {train_labels[0]}")

In [None]:
tokenizer.decode(train_encoded_input_ids[0])

In [None]:
tokenizer.decode(tokenizer(train_sentences[0], train_gender_word[0]).input_ids)

In [None]:
train_ds = TensorDataset(train_encoded_input_ids, train_encoded_input_attn_mask, train_label)
eval_ds = TensorDataset(val_encoded_input_ids, val_encoded_input_attn_mask, val_label)
test_ds= TensorDataset(test_encoded_input_ids, test_encoded_input_attn_mask, test_label)

In [None]:
config = {
    "eos_token" : "[SEP]",
    "batch_size" : 8,
    "random_seed" : 2023
}

In [None]:
train_dataloader = DataLoader(
            train_ds,  
            sampler = RandomSampler(train_ds), 
            batch_size = config["batch_size"]
        )

validation_dataloader = DataLoader(
            eval_ds, 
            sampler = RandomSampler(eval_ds), 
            batch_size = config["batch_size"]
        )

test_dataloader = DataLoader(
            test_ds, 
            sampler = RandomSampler(test_ds), 
            batch_size = config["batch_size"]
        )

In [None]:
EPOCHS = 5
LR = 2e-5
EPS = 1e-8

optimizer = AdamW(model.parameters(), lr = LR, eps = EPS)
total_steps = len(train_dataloader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                           num_warmup_steps = 0, 
                                           num_training_steps = total_steps)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
random.seed(config["random_seed"])
np.random.seed(config["random_seed"])
torch.manual_seed(config["random_seed"])
torch.cuda.manual_seed_all(config["random_seed"])

In [None]:
training_stats = []
epochs = EPOCHS

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_train_loss = 0

    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))


        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2]

        model.zero_grad()        


        result = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask, 
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()

        # Perform a backward pass to calculate the gradients.
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)            
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    # Tracking variables 
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0]
        b_input_mask = batch[1]
        b_labels = batch[2]

        with torch.no_grad():        


            result = model(b_input_ids, 
                           token_type_ids=None, 
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits
            
        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

In [None]:
# pd.set_option('precision', 2)
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')

df_stats

In [None]:
# Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2, 3, 4, 5, 6, 7])

plt.show()

In [None]:
# Prediction on test set

# print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in tqdm(test_dataloader):
  # Add batch to GPU
  batch = tuple(t for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(b_input_ids, 
                     token_type_ids=None, 
                     attention_mask=b_input_mask,
                     return_dict=True)

  logits = result.logits

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  pred_labels = np.argmax(logits, axis=1)
  label_ids = b_labels.numpy()
  
  # Store predictions and true labels
  predictions.extend(pred_labels.tolist())
  true_labels.extend(label_ids.tolist())

print('DONE.')

In [None]:
result_report= classification_report(true_labels, predictions, digits=3, output_dict=True)
result_report

In [None]:
confusion_matrix(true_labels, predictions)

In [None]:
model.save_pretrained("models/roberta-base/")
tokenizer.save_pretrained("models/roberta-base/")

In [None]:
test_df.head(5)

In [None]:
test_df.label.value_counts()

#### XAI for SC on Finetuned RoBERTa-Base

In [None]:
def predictor(texts):
  outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
  print(tokenizer(texts, return_tensors="pt", padding=True))
  probas = F.softmax(outputs.logits, dim=1).detach().numpy()
  return probas

In [None]:
class_names = [False, True]

In [None]:
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
test_df.head(3)

In [None]:
str_to_predict, gender_word, gold_label = test_df.sentence[0], test_df.gender_word[0], test_df.label[0]
str_to_predict, gender_word, gold_label

In [None]:
# def text_tokenizer(sentence, gender_word):
#     return 

In [None]:
exp = explainer.explain_instance(str_to_predict + '</s></s>' + gender_word + '</s>', predictor, num_features=len(str_to_predict.split(" ")), num_samples=100)
# exp = explainer.explain_instance(str_to_predict, predictor, num_features=5, num_samples=100)
exp.show_in_notebook(text=str_to_predict)

#### XAI on DFs

In [None]:
def predictor(texts):
  outputs = model(**tokenizer(texts, return_tensors="pt", padding=True))
  probas = F.softmax(outputs.logits, dim=1).detach().numpy()
  return probas

In [None]:
class_names = [False, True]

In [None]:
explainer = LimeTextExplainer(class_names=class_names)

In [None]:
train_explain, val_explain, test_explain = [], [], []

for i in tqdm(range(len(train_sentences))): 
  sent = train_sentences[i] + '</s></s>' + train_gender_word[i] + '</s>'
  length = len(sent.split(" "))
  exp = explainer.explain_instance(sent, predictor, num_features=len(train_sentences[i].split(" ")), num_samples = 100)
  train_explain.append(exp.as_list())

In [None]:
with open('explain/train_explain', 'wb') as fp:
    pickle.dump(train_explain, fp)

In [None]:
for i in tqdm(range(len(val_sentences))): 
  sent = val_sentences[i] + '</s></s>' + val_gender_word[i] + '</s>'
  length = len(sent.split(" "))
  exp = explainer.explain_instance(sent, predictor, num_features=len(val_sentences[i].split(" ")), num_samples = 100)
  val_explain.append(exp.as_list())
  
with open('explain/val_explain', 'wb') as fp:
    pickle.dump(val_explain, fp)

In [None]:
for i in tqdm(range(len(test_sentences))): 
  sent = test_sentences[i] + '</s></s>' + test_gender_word[i] + '</s>'
  length = len(sent.split(" "))
  exp = explainer.explain_instance(sent, predictor, num_features=len(test_sentences[i].split(" ")), num_samples = 100)
  test_explain.append(exp.as_list())
  
with open('explain/test_explain', 'wb') as fp:
    pickle.dump(test_explain, fp)

In [None]:
train_explain[0]

In [None]:
with open ('explain/train_explain', 'rb') as fp:
    train_explain = pickle.load(fp)

with open ('explain/val_explain', 'rb') as fp:
    val_explain = pickle.load(fp)

with open ('explain/test_explain', 'rb') as fp:
    test_explain = pickle.load(fp)

### MC

In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForMultipleChoice.from_pretrained("roberta-base")

In [None]:
choices = gender_words
preds =[]

for _, row in df.iterrows():
    prompt = row.sentence.replace(" he", " <MASK>")
    encoding = tokenizer([prompt] * len(choices), choices, return_tensors="pt", padding=True)
    outputs = model(**{k:v.unsqueeze(0) for k, v in encoding.items()})
    pred = outputs.logits.argmax(dim=1)
    # print(F.softmax(outputs.logits, dim=1).detach().numpy())
    # print(prompt, choices[pred])
    preds.append(choices[pred])
    break

In [None]:
df["preds"] = preds

df.head(5)

In [None]:
df.preds.value_counts()

In [None]:
df.gender_word.value_counts()

### XAI

In [None]:
import lime
import torch.nn.functional as F
from lime.lime_text import LimeTextExplainer

In [None]:
def predictor(texts):
  encoding = tokenizer([texts] * len(choices), choices, return_tensors="pt", padding=True)
  outputs = model(**{k:v.unsqueeze(0) for k, v in encoding.items()})
  probas = F.softmax(outputs.logits, dim=1).detach().numpy()
  print(probas)
  return probas

In [None]:
gender_words= ["he","she","they","his","her","their","him","them"]

In [None]:
explainer = LimeTextExplainer(class_names=gender_words)

In [None]:
str_to_predict =  df.sentence[0].replace(" he", " <MASK>")
str_to_predict

In [None]:
exp = explainer.explain_instance(str_to_predict, predictor, num_features=2)
exp.show_in_notebook(text=str_to_predict)