<a href="https://colab.research.google.com/github/lorenzrossi/InfoRet_SemanticSimilarity/blob/main/InfoRet_Semantic_Similarity_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
from sklearn.metrics import classification_report, confusion_matrix

import opendatasets as op

import os

# NLTK
import nltk as nlp
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import ngrams

# PYTORCH
import torch
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig
import random
import gc

from google.colab import drive
# Mount Google Drive so that files stored there are reachable under /content/drive
drive.mount('/content/drive')

# Project folder on the mounted Drive.
# NOTE(review): the name `dir` shadows the Python builtin, and no cell visible
# in this notebook reads it - consider renaming or removing it.
dir = '/content/drive/MyDrive/Infor'

In [None]:
# Load the datasets
# TODO: the files should be loaded from Google Drive instead of being downloaded on every run.

# Download the GloVe 6B archive from the Stanford NLP server.
# NOTE(review): no cell visible in this notebook reads the extracted files - confirm it is still needed.
!wget https://nlp.stanford.edu/data/glove.6B.zip

# Unpack the archive into the current working directory.
!unzip glove.6B.zip

# Download the SNLI corpus from Kaggle through opendatasets.
op.download("https://www.kaggle.com/datasets/stanfordu/stanford-natural-language-inference-corpus")

In [None]:
# Fetch the NLTK stop-word corpus (`nlp` is the alias given to `nltk` in the import cell)
nlp.download('stopwords')
# Keep the English stop-word list in a module-level variable
stop_words = stopwords.words('english')

In [None]:
# Folder that contains the SNLI CSV files.
# `op.download(...)` in the download cell stores a Kaggle dataset in a directory
# named after the dataset slug, relative to the working directory. Replace this
# with a Google Drive path if the CSV files are kept there instead.
datasetFolder = "stanford-natural-language-inference-corpus"

In [None]:
# Read the three SNLI splits; the file names differ only in the split suffix
train_raw, test_raw, eval_raw = (
    pd.read_csv(os.path.join(datasetFolder, f"snli_1.0_{split}.csv"))
    for split in ("train", "test", "dev")
)

In [None]:
# Preview the first three rows of the raw training split
train_raw.head(3)

In [None]:
# Summary statistics of the raw training split
train_raw.describe()

In [None]:
# Summary statistics of the raw evaluation (dev) split
eval_raw.describe()

In [None]:
# Summary statistics of the raw test split
test_raw.describe()

In [None]:
# Report the shape and the per-column null counts of every raw split
for split_title, split_frame in (
    ('■ Train set: ', train_raw),
    ('\n■ Evaluation set: ', eval_raw),
    ('\n■ Test set:', test_raw),
):
    print(split_title + str(split_frame.shape))
    print(split_frame.isnull().sum())

In [None]:
# The three splits should expose the same set of gold labels
for split_frame in (train_raw, eval_raw, test_raw):
    print(split_frame['gold_label'].unique())

In [None]:
# Proportion of gold labels (train only), rounded to three decimals.
# The frame is built from the Series directly instead of going through
# reset_index().set_index('index'): since pandas 2.0, value_counts() names its
# result 'proportion' and its index 'gold_label', so neither an 'index' column
# nor a 'gold_label' value column exists after reset_index().
# The explicit rename / rename_axis give the same layout on every pandas
# version: labels in an index called 'index', shares in a 'gold_label' column.
ratio_gold = (
    train_raw['gold_label']
    .value_counts(normalize=True)
    .sort_index(ascending=False)
    .round(3)
    .rename('gold_label')
    .rename_axis('index')
    .to_frame()
)
ratio_gold

In [None]:
# Pie chart of the gold-label proportions.
# `colors` stays a module-level name because the box plot cell reuses the palette.
colors = sns.color_palette('pastel')
fig, ax = plt.subplots()
ax.pie(
    ratio_gold['gold_label'],
    labels=ratio_gold.index,
    colors=colors,
    autopct='%.1f%%',
    startangle=90,
)
ax.set_title('Proportion of the gold labels')
plt.show()

In [None]:
# Sentence lengths in words (train only), approximated as "number of spaces + 1"
# Sentence 1
train_sent1 = train_raw['sentence1'].str.count(' ').add(1).apply(int)
print('Sentence 1\n', train_sent1.describe().round(2))

# Sentence 2 (rows without a second sentence are skipped)
train_sent2 = train_raw['sentence2'].dropna().str.count(' ').add(1).apply(int)
print('\nSentence 2\n', train_sent2.describe().round(2))

In [None]:
# Box plot comparing the word-count distributions of sentence 1 and sentence 2
train_sentences = pd.DataFrame(dict(sentence1=train_sent1, sentence2=train_sent2))

box = sns.boxplot(data=train_sentences, palette=colors)
box.set(
    ylabel='Words in a sentence',
    title='Distribution of the lengths of sentences',
)
plt.show()

In [None]:
# Examples in sentence 1
# Shortest: draw one random row among those with the minimum word count
min_words_1 = train_sent1.min()
example1_min = train_sent1[train_sent1 == min_words_1].sample(1)
print('Min word count: ', min_words_1)
print('Example: ', train_raw.loc[example1_min.index, 'sentence1'])
print('\n')

# Longest: draw one random row among those with the maximum word count
max_words_1 = train_sent1.max()
example1_max = train_sent1[train_sent1 == max_words_1].sample(1)
print('Max word count: ', max_words_1)
print('Example: ', train_raw.loc[example1_max.index, 'sentence1'])

In [None]:
# Examples in sentence 2
# Shortest: draw one random row among those with the minimum word count
min_words_2 = train_sent2.min()
example2_min = train_sent2[train_sent2 == min_words_2].sample(1)
print('Min word count: ', min_words_2)
print('Example: ', train_raw.loc[example2_min.index, 'sentence2'])
print('\n')

# Longest: draw one random row among those with the maximum word count
max_words_2 = train_sent2.max()
example2_max = train_sent2[train_sent2 == max_words_2].sample(1)
print('Max word count: ', max_words_2)
print('Example: ', train_raw.loc[example2_max.index, 'sentence2'])

# PREPROCESSING

In [None]:
def _labelled_pairs(raw):
    """Return the rows of `raw` that have a real gold label, indexed by pairID.

    Rows whose gold label is "-" are dropped and only the gold label and the
    two sentences are kept as columns.
    """
    has_label = raw['gold_label'] != '-'
    kept_columns = ['pairID', 'gold_label', 'sentence1', 'sentence2']
    return raw.loc[has_label, kept_columns].set_index('pairID')


# Omit rows having the gold label "-" and irrelevant columns
train = _labelled_pairs(train_raw)
eval = _labelled_pairs(eval_raw)
test = _labelled_pairs(test_raw)

# Minimize the datasets for quick trials.
# .copy() detaches the training subset from the full frame, because later cells
# modify it (dropping rows, re-coding the label column).
train = train.iloc[:50000, :].copy()
# A fixed random_state keeps the evaluation and test subsets, and therefore the
# reported metrics, identical from one run to the next.
eval = eval.sample(100, random_state=42)
test = test.sample(100, random_state=42)

train.head(3)

In [None]:
# Drop training rows that have no second sentence
train = train.dropna(subset=['sentence2'])

# Recheck the number of null values in every split
for split_frame in (train, eval, test):
    print(split_frame.isnull().sum())

In [None]:
# Count the sentences that contain 'http', per split and per sentence column
for split_index, split_frame in enumerate((train, eval, test)):
    if split_index:
        # Blank line between the splits
        print()
    for column in ('sentence1', 'sentence2'):
        has_url = split_frame[column].str.contains('http')
        print(split_frame[column][has_url].count())

In [None]:
# Inspect training rows whose second sentence contains 'http'
# (the match is on the substring, so the sentence need not be a bare URL)
url_in_sentence2 = train['sentence2'].str.contains('http')
dropindex = train.index[url_in_sentence2]
train[url_in_sentence2]

In [None]:
# Remove the rows collected in `dropindex`, then confirm none are left
train = train.drop(index=dropindex)
remaining_url_rows = train[train['sentence2'].str.contains('http')]
print(remaining_url_rows.count())

In [None]:
# Integer coding for gold labels: neutral -> 0, entailment -> 1, contradiction -> 2
for split_frame in (train, eval, test):
    split_frame['gold_label'] = (
        split_frame['gold_label']
        .replace('neutral', 0)
        .replace('entailment', 1)
        .replace('contradiction', 2)
    )

train.head(3)

# Frequency analysis

In [None]:
# Download NLTK's 'popular' resource collection.
# NOTE(review): presumably this supplies the tokenizer and WordNet data used by tokenize() below - confirm.
nlp.download('popular')

In [None]:
def tokenize(sentence):
    """Split text into lower-cased, lemmatized, alphabetic non-stop-word tokens.

    Args:
        sentence: Text to process (a single sentence or a whole joined corpus).

    Returns:
        List of lemmatized tokens in their original order. Stop words and
        tokens containing non-alphabetic characters are removed.
    """
    # Load the stop words once per call and keep them in a set. Calling
    # stopwords.words('english') inside the filter would reload the list and
    # scan it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))

    # Lemmatization (become, becomes, becoming, became --> become)
    lemmatizer = WordNetLemmatizer()

    lowered_tokens = (token.lower() for token in word_tokenize(sentence))
    return [
        lemmatizer.lemmatize(token)
        for token in lowered_tokens
        if token not in english_stopwords and token.isalpha()
    ]

In [None]:
# Join every sentence of each column of the preprocessed training set into one
# long string, then tokenize each corpus
train_sentence1, train_sentence2 = (
    " ".join(train[column]) for column in ('sentence1', 'sentence2')
)

token_s1 = tokenize(train_sentence1)
token_s2 = tokenize(train_sentence2)

In [None]:
# Visualization of frequent words in the train dataset
# Word counts per sentence column, most frequent first
count_s1 = Counter(token_s1)
count_s2 = Counter(token_s2)
word_freq_s1 = pd.DataFrame(
    count_s1.items(), columns=['Word', 'Frequency']
).sort_values(by='Frequency', ascending=False)
word_freq_s2 = pd.DataFrame(
    count_s2.items(), columns=['Word', 'Frequency']
).sort_values(by='Frequency', ascending=False)

# Two side-by-side bar charts sharing the same x range
nb_ranking = 15
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 3))

for sentence_no, word_freq, axis in ((1, word_freq_s1, ax1), (2, word_freq_s2, ax2)):
    sns.barplot(x='Frequency', y='Word', data=word_freq.head(nb_ranking), ax=axis)
    axis.set(xlim=(0, 15000))
    axis.set_title(
        f'Top {nb_ranking} frequent words in Sentence {sentence_no}: n = {len(word_freq)}'
    )

plt.show()

# BERT

# Tokenization

In [None]:
# Tokenization using BERT Tokenizer.
# The checkpoint has to match the model that is fine-tuned later
# ('bert-base-uncased'): token IDs taken from a different vocabulary would
# index the wrong rows of the model's embedding table.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Sentence arrays of every split
train_s1 = train['sentence1'].values
train_s2 = train['sentence2'].values
eval_s1 = eval['sentence1'].values
eval_s2 = eval['sentence2'].values
test_s1 = test['sentence1'].values
test_s2 = test['sentence2'].values


def _pair_token_count(first, second):
    """Number of word-piece tokens in one sentence pair, special tokens excluded."""
    return len(tokenizer.tokenize(first)) + len(tokenizer.tokenize(second))


# Token count of every sentence pair in every split.
# Each split is scanned on its own: zipping the three splits together would
# stop at the shortest split and would add up the lengths of unrelated pairs.
max_len = [
    _pair_token_count(first, second)
    for firsts, seconds in (
        (train_s1, train_s2),
        (eval_s1, eval_s2),
        (test_s1, test_s2),
    )
    for first, second in zip(firsts, seconds)
]

max_length = max(max_len) + 3 # max length = Word tokens + 3 special tokens(1 [CLS] and 2 [SEP])

print('Max words: ', max_length)

# Word embedding

In [None]:
# Function to get word IDs, attention masks and token type IDs
def prep(sent1, sent2, label):
  """Encode sentence pairs into padded BERT inputs.

  Relies on the module-level `tokenizer` and `max_length`.

  Args:
    sent1: Iterable of first sentences (strings).
    sent2: Iterable of second sentences, aligned with `sent1`.
    label: Object exposing the labels through `.values` (e.g. a pandas Series).

  Returns:
    Tuple `(input_ids, attention_masks, sentence_ids, labels)`:
    `input_ids` and `attention_masks` are tensors with one row per pair,
    `sentence_ids` is a list holding one token-type tensor per pair, and
    `labels` is a tensor built from `label.values`.
  """
  input_ids = []
  attention_masks = []
  sentence_ids = []

  for first, second in zip(sent1, sent2):
    # Pass the two sentences as a pair so the tokenizer inserts [CLS]/[SEP]
    # itself and marks the second sentence in token_type_ids. Gluing them into
    # one string leaves every token type ID at 0.
    encoded_dict = tokenizer.encode_plus(
        first,
        second,
        add_special_tokens = True, # [CLS] first [SEP] second [SEP]
        max_length = max_length,
        padding = 'max_length', # Pad every pair to max_length
        truncation = True, # Keep all rows the same length even if a pair is too long
        return_attention_mask = True, # Make attention mask
        return_token_type_ids = True, # Distinguish sentence 1 and 2
        return_tensors = 'pt', # Return Pytorch tensors
        )

    # Get word ID
    input_ids.append(encoded_dict['input_ids'])

    # Get attention mask
    attention_masks.append(encoded_dict['attention_mask'])

    # Get token type ID (distinguish sentence 1 and 2)
    sentence_ids.append(encoded_dict['token_type_ids'])

  # Concatenate the per-pair tensors along the batch dimension (dim=0)
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)

  # Cast the labels to a tensor
  labels = torch.tensor(label.values)

  return input_ids, attention_masks, sentence_ids, labels

In [None]:
# Encode every split into token IDs, attention masks and label tensors.
# `sentence_ids` is overwritten by each call; only the last split's value remains.
# train
train_ids, train_masks, sentence_ids, train_labels = prep(
    sent1=train_s1, sent2=train_s2, label=train['gold_label']
)

# evaluation
eval_ids, eval_masks, sentence_ids, eval_labels = prep(
    sent1=eval_s1, sent2=eval_s2, label=eval['gold_label']
)

# test
test_ids, test_masks, sentence_ids, test_labels = prep(
    sent1=test_s1, sent2=test_s2, label=test['gold_label']
)

In [None]:
# Show the first training pair next to its encoded form
for caption, value in (
    ('Original sentence1: ', train_s1[0]),
    ('Original sentence2: ', train_s2[0]),
    ('Token IDs:', train_ids[0]),
    ('Attention mask:', train_masks[0]),
):
    print(caption, value)

# Training and evaluation

In [None]:
# Make tensor datasets
train_tensor = TensorDataset(train_ids, train_masks, train_labels)
eval_tensor = TensorDataset(eval_ids, eval_masks, eval_labels)
test_tensor = TensorDataset(test_ids, test_masks, test_labels)

# Data loader
batch_size = 50


def _make_loader(dataset, sampler_cls):
    """Build a DataLoader over `dataset` using `sampler_cls` and the shared batch size."""
    return DataLoader(dataset, sampler=sampler_cls(dataset), batch_size=batch_size)


# Training batches are drawn randomly; evaluation and test batches keep their order
train_dataloader = _make_loader(train_tensor, RandomSampler)
validation_dataloader = _make_loader(eval_tensor, SequentialSampler)
test_dataloader = _make_loader(test_tensor, SequentialSampler)

# Device selection and the BERT model load are done once, in the dedicated
# model-loading cell below. The repeated `from transformers import ...` line
# and the second `from_pretrained` call that used to sit here duplicated that
# cell and the import cell at the top of the notebook.
# NOTE(review): the next cell rebuilds these same datasets and loaders, so one
# of the two cells can be deleted.

In [None]:
# Wrap the encoded tensors of each split in a TensorDataset
train_tensor, eval_tensor, test_tensor = (
    TensorDataset(ids, masks, labels)
    for ids, masks, labels in (
        (train_ids, train_masks, train_labels),
        (eval_ids, eval_masks, eval_labels),
        (test_ids, test_masks, test_labels),
    )
)

# Number of examples per batch, shared by all loaders
batch_size = 50

# Training batches are drawn in random order
train_dataloader = DataLoader(
    train_tensor, batch_size=batch_size, sampler=RandomSampler(train_tensor)
)

# Evaluation and test batches keep the dataset order
validation_dataloader = DataLoader(
    eval_tensor, batch_size=batch_size, sampler=SequentialSampler(eval_tensor)
)
test_dataloader = DataLoader(
    test_tensor, batch_size=batch_size, sampler=SequentialSampler(test_tensor)
)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load a pre-trained BERT model with a 3-class classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    output_attentions=False,     # do not return attention weights
    output_hidden_states=False,  # do not return hidden states
)
model = model.to(device)

In [None]:
gc.collect()

# Train and evaluation
lr = 2e-5 # Learning rate
optimizer = AdamW(model.parameters(), lr=lr)

max_epoch = 50
train_loss_ = []
eval_loss_ = []

# Set the seed value all over the place to make this reproducible.
# NOTE(review): the model is created in an earlier cell, so any random
# initialisation done there is not covered by these seeds.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

for epoch in range(max_epoch):
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, max_epoch))
    # Training sequence
    print('Training...')
    model.train()
    train_loss = 0
    for batch in train_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        outputs = model(b_input_ids,
                        token_type_ids = None,
                        attention_mask = b_input_mask,
                        labels = b_labels)
        # Take the loss by position. Tuple-unpacking the return value only works
        # when the model returns a plain tuple; the dict-like output object of
        # recent transformers versions would yield its keys instead of tensors.
        # With labels supplied, element 0 is the cross entropy loss.
        loss = outputs[0]
        train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    avg_train_loss = train_loss / len(train_dataloader)
    train_loss_.append(round(avg_train_loss, 2))
    print('Epoch training loss: ', round(avg_train_loss, 2))
    print('')

    # Evaluation sequence
    print('Evaluating...')
    model.eval()
    val_loss = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad(): # no gradients are needed for evaluation
            outputs = model(b_input_ids,
                            token_type_ids = None,
                            attention_mask = b_input_mask,
                            labels = b_labels)
        # Same positional access as in the training loop
        loss = outputs[0]
        val_loss += loss.item()

    avg_val_loss = val_loss / len(validation_dataloader)
    eval_loss_.append(round(avg_val_loss, 2))
    print('Epoch evaluation loss: ', round(avg_val_loss, 2))
    print('')


In [None]:
# Plot the average training and evaluation loss per epoch
epochs = list(range(1, max_epoch + 1))

fig, ax = plt.subplots()
ax.plot(epochs, train_loss_, color='red', marker='o')
ax.plot(epochs, eval_loss_, color='green', marker='^')
# Epochs are whole numbers, so only integer ticks make sense on the x axis
ax.xaxis.set_major_locator(ticker.MaxNLocator(integer=True))
ax.set_title(f'Model loss\nEpoch = {max_epoch}')
ax.legend(['Train loss', 'Evaluation loss'])
ax.set_xlabel('Epoch')
ax.set_ylabel('Average cross entropy loss')
plt.show()

# Prediction and model performance checking

In [None]:
# Collect the model outputs and the true labels, batch by batch
prediction = []
true_labels = []

# Switch the data loader (use validation or test dataloader)
dataloader_mode = validation_dataloader
#dataloader_mode = test_dataloader

model.eval() # Turn off training mode
for batch in dataloader_mode:
    b_input_ids, b_input_mask, b_labels = (
        batch[0].to(device),
        batch[1].to(device),
        batch[2].to(device),
    )

    # Inference only: no gradients are tracked
    with torch.no_grad():
        preds = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
        )
        batch_logits = preds[0].detach().cpu().numpy()
        batch_labels = b_labels.detach().cpu().numpy()
        prediction.append(batch_logits)
        true_labels.append(batch_labels)

In [None]:
# Extract relevant information from prediction.
# The per-batch arrays are concatenated instead of being indexed with the size
# of the first batch (or with batch_size): the last batch of a loader can be
# smaller, and fixed-size indexing would then raise an IndexError.

# Logits: one row per example, one column per class
results = np.concatenate(prediction, axis=0)
logits_df = pd.DataFrame(results, columns=['logit_0', 'logit_1', 'logit_2'])

# Predicted label: index of the largest logit in each row
predicted_label = results.argmax(axis=1)
pred_df = pd.DataFrame(predicted_label, columns=['pred_label'])

# True labels, in the same order as the logits
true_labels2 = np.concatenate(true_labels, axis=0)
label_df = pd.DataFrame(true_labels2, columns=['true_label'])

In [None]:
# Put logits, predicted labels and true labels side by side in one frame
prediction_parts = [logits_df, pred_df, label_df]
preds_df = pd.concat(prediction_parts, axis='columns')
preds_df.head()

In [None]:
# Per-class precision / recall / F1 of the predictions

y_pred = preds_df['pred_label'].values
y_true = preds_df['true_label'].values

report_text = classification_report(y_true, y_pred, digits=2)
print(report_text)

In [None]:
# Heat map of the confusion matrix (rows: gold labels, columns: predictions)

cf_matrix = confusion_matrix(y_true, y_pred)
matrix = sns.heatmap(cf_matrix, annot=True)
# Put the prediction ticks above the matrix
matrix.xaxis.set_ticks_position('top')
matrix.set_xlabel('prediction')
matrix.set_ylabel('Gold label')
matrix.set_title('Confusion Matrix\n0=Neutral, 1=Entailment, 2=Contradiction')
plt.show()