In [1]:
import torch
import numpy as np
import pandas as pd
import os
import statistics
from transformers import BertTokenizer, BertForPreTraining, BertModel
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import plotly
import plotly.express as px

# Utility functions

In [2]:
def fix_dataset(dataset):
    """Drop the rows whose 'question1' or 'question2' value is not a string.

    Non-string values (for example the NaN that pandas produces for an empty
    CSV field) cannot be tokenized, so the offending rows are printed and
    removed.

    Args:
        dataset (pd.DataFrame): frame with the columns 'question1' and 'question2'.

    Returns:
        pd.DataFrame: `dataset` itself when every row is valid, otherwise a new
        frame without the invalid rows. The index is left untouched (it is not
        renumbered after the drop).
    """
    # The explicit bool dtype matters: without it an empty frame gives float arrays,
    # on which the boolean operators below raise a TypeError.
    q1_is_str = np.array([isinstance(q, str) for q in dataset['question1'].tolist()], dtype = bool)
    q2_is_str = np.array([isinstance(q, str) for q in dataset['question2'].tolist()], dtype = bool)

    # A row is invalid as soon as one of its two questions is not a string
    invalid_rows = ~(q1_is_str & q2_is_str)
    indexes_to_drop = dataset.index[invalid_rows]

    if not len(indexes_to_drop):
        print("All rows are corrects")
        return dataset

    # Show what is about to be removed, then drop those rows by index label
    print("Removing the following lines: ")
    print(dataset.loc[indexes_to_drop])
    return dataset.drop(indexes_to_drop)

## Import data

In [3]:
# Paths & Variables

data_path = "data/quora-question-pairs"
train_file = "train.csv"
test_pos_file = "test.csv"
label_file = "sample_submission.csv"

# Seed of the shuffle that builds the negative test pairs, so that the test set
# is identical after every Restart & Run All
NEGATIVE_SHUFFLE_SEED = 42

# Reading (the first column of each file becomes the index)
train = pd.read_csv(os.path.join(data_path, train_file), index_col = 0)
test_pos = pd.read_csv(os.path.join(data_path, test_pos_file), index_col = 0)
y_label = pd.read_csv(os.path.join(data_path, label_file), index_col = 0)

# Fix datasets for NaN values in question1 or question2
train = fix_dataset(train)
test_pos = fix_dataset(test_pos)

# Join the test questions with their labels on 'test_id'
test_pos = test_pos.join(y_label, on = 'test_id', how = 'left')

# The labelled test pairs are treated as positive-only (original author's note),
# so negative examples are created by shuffling question1 against question2.
test_neg = test_pos.copy()
shuffle_rng = np.random.default_rng(NEGATIVE_SHUFFLE_SEED)
test_neg['question1'] = shuffle_rng.permutation(test_neg['question1'].to_numpy())
test_neg['is_duplicate'] = 0

# Create final test set: original pairs first, shuffled pairs last, index renumbered from 0
test = pd.concat([test_pos, test_neg], ignore_index = True)


Removing the following lines: 
          qid1    qid2                         question1  \
id                                                         
105780  174363  174364    How can I develop android app?   
201841  303951  174364  How can I create an Android app?   
363362  493340  493341                               NaN   

                                                question2  is_duplicate  
id                                                                       
105780                                                NaN             0  
201841                                                NaN             0  
363362  My Chinese name is Haichao Yu. What English na...             0  
Removing the following lines: 
                                           question1  \
test_id                                                
379205      How I can learn android app development?   
817520   How real can learn android app development?   
943911                          How app dev

## Tokenization 

See the WordPiece method description (https://paperswithcode.com/method/wordpiece).

In [4]:
# Load the pretrained tokenizer of the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
# Display the ids of the tokenizer's special tokens
special_token_ids = [
    ("Token id for [CLS]: ", tokenizer.cls_token_id),
    ("Token id for [SEP]: ", tokenizer.sep_token_id),
    ("Token id for [PAD]: ", tokenizer.pad_token_id),
    ("Token id for [UNK]: ", tokenizer.unk_token_id),
    ("Token id for [MASK]: ", tokenizer.mask_token_id),
]
for prefix, token_id in special_token_ids:
    print(prefix + str(token_id))

# Round-trip the first training question through encode / decode
first_question = train.loc[0, 'question1']
print("Original sentense: " + first_question)
print("Encoded sentense: ")
enc = tokenizer.encode(first_question)
print(enc)
print("Decoded sentense: ")
dec = tokenizer.decode(enc)
print(dec)

# Token count of every training question (both columns pooled together)
all_s = train['question1'].tolist() + train['question2'].tolist()
list_len = [len(tokenizer.encode(s)) for s in all_s]

max_len = max(list_len)

# The two messages below are in French: they report the maximal token count
# and the percentage of tokenized questions that fit in 64 tokens.
print(f"La taille maximale de tokens est {max_len} (avec les [CLS] et [SEP])")
n_short = sum(1 for n_tokens in list_len if n_tokens <= 64)
lw_64 = round((n_short / len(list_len)) * 100, 2)
print(f"Il y a {lw_64}% des phrases tokenised qui sont <= 64. C'est suffisant, on supprimera celle plus grande du dataset")


Token id for [CLS]: 101
Token id for [SEP]: 102
Token id for [PAD]: 0
Token id for [UNK]: 100
Token id for [MASK]: 103
Original sentense: What is the step by step guide to invest in share market in india?
Encoded sentense: 
[101, 2054, 2003, 1996, 3357, 2011, 3357, 5009, 2000, 15697, 1999, 3745, 3006, 1999, 2634, 1029, 102]
Decoded sentense: 


2022-01-18 11:37:43.333556: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda-11.1/lib64:
2022-01-18 11:37:43.333579: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


[CLS] what is the step by step guide to invest in share market in india? [SEP]
La taille maximale de tokens est 286 (avec les [CLS] et [SEP])
Il y a 99.88% des phrases tokenised qui sont <= 64. C'est suffisant, on supprimera celle plus grande du dataset


# Data loading

See https://pytorch.org/tutorials/beginner/basics/data_tutorial.html for documentation on creating a `Dataset` and a `DataLoader`.

In [5]:
class SiameseNetWorkSentenceDataset(Dataset):
    """
    Dataset of tokenized question pairs for a siamese network.

    Args:
        data (pd.DataFrame): frame with the text columns 'question1' and 'question2' along with the label column 'is_duplicate'.
            Rows are addressed by position, so the frame may carry any index (for example one with gaps after rows were dropped).
        tokenizer: the BERT tokenizer, such as: BertTokenizer.from_pretrained('bert-base-uncased')
        max_length (int): the length of the token input vectors (default 64). Shorter sentences are padded to max_length
            and longer ones are truncated. The size includes the start [CLS] and end [SEP] tokens.

    Each item is a tuple (tokens1, tokens2, label): the tokenizer outputs for 'question1' and 'question2'
    and the 'is_duplicate' value as a tensor.
    """

    # Entries of the tokenizer output that carry a leading batch dimension to remove
    _TENSOR_KEYS = ("input_ids", "token_type_ids", "attention_mask")

    def __init__(self, data, tokenizer, max_length = 64):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return self.data.shape[0]

    def _encode(self, sentence):
        """Tokenize one sentence and drop the batch dimension of its tensors.

        With return_tensors = 'pt' the tokenizer returns tensors with a leading dimension of size 1,
        because it could handle a list of sentences. Only one sentence is sent at a time, and the
        DataLoader adds its own batch dimension, so the leading one is removed here.
        """
        tokens = self.tokenizer(text = sentence, max_length = self.max_length, padding = 'max_length', truncation = True, return_tensors = 'pt')
        for key in self._TENSOR_KEYS:
            # squeeze(0) removes only the batch axis; a bare squeeze would also
            # collapse the sequence axis when max_length is 1
            tokens.data[key] = tokens.data[key].squeeze(0)
        return tokens

    def __getitem__(self, index):
        # Positional access (.iloc): __len__ and the DataLoader sampler count rows by
        # position, which differs from the index labels once rows have been dropped.
        s1 = self.data['question1'].iloc[index]
        s2 = self.data['question2'].iloc[index]
        label = torch.tensor(self.data['is_duplicate'].iloc[index])

        tokens1 = self._encode(s1)
        tokens2 = self._encode(s2)

        return tokens1, tokens2, label


In [6]:
# Tokenizer of the 'bert-base-uncased' checkpoint, used to build the training dataset
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the training frame; every question is padded / truncated to 64 tokens
train_dataset = SiameseNetWorkSentenceDataset(train, tokenizer, 64)

# Inspect one item and the number of items
print(train_dataset[10])
print(len(train_dataset))

({'input_ids': tensor([  101,  4118,  2000,  2424,  8745,  1997, 29199,  2478, 10424,  2229,
        11877, 12170, 18098,  2964,  1029,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}, {'inpu

In [22]:
# Shuffled mini-batches of 8 question pairs; num_workers = 0 loads data in the main process
dataloader = DataLoader(
    dataset = train_dataset,
    batch_size = 8,
    shuffle = True,
    num_workers = 0,
)

# Model

In [32]:
class SiameseBERTNet(nn.Module):
    """Siamese sentence encoder: both questions go through the same BERT model
    and each one is reduced to a single vector by masked mean pooling."""

    # Token id of [SEP] in the 'bert-base-uncased' vocabulary
    SEP_TOKEN_ID = 102

    def __init__(self):
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.HS = self.bert.config.hidden_size

    def forward_siamese(self, input, noCLSpooling = True, noSEPpooling = True):
        """From a tokenized input sentence batch, compute the mean-pooled BERT embedding.

        Args:
            input (dict): output of the tokenizer; the entries 'input_ids' and 'attention_mask' are read
                (by key, so the order of the entries does not matter).
            noCLSpooling (bool): if True, the vector at position 0 ([CLS]) is left out of the average.
            noSEPpooling (bool): if True, the vectors at [SEP] positions are left out of the average.

        Returns:
            avg (tensor): Mean of the last hidden layer vectors over the pooled tokens, one row per sentence.
            A sentence without any pooled token gets a zero vector.
        """
        input_ids = input["input_ids"]
        attention_mask = input["attention_mask"]

        # Apply BERT and extract last_hidden_state
        out = self.bert(input_ids = input_ids, attention_mask = attention_mask)
        last_hidden_state = out.last_hidden_state

        # Work on a copy so that the caller's attention mask is never modified
        pooling_mask = attention_mask.detach().clone()

        # If the CLS output vector should not participate in average pooling
        if noCLSpooling:
            pooling_mask[:, 0] = 0

        # If the SEP output vector should not participate in average pooling
        if noSEPpooling:
            pooling_mask = pooling_mask.masked_fill(input_ids == self.SEP_TOKEN_ID, 0)

        # Trailing axis of size 1: broadcasting applies the mask to every hidden feature
        weights = pooling_mask.unsqueeze(-1).to(last_hidden_state.dtype)

        # Sum the vectors of the pooled tokens only
        sum_by_tks = torch.sum(last_hidden_state * weights, dim = 1)

        # Number of pooled tokens per sentence; at least 1 to avoid a 0/0 = NaN
        # when a sentence contains nothing but excluded tokens
        norm = weights.sum(dim = 1).clamp(min = 1)

        # Compute average
        avg = sum_by_tks / norm

        return avg


    def forward(self, question1, question2):
        """Embed both questions with the shared encoder and return the two embeddings."""
        out1 = self.forward_siamese(question1)
        out2 = self.forward_siamese(question2)

        return out1, out2

        

In [33]:
# Instantiate the siamese encoder; its constructor loads the pretrained 'bert-base-uncased' weights
model = SiameseBERTNet()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
# Smoke test: run one training batch through the model.
# The batch is deliberately not stored in `test`: that name holds the test
# DataFrame built in the data import cell and is read again by the embedding analysis.
batch = next(iter(dataloader))
model(batch[0], batch[1])

(tensor([[ 0.4299, -0.2941, -0.0675,  ..., -0.1354,  0.3433, -0.1303],
         [-0.2913, -0.3231,  0.3012,  ..., -0.2725,  0.4931, -0.2576],
         [ 0.0685, -0.0801,  0.4622,  ..., -0.4811, -0.0436,  0.0921],
         ...,
         [ 0.0229, -0.0395,  0.2465,  ..., -0.1411, -0.0208,  0.4178],
         [ 0.1411, -0.3166, -0.1117,  ..., -0.2730,  0.2925,  0.2480],
         [ 0.2395,  0.0070,  0.0455,  ..., -0.1985,  0.0763,  0.2774]],
        grad_fn=<DivBackward0>),
 tensor([[ 0.4390, -0.3143, -0.2842,  ...,  0.0353,  0.1169, -0.2412],
         [-0.3177, -0.2250,  0.3498,  ..., -0.4351,  0.3755, -0.2483],
         [-0.0074,  0.1461,  0.5292,  ..., -0.4827, -0.0031, -0.1812],
         ...,
         [-0.1986,  0.0688,  0.0607,  ..., -0.2054,  0.1053,  0.1306],
         [ 0.1292, -0.1880, -0.0761,  ..., -0.4151,  0.3113,  0.3144],
         [ 0.2097,  0.2462, -0.0774,  ..., -0.0744,  0.0023,  0.4150]],
        grad_fn=<DivBackward0>))

# Test BERT embedding without fine-tuning

In [34]:
# Extract a set of sample sentences from the test set:
# the first rows of `test` are the original pairs, the last rows the shuffled (negative) pairs

positives_s = test.head(n = 10)
negatives_s = test.tail(n = 10)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
all_s = pd.concat([positives_s, negatives_s], ignore_index = True)
all_s.to_csv("data/sample.csv", index = False)
d = SiameseNetWorkSentenceDataset(data = all_s, tokenizer = tokenizer, max_length = 64)
l = DataLoader(d, batch_size = 20, shuffle = False, num_workers = 0)
sample = next(iter(l))

# Inference only: no autograd graph is needed for the embeddings
with torch.no_grad():
    res = model(sample[0], sample[1])

# Stack the question1 embeddings on top of the question2 embeddings
res = torch.cat([res[0], res[1]], dim = 0)

# Compute PCA and project the embeddings on the first two principal directions
U, S, V = torch.pca_lowrank(res, niter = 50)
proj = torch.matmul(res, V[:, :2])

# Same row order as `res`: all question1 texts first, then all question2 texts
plot_data = pd.DataFrame(proj.detach().numpy())
plot_data["question"] = (all_s["question1"].tolist() + all_s["question2"].tolist())
plot_data = plot_data.rename({0: 'X', 1: 'Y'}, axis=1)

fig = px.scatter(plot_data, x = "X", y = "Y", hover_data=['question'])
fig.show()

# Compute the pairwise Euclidean (p = 2) distance matrix between all embeddings:
D = torch.cdist(res.double(), res.double(), p = 2)
df = pd.DataFrame(D.detach().numpy().round(3))
df.to_csv("data/NormEucDistanceMatrix_1.csv", index = False, header = False)

