<a href="https://colab.research.google.com/github/mz-zarei/SentenceReording/blob/main/DL2_ReorderSentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Importing required libraries and unzipping the data**

In [None]:
!pip install transformers
from transformers import BertTokenizer, BertForNextSentencePrediction
import torch
from torch.nn import functional as F
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import spearmanr
from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
from torch.nn.functional import softmax
import torch
import pickle
import itertools
from tqdm import tqdm
import random
from numba import jit, cuda 
import csv

# Adjust the path to rar file of the data
!unrar x "/content/drive/MyDrive/Colab Notebooks/SentenceReordering_DataChallenge2/DataChallenge2.rar"



In [None]:
import torch

# Choose the compute device once; later cells move tensors and models to `device`.
if not torch.cuda.is_available():
    # CPU fallback when no CUDA device is visible.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Run on the GPU and report which one was found.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


# **2. Uploading Train and Test Data** 
- Uploading data
- Create a new data set including sentence pairs and their labels
- Select a specified number of data points from the new data set, with balanced labels and fewer than `max_words` words in each sentence
- Tokenize the pairs and save the outputs as CSV files for model training. This step takes a long time, which is why the outputs are saved as CSV files that can be reused easily. Those files are large and I couldn't upload them to LEARN, but I can provide them if required. I used 3 sets of 1m sentence pairs to train three models, which takes a long time, but the code can also be tested with 5000 pairs.






In [None]:
TEST_FILE_NAME = 'test.pkl'
TRAIN_FILE_NAME = 'train.pkl'

# Load the pickled train/test sets. The `with` blocks close the file handles
# (the previous version leaked both of them).
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(TRAIN_FILE_NAME, 'rb') as infile:
    trainset = pickle.load(infile)
with open(TEST_FILE_NAME, 'rb') as infile:
    testset = pickle.load(infile)

# NOTE(review): these 100 documents are also inside trainset[:data_size], which
# is used to build the training pairs, so scores measured on val_data may be
# optimistic. Consider holding them out of training.
val_data = trainset[:100]


# path to folder with tokenized data and trained model
path = '/content/drive/MyDrive/Colab Notebooks/SentenceReordering_DataChallenge2/'

bert_type = 'bert-base-uncased'
max_words = 80  # max number of words allowed in a sentence in train set
max_length = 128  # max number of tokens allowed in an encoded sentence pair
batch_size = 32
epochs = 1
data_size = 5000   # number of documents read, and number of balanced pairs kept for training

In [None]:
# import matplotlib.pyplot as plt


# word_count = []
# for data in testset:
#     c = 0
#     for i in range(6):
#         if len(data['sentences'][i].split()) < 500:
#             c += len(data['sentences'][i].split())
#     word_count.append(c)
# plt.hist(word_count, bins=100)
# np.mean(word_count)


# word_count = []
# for data in testset:
#     c = 0
#     for i in range(6):
#         if len(data['sentences'][i].split()) > 80:
#             print(data)

In [None]:
# count = 0
# for data in testset:
#     for i in range(6):
#         if len(data['sentences'][i].split()) > max_words:
#             count +=1
# count

In [None]:
# Expand every document into its 30 ordered sentence pairs (6 * 5).
# Label 0: sentence B directly follows sentence A in the original text.
# Label 1: any other pair. Each document yields 5 pairs labelled 0 and 25 labelled 1.

pairs = list(itertools.permutations(range(6), 2))
data_set = []
for doc in tqdm(trainset[:data_size]):
    sents = doc['sentences']
    positions = doc['indexes']
    data_set.extend(
        [sents[a], sents[b], 0 if positions[b] == positions[a] + 1 else 1]
        for a, b in pairs
    )

# Shuffle so that the balanced subsample taken next is not ordered by document.
random.shuffle(data_set)
print(len(trainset))
print(len(data_set))



  0%|          | 0/5000 [00:00<?, ?it/s][A
100%|██████████| 5000/5000 [00:00<00:00, 42978.74it/s]


590226
150000


In [None]:
# Keep at most data_size/2 pairs per label, skipping any pair in which either
# sentence has max_words words or more.
data_balanced = []
count_0 = count_1 = 0
per_label_quota = data_size/2
for data in tqdm(data_set):
    first, second, label = data
    if len(first.split()) >= max_words or len(second.split()) >= max_words:
        continue
    if label == 0 and count_0 < per_label_quota:
        count_0 += 1
        data_balanced.append(data)
    elif label != 0 and count_1 < per_label_quota:
        count_1 += 1
        data_balanced.append(data)

# Free the large intermediates; only data_balanced is used from here on.
del data_set
del trainset
print(len(data_balanced))


  0%|          | 0/150000 [00:00<?, ?it/s][A
 17%|█▋        | 25424/150000 [00:00<00:00, 254225.70it/s][A
 35%|███▍      | 51997/150000 [00:00<00:00, 257569.84it/s][A
 52%|█████▏    | 77577/150000 [00:00<00:00, 257034.92it/s][A
 70%|██████▉   | 104420/150000 [00:00<00:00, 260349.18it/s][A
100%|██████████| 150000/150000 [00:00<00:00, 263521.45it/s]

5000





In [None]:
# count_words = 0
# count_sentence = 0

# for data in tqdm(data_set):
#     if len(data[0].split()) > 50 :
#         count_words += 1
# count_words       

In [None]:
# Tokenize all of the sentence pairs and map the tokens to their word IDs.
tokenizer = BertTokenizer.from_pretrained(bert_type)

input_ids = []
attention_masks = []
token_type_ids =[]
labels = []

# For every data point...
for data in tqdm(data_balanced):

    s1, s2, label = data[0], data[1], data[2]
    encoded_dict = tokenizer.encode_plus(
                        s1, text_pair = s2,             # Two sentences to encode.
                        add_special_tokens = True,      # Add '[CLS]' and '[SEP]'
                        max_length = max_length,        # Target length for padding/truncation.
                        padding = 'max_length',         # Replaces the deprecated pad_to_max_length=True.
                        truncation = True,              # Cut pairs longer than max_length.
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',          # Return pytorch tensors.
                   )

    # Add the encoded sentence pair to the list.
    input_ids.append(encoded_dict['input_ids'])
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])
    # And its token type ids (which sentence is first).
    token_type_ids.append(encoded_dict['token_type_ids'])
    # Get labels
    labels.append(label)


# Convert the lists into tensors (each encoding is 1 x max_length, so
# concatenating on dim 0 stacks one row per pair).
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
token_type_ids = torch.cat(token_type_ids, dim=0)
labels = torch.Tensor(labels)


# Save csv files of tokenized data so training can be re-run without re-tokenizing.
input_ids_df = pd.DataFrame(input_ids.numpy())
input_ids_df.to_csv(path + 'input_ids_new.csv', index=False)

attention_masks_df = pd.DataFrame(attention_masks.numpy())
attention_masks_df.to_csv(path + 'attention_masks_new.csv', index=False)

token_type_ids_df = pd.DataFrame(token_type_ids.numpy())
token_type_ids_df.to_csv(path + 'token_type_ids_new.csv', index=False)

labels_df = pd.DataFrame(labels.numpy())
labels_df.to_csv(path + 'labels_new.csv', index=False)



# # Print sentence 0, now as a list of IDs.
# print('Original: ', data_balanced[0])
# print('Token IDs:', input_ids[0])
# print('Token type IDs:', token_type_ids[0])
# print('label:', labels[0])


# **3. Fine-tuning BertForNextSentencePrediction**
In this part, a BERT model is fine-tuned on the balanced data set from the previous step. This is mainly based on the tutorial.

In [None]:
from torch.utils.data import TensorDataset, random_split


# Reload the tokenized data saved by the tokenization cell. The tensors are
# built directly as int64 instead of round-tripping through float32.
input_ids_df = pd.read_csv(path + 'input_ids_new.csv')
input_ids = torch.tensor(input_ids_df.values[:data_size], dtype=torch.int64)

attention_masks_df = pd.read_csv(path + 'attention_masks_new.csv')
attention_masks = torch.tensor(attention_masks_df.values[:data_size], dtype=torch.int64)

token_type_ids_df = pd.read_csv(path + 'token_type_ids_new.csv')
token_type_ids = torch.tensor(token_type_ids_df.values[:data_size], dtype=torch.int64)

# Labels were written as a single column; transpose so row 0 holds all of them.
labels_df = pd.read_csv(path + 'labels_new.csv').T
labels = torch.tensor(labels_df.values[0][:data_size], dtype=torch.int64)


# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, token_type_ids, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples. A dedicated, seeded
# generator makes the split reproducible; the global seeds are only set later,
# in the training cell.
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

# Print sentence 0, now as a list of IDs.
# print('Original: ', data_balanced[0])
# print('Token IDs:', input_ids[0])
# print('Token type IDs:', token_type_ids[0])
# print('label:', labels[0])


4,500 training samples
  500 validation samples


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForNextSentencePrediction, BertConfig
from transformers import get_linear_schedule_with_warmup


train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )


model = BertForNextSentencePrediction.from_pretrained(
    bert_type,
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device selected earlier. model.cuda() would crash on
# the CPU fallback path.
model.to(device)


# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 keeps the default of the transformers implementation.
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8, # args.adam_epsilon  - default is 1e-8.
                  weight_decay = 0.0
                )


# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import numpy as np

# Fraction of rows whose arg-max prediction equals the reference label.
def flat_accuracy(preds, labels):
    """Return the accuracy of `preds` against `labels`.

    preds:  2-D array of per-class scores, one row per example.
    labels: array of integer class ids; flattened before the comparison.
    """
    predicted = np.argmax(preds, axis=1).ravel()
    expected = labels.ravel()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

import time
import datetime

def format_time(elapsed):
    """Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second first; durations of a day
    or more get the usual timedelta 'N day(s), ' prefix.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))


In [None]:
import random
import numpy as np



# Seed every RNG in use so that batch order and dropout are repeatable.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


# One dict of statistics per epoch.
training_stats = []

total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Batch layout follows the TensorDataset: ids, attention mask, token types, labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_input_type = batch[2].to(device)
        b_labels = batch[3].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        result = model(b_input_ids,
                       token_type_ids=b_input_type,
                       attention_mask=b_input_mask,
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    # Tracking variables. Accuracy is accumulated as a count of correct
    # examples so that a smaller final batch is weighted by its real size.
    total_eval_correct = 0.0
    total_eval_examples = 0
    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_input_type = batch[2].to(device)
        b_labels = batch[3].to(device)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            result = model(b_input_ids,
                           token_type_ids=b_input_type,
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        total_eval_loss += result.loss.item()

        logits = result.logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        batch_examples = len(label_ids)
        total_eval_correct += flat_accuracy(logits, label_ids) * batch_examples
        total_eval_examples += batch_examples

    # Report the final accuracy for this validation run (per example, not per batch).
    avg_val_accuracy = total_eval_correct / total_eval_examples
    print("")
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )


print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...
  Batch    50  of    141.    Elapsed: 0:00:20.
  Batch   100  of    141.    Elapsed: 0:00:39.

  Average training loss: 0.46
  Training epcoh took: 0:00:55

Running Validation...

  Accuracy: 0.72
  Validation Loss: 0.55
  Validation took: 0:00:02

Training complete!
Total training took 0:00:57 (h:mm:ss)


In [None]:

# Bundle the full model object, its weights, the optimizer state and the last
# average training loss into one checkpoint file under `path`.
checkpoint = {
    'model': model,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': avg_train_loss,
}
torch.save(checkpoint, path + 'checkpoint-new.pt')


# **4. Predicting Order of Test Data With Three Models**
I used three BERT models, each fine-tuned on a different portion of the given data set. I couldn't upload the three models to LEARN because each is about 400 MB, but I can share a Google Drive link if required. The outputs of all three models are used to score the 720 permutations of the 6 sentences of each test item, and the order with the maximum score (the sum of the 5 scores of the 5 adjacent pairs in a given order) is selected as the best order. This process takes about an hour with Colab Pro. For example, for an order of the given 6 sentences, [s1,s3,s0,s4,s2,s5], the score from one BERT model is computed as BERT[s1,s3]+BERT[s3,s0]+BERT[s0,s4]+BERT[s4,s2]+BERT[s2,s5]. Using only one of the models gives a Spearman correlation of around 0.80, but using all three models in a bagging fashion improves it to 0.812 on the test and validation sets.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Load the three fine-tuned BertForNextSentencePrediction models.
# map_location=device lets checkpoints saved on a GPU load on the CPU fallback.
# NOTE: these files hold whole pickled model objects, and unpickling can run
# arbitrary code; only load checkpoints you trust.
model1 = torch.load('/content/model1.pt', map_location=device)
model2 = torch.load('/content/model2.pt', map_location=device)
model3 = torch.load('/content/model3.pt', map_location=device)

# Inference only: move each model to the device and switch off dropout.
for nsp_model in (model1, model2, model3):
    nsp_model.to(device)
    nsp_model.eval()

test_order_list = []   # this is final answer

# Scores a sentence pair with the three fine-tuned BERT models loaded above.
def score_pair(s1,s2):
    """Return the summed "s2 follows s1" probability from model1, model2 and model3.

    Each model contributes the softmax probability of class 0, which is the
    label the training pairs use for "B is the continuation of A". The result
    therefore lies in [0, 3].
    """
    # Encode exactly as during training: pad and truncate to max_length.
    encoding = tokenizer.encode_plus(
        s1, text_pair = s2,
        add_special_tokens = True,
        max_length = max_length,
        padding = 'max_length',   # replaces the deprecated pad_to_max_length=True
        truncation = True,        # over-long pairs are cut instead of overflowing max_length
        return_tensors = 'pt',
    ).to(device)

    total = 0.0
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # Sum over all three models. The previous version added score1 twice
        # and never used model2's score.
        for nsp_model in (model1, model2, model3):
            probs = softmax(nsp_model(**encoding).logits, dim = 1)
            total += probs[0, 0].item()

    return total


all_pairs = list(itertools.permutations([0,1,2,3,4,5],2))
orders = list(itertools.permutations([0,1,2,3,4,5]))

# order[i] is the position given to sentence i, so order.index(k) is the
# sentence placed at position k. Precompute the 5 adjacent (first, second)
# sentence pairs of every candidate order once, not once per document.
order_pairs = [
    [(order.index(k), order.index(k + 1)) for k in range(5)]
    for order in orders
]

for ID in tqdm(range(len(testset))):
    sentences = testset[ID]
    texts = sentences['sentences']

    # the BERT score for each ordered pair of sentences (30 model calls per document)
    pair_score_dict = {
        pair: score_pair(texts[pair[0]], texts[pair[1]]) for pair in all_pairs
    }

    # Pick the order whose adjacent pairs have the highest total score.
    # Starting from -inf guarantees a best order is always selected.
    score = float('-inf')
    best_ord = []
    for order, pairs in zip(orders, order_pairs):
        score_ord = sum(pair_score_dict[pair] for pair in pairs)
        if score_ord > score:
            best_ord = order
            score = score_ord

    test_order_list.append(best_ord)

pd.DataFrame(test_order_list).to_excel('test.xlsx')

# **5. Predicting Order of Validation Data With One Model**
Here the trained model can be evaluated on the validation set. With just 5000 training pairs you should get a Spearman correlation of about 0.7 on the validation set.


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Load the single fine-tuned BertForNextSentencePrediction model from the
## checkpoint written after training.
# map_location=device lets a checkpoint saved on a GPU load on the CPU fallback.
# NOTE: the checkpoint holds a whole pickled model object; only load trusted files.
model = torch.load(path + 'checkpoint-new.pt', map_location=device)['model']


model.to(device)
model.eval()

val_order_list = []
sr = []      # Spearman correlation for each validation item

def score_pair(s1,s2):
    """Return the probability, according to `model`, that s2 directly follows s1.

    This is the softmax probability of class 0, the label the training pairs
    use for "B is the continuation of A".
    """
    # Encode exactly as during training: pad and truncate to max_length.
    encoding = tokenizer.encode_plus(
        s1, text_pair = s2,
        add_special_tokens = True,
        max_length = max_length,
        padding = 'max_length',   # replaces the deprecated pad_to_max_length=True
        truncation = True,        # over-long pairs are cut instead of overflowing max_length
        return_tensors = 'pt',
    ).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**encoding)
    sx = softmax(outputs.logits, dim = 1)

    return sx[0, 0].item()


all_pairs = list(itertools.permutations([0,1,2,3,4,5],2))
orders = list(itertools.permutations([0,1,2,3,4,5]))

# order[i] is the position given to sentence i, so order.index(k) is the
# sentence placed at position k. Precompute the 5 adjacent (first, second)
# sentence pairs of every candidate order once, not once per document.
order_pairs = [
    [(order.index(k), order.index(k + 1)) for k in range(5)]
    for order in orders
]


for ID in tqdm(range(len(val_data))):
    sentences = val_data[ID]
    texts = sentences['sentences']

    # the BERT score for each ordered pair of sentences (30 model calls per document)
    pair_score_dict = {
        pair: score_pair(texts[pair[0]], texts[pair[1]]) for pair in all_pairs
    }

    # Pick the order whose adjacent pairs have the highest total score.
    # Starting from -inf guarantees a best order is always selected.
    score = float('-inf')
    best_ord = []
    for order, pairs in zip(orders, order_pairs):
        score_ord = sum(pair_score_dict[pair] for pair in pairs)
        if score_ord > score:
            best_ord = order
            score = score_ord

    # Compare the predicted positions with the true positions of this document.
    sr.append(spearmanr(sentences['indexes'], best_ord)[0])
    val_order_list.append(best_ord)



print("")
print(np.mean(sr))

# **6. Predicting Order of Test Data With One Model**
Here either the model trained above or the submitted model trained on 1m pairs (model3.pt in the submission folder) can be used on the test set.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## Load one fine-tuned BertForNextSentencePrediction model: either the
## checkpoint trained in this notebook (commented line) or the submitted model3.pt.
# map_location=device lets a model saved on a GPU load on the CPU fallback.
# NOTE: the file holds a whole pickled model object; only load trusted files.

# model = torch.load(path + 'checkpoint-new.pt', map_location=device)['model']
model = torch.load(path + 'model3.pt', map_location=device)


model.to(device)
model.eval()

test_order_list = []
sr = []      # Spearman score list (not filled in this section: the test set is scored without reference indexes)

def score_pair(s1,s2):
    """Return the probability, according to `model`, that s2 directly follows s1.

    This is the softmax probability of class 0, the label the training pairs
    use for "B is the continuation of A".
    """
    # Encode exactly as during training: pad and truncate to max_length.
    encoding = tokenizer.encode_plus(
        s1, text_pair = s2,
        add_special_tokens = True,
        max_length = max_length,
        padding = 'max_length',   # replaces the deprecated pad_to_max_length=True
        truncation = True,        # over-long pairs are cut instead of overflowing max_length
        return_tensors = 'pt',
    ).to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**encoding)
    sx = softmax(outputs.logits, dim = 1)

    return sx[0, 0].item()


all_pairs = list(itertools.permutations([0,1,2,3,4,5],2))
orders = list(itertools.permutations([0,1,2,3,4,5]))

# order[i] is the position given to sentence i, so order.index(k) is the
# sentence placed at position k. Precompute the 5 adjacent (first, second)
# sentence pairs of every candidate order once, not once per document.
order_pairs = [
    [(order.index(k), order.index(k + 1)) for k in range(5)]
    for order in orders
]


for ID in tqdm(range(len(testset))):
    sentences = testset[ID]
    texts = sentences['sentences']

    # the BERT score for each ordered pair of sentences (30 model calls per document)
    pair_score_dict = {
        pair: score_pair(texts[pair[0]], texts[pair[1]]) for pair in all_pairs
    }

    # Pick the order whose adjacent pairs have the highest total score.
    # Starting from -inf guarantees a best order is always selected.
    score = float('-inf')
    best_ord = []
    for order, pairs in zip(orders, order_pairs):
        score_ord = sum(pair_score_dict[pair] for pair in pairs)
        if score_ord > score:
            best_ord = order
            score = score_ord
    test_order_list.append(best_ord)

pd.DataFrame(test_order_list).to_excel('test.xlsx')