## ReBART Implementation
https://github.com/fabrahman/ReBART

The implementation below is adapted from the repository linked above, which accompanies the paper cited below. It has been altered to fit our use case for the data challenge.

## CITATION:

@inproceedings{Basu-brahman-chaturvedi-rebart,
    title = "Is Everything in Order? A Simple Way to Order Sentences",
    author = "Somnath Basu Roy Chowdhury and Faeze Brahman and
      Snigdha Chaturvedi",
    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
    month = nov,
    year = "2021",
    publisher = "Association for Computational Linguistics",
}



In [None]:
# Install the dependency versions this notebook was written against.
# Every package is pinned to an exact version except pickle5.
!pip3 install pickle5
!pip3 install absl-py==0.13.0
!pip3 install pandas==1.3.3
!pip3 install regex==2021.4.4
!pip3 install requests==2.26.0
!pip3 install rouge-score==0.0.4
!pip3 install sacremoses==0.0.45
!pip3 install scipy==1.7.0
!pip3 install sentence-transformers==2.0.0
!pip3 install sentencepiece==0.1.96
!pip3 install tokenizers==0.8.0rc4
!pip3 install torch==1.9.0
!pip3 install torchaudio==0.9.0
!pip3 install torchvision==0.10.0
!pip3 install tqdm==4.62.3
!pip3 install transformers==3.0.0

# Import Packages

In [None]:
# Packages
import numpy as np
import torch
from google.colab import drive
import pandas as pd
import os
import json
from tqdm import tqdm
import scipy

import argparse
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss
import torch.nn as nn
from transformers import AutoModelWithLMHead, AutoTokenizer
from torch.utils.data import RandomSampler, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange

# Set up Google Drive

In [None]:
# Mount Google Drive at /content/drive; the data folder used below lives under it
drive.mount('/content/drive')

# Train on GPU if applicable

In [None]:
# Select the first CUDA device when one is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # check if a GPU is available

print(device)
# List the GPUs visible to this runtime
!nvidia-smi -L

In [None]:
# Set seed for reproducibility: seeds NumPy's global RNG and PyTorch's RNG with the same value
np.random.seed(940101)
torch.manual_seed(940101)

# Read the Data

In [None]:
# Project folder on the mounted Drive, its data sub-folder, and the JSONL file names used below
FOLDER_PATH = '/content/drive/MyDrive/Colab Notebooks/data-challenge-2'
DATA_PATH = os.path.join(FOLDER_PATH, 'data')
TRAIN_JSONL_NAME = 'train_new.jsonl'
TEST_JSONL_NAME = 'test_new.jsonl'
GITHUB_JSONL_EXAMPLE_NAME = "github_test_check.jsonl"

# Load the train and test pickled data
#   - key: id
#   - value: [list of sentences, array giving the ordering of the sentences]
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
train_dict = pd.read_pickle(os.path.join(DATA_PATH, "train.pickle"))
test_dict = pd.read_pickle(os.path.join(DATA_PATH, "test.pickle"))

## Match implementation input data
- The sample code requires a JSONL file in which every line is one JSON object of the following form:

  
[
    {"orig_sents": [orders...],
     "shuf_sents": [sentences...]
     }, ...
] 


In [None]:
def write_to_jsonl(data, sample_size = 10, is_test = False):
    """Write the first ``sample_size`` examples of ``data`` to a JSONL file.

    Every output line is a JSON object with the keys ``orig_sents`` and
    ``shuf_sents``, matching the format of the example file from the GitHub page.

    Args:
        data: Mapping indexed by the consecutive integers ``0 .. len(data) - 1``.
            For training data each value unpacks into ``(sentences, orders)``;
            for test data only ``value[0]`` (the sentences) is read.
        sample_size: Number of leading examples to write. A falsy value
            (``None`` or ``0``) means "all examples". Values larger than
            ``len(data)`` are clamped instead of raising ``KeyError``.
        is_test: When True, write ``TEST_JSONL_NAME`` with an empty
            ``orig_sents`` list per example; otherwise write ``TRAIN_JSONL_NAME``
            with the orderings converted to strings.

    Returns:
        None. The file is written into ``DATA_PATH``.
    """
    # A falsy sample_size selects everything; never index past the available ids
    if not sample_size:
        sample_size = len(data)
    sample_size = min(sample_size, len(data))

    records = []

    if not is_test:
        file_name = TRAIN_JSONL_NAME
        # Apply the same format as the example on github page
        for i in range(sample_size):
            sentences, orders = data[i]
            records.append(
                {
                    "orig_sents": [str(x) for x in orders],
                    "shuf_sents": sentences,
                }
            )
    else:
        file_name = TEST_JSONL_NAME
        # Test dataset has no known ordering, so orig_sents defaults to an empty list
        for i in range(sample_size):
            records.append({"orig_sents": [], "shuf_sents": data[i][0]})

    # One JSON object per line; UTF-8 to match the encoding load_data reads with
    with open(os.path.join(DATA_PATH, file_name), 'w', encoding="utf-8") as f:
        for item in records:
            f.write(json.dumps(item) + "\n")

In [None]:
# Convert the full train/test dictionaries into jsonl format (sample_size=None writes every example):
write_to_jsonl(train_dict, sample_size = None)
write_to_jsonl(test_dict, sample_size = None, is_test = True)

# Read the sample jsonl given in the github repo, for comparison with our files
with open(os.path.join(DATA_PATH, GITHUB_JSONL_EXAMPLE_NAME)) as f:
    json_list = list(f)

# Read back the train file that was just written
with open(os.path.join(DATA_PATH, TRAIN_JSONL_NAME)) as f:
    train_json_list = list(f)

# Read back the test file to check that it opens
with open(os.path.join(DATA_PATH, TEST_JSONL_NAME)) as f:
    test_json_list = list(f)

# DEBUG: print the number of lines and the first raw line of each file
print(len(json_list), json_list[0])
print(len(train_json_list), train_json_list[0])
print(len(test_json_list), test_json_list[0])

In [None]:
def init_model(model_name, device, do_lower_case=False, args=None):
    """Load a pretrained tokenizer and LM-head model, ready for inference.

    Args:
        model_name: Name or path handed to both ``from_pretrained`` calls.
        device: Device the model is moved to.
        do_lower_case: Forwarded to the tokenizer constructor.
        args: Accepted for interface compatibility; not read here.

    Returns:
        ``(tokenizer, model)`` with the model on ``device`` and in eval mode.
    """
    pretrained_tokenizer = AutoTokenizer.from_pretrained(
        model_name, do_lower_case=do_lower_case
    )
    pretrained_model = AutoModelWithLMHead.from_pretrained(model_name)

    # Move to the requested device (GPU if applicable) and switch to eval mode
    pretrained_model.to(device)
    pretrained_model.eval()

    return pretrained_tokenizer, pretrained_model

In [None]:
def load_data(in_file, task="in_shuf", is_test = False):
    """Read a JSONL file and build the text examples fed to the tokenizer.

    Each line of ``in_file`` is a JSON object with the keys ``shuf_sents`` and
    ``orig_sents``. Blank lines are skipped.

    Args:
        in_file: Path of the JSONL file (read as UTF-8).
        task: ``"index_with_sep"`` prefixes every shuffled sentence with a
            ``<S{i}>`` marker giving its position; any other value joins the
            sentences with single spaces without markers.
        is_test: When True only the input text is produced (``orig_sents`` is
            not used).

    Returns:
        A list with one tuple per line: ``(input_text, target_text)`` for
        training data, ``(input_text,)`` for test data. ``input_text`` has the
        form ``"[shuffled] ... [orig]"`` and ``target_text`` ends in ``" <eos>"``.
    """
    eos_marker = "<eos>"

    def _strip_eos(text):
        # Drop a trailing "<eos>" as a whole suffix. str.rstrip(' <eos>') treats its
        # argument as a character set and would also eat trailing letters such as
        # the "oes" of "goes".
        text = text.rstrip()
        if text.endswith(eos_marker):
            text = text[: -len(eos_marker)].rstrip()
        return text

    def _flatten(field):
        # A field is either an already-joined string or a list of strings
        return _strip_eos(field) if isinstance(field, str) else " ".join(field)

    def _with_markers(indexed_sentences):
        # "<S0> first sentence <S1> second sentence ..."
        return " ".join(" ".join((f"<S{i}>", sent)) for i, sent in indexed_sentences)

    # Read each jsonl element and append to list
    all_lines = []
    with open(in_file, "r", encoding="utf-8") as f:
        for raw_line in f:
            if raw_line.strip():
                all_lines.append(json.loads(raw_line))

    use_markers = task == "index_with_sep"

    if is_test:
        if use_markers:
            # Number every sentence that is present instead of assuming exactly five
            return [
                (f"[shuffled] {_with_markers(enumerate(line['shuf_sents']))} [orig]",)
                for line in all_lines
            ]
        return [
            (f"[shuffled] {_flatten(line['shuf_sents'])} [orig]",)
            for line in all_lines
        ]

    if use_markers:
        return [
            (
                f"[shuffled] {_with_markers(zip(range(len(line['orig_sents'])), line['shuf_sents']))} [orig]",
                f"{' '.join(line['orig_sents'])} <eos>",
            )
            for line in all_lines
        ]
    return [
        (
            f"[shuffled] {_flatten(line['shuf_sents'])} [orig]",
            f"{_flatten(line['orig_sents'])} <eos>",
        )
        for line in all_lines
    ]

In [None]:
# # DEBUG:
# in_file = os.path.join(DATA_PATH, GITHUB_JSONL_EXAMPLE_NAME)
# in_file_mike = os.path.join(DATA_PATH, TRAIN_JSONL_NAME)
# in_file_test_mike = os.path.join(DATA_PATH, TEST_JSONL_NAME)

# examples_in_shuff = load_data(in_file, task="in_shuf")
# examples_index_with_sep = load_data(in_file, task="index_with_sep")

# examples_in_shuff_mike = load_data(in_file_mike, task="in_shuf")
# examples_index_with_sep_mike = load_data(in_file_mike, task="index_with_sep")

# examples_in_shuff_test_mike = load_data(in_file_test_mike, task="in_shuf", is_test = True)
# examples_index_with_sep_test_mike = load_data(in_file_test_mike, task="index_with_sep", is_test = True)

# print(examples_in_shuff[0:1])
# print(examples_index_with_sep[0:1])
# print(examples_in_shuff_mike[0:1])
# print(examples_index_with_sep_mike[0:1])
# print(examples_in_shuff_test_mike[0:1])
# print(examples_index_with_sep_test_mike[0:1])

In [None]:
# Display the project folder; it is used as the default --out_dir in the argument parser
FOLDER_PATH

In [None]:
# Initialize the args as mentioned in the paper
#   - NOTE: not all parameters will be useful in this data challenge
#   - No command line is supplied in the notebook, so the defaults below are the effective values
parser = argparse.ArgumentParser()

# Required parameters
# Placeholder '-f' option; presumably absorbs the kernel connection-file argument the
# notebook runtime puts on the command line — TODO confirm
parser.add_argument('-f')
parser.add_argument("--out_dir", default=FOLDER_PATH, type=str, help="Out directory for checkpoints.")

# Other parameters
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
parser.add_argument("--do_eval", default=False, help="Whether to run eval on the dev set.")
parser.add_argument("--do_lower_case", default=False, help="Set this flag if you are using an uncased model.")
# NOTE(review): type=bool turns any non-empty string, including "False", into True
parser.add_argument("--do_train", default=True, type=bool, help="Whether to run training.")
parser.add_argument("--eval_batch_size", default=16, type=int, help="Batch size for evaluation.")
parser.add_argument("--eval_during_train", default=False, help="Evaluate at each train logging step.")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Steps before backward pass.")
parser.add_argument("--learning_rate", default=5e-6, type=float, help="The initial learning rate for Adam.")
parser.add_argument("--logging_steps", type=int, default=-1, help="Log every X updates steps (default after each epoch).")
parser.add_argument("--max_input_length", default=140, type=int, help="Maximum input event length in words.")
parser.add_argument("--max_output_length", default=120, type=int, help="Maximum output event length in words.")
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
# NOTE(review): max_steps is not read by train() in this notebook
parser.add_argument("--max_steps", default=1, type=int, help="If > 0: total number of training steps to perform.")
parser.add_argument("--model_name_or_path", default="facebook/bart-large-mnli", type=str, help="LM checkpoint for initialization.",)
parser.add_argument("--model_type", default="", type=str, help="which family of LM, e.g. gpt, gpt-xl, ....",)
parser.add_argument("--num_train_epochs", default=1.0, type=float, help="Number of training epochs to perform.",)
parser.add_argument("--overwrite_cache", action="store_true", help="Overwrite the cached data.")
parser.add_argument("--overwrite_out_dir", action="store_true", help="Overwrite the output directory.",)
parser.add_argument("--continue_training",action="store_true",help="Continue training from the last checkpoint.",)
# With the default of -1, train() never writes an intermediate checkpoint (it only saves when save_steps > 0)
parser.add_argument("--save_steps", type=int, default=-1, help="Save checkpoint every X updates steps (default after each epoch).",)
parser.add_argument("--save_total_limit", type=int, default=None, help="Maximum number of checkpoints to keep",)
parser.add_argument("--train_batch_size", default=16, type=int, help="Batch size for training.")
parser.add_argument("--train_file", type=str, required=False, help="The input CSV train file.")
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
parser.add_argument("--task", type=str, help="what is the task?")
args = parser.parse_args()

## Prepare for Training

In [None]:
# Taken from GitHub
def get_loss(args, batch, model):
    """
    Per-token cross-entropy loss of the model on one batch.

    Args:
        args: Namespace providing ``pad_token_id``.
        batch: Dict with the tensors ``inputs``, ``input_mask``, ``outputs``
            and ``output_mask``.
        model: Callable seq2seq model whose first return element is the
            language-model logits.

    Returns:
        Tensor of shape [batch_size, target_length - 1] holding the loss of
        every predicted target token, zero at padded / masked positions.
    """
    source_ids = batch["inputs"].to(device)
    source_mask = batch["input_mask"].to(device)
    full_targets = batch["outputs"].to(device)
    target_mask = batch["output_mask"].to(device)

    # Teacher forcing: the decoder sees tokens [0, n-1) and has to predict tokens [1, n)
    next_tokens = full_targets[:, 1:]

    # Labels are not passed to model.forward because the loss is needed per token
    # (use_cache=False is added for HF > 3.0)
    model_out = model(
        source_ids,
        attention_mask=source_mask,
        decoder_input_ids=full_targets[:, :-1].contiguous(),
        use_cache=False,
    )
    logits = model_out[0]
    n_rows, n_steps, n_vocab = logits.shape

    # Padding positions get the ignore index so they contribute no loss
    labels = next_tokens.clone().contiguous()
    labels[next_tokens == args.pad_token_id] = -100

    # Unreduced loss, reshaped back to one value per (instance, token)
    criterion = CrossEntropyLoss(reduction="none")
    flat_loss = criterion(logits.view(-1, n_vocab), labels.view(-1))
    token_loss = flat_loss.view(n_rows, n_steps)

    # Only consider non padded tokens
    keep = target_mask[..., :-1].contiguous()
    return torch.mul(keep, token_loss)  # [batch_size, max_length]

In [None]:
# Taken from GitHub
def _save_checkpoint(args, model, tokenizer, optimizer, scheduler, global_step):
    """Save model, tokenizer, args, optimizer and scheduler state under ``<out_dir>/checkpoint-<global_step>``."""
    out_dir = os.path.join(args.out_dir, "{}-{}".format("checkpoint", global_step))
    os.makedirs(out_dir, exist_ok=True)

    # Unwrap a parallel wrapper, if any, before saving
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(out_dir)
    tokenizer.save_pretrained(out_dir)
    torch.save(args, os.path.join(out_dir, "training_args.bin"))
    torch.save(optimizer.state_dict(), os.path.join(out_dir, "optimizer.pt"))
    torch.save(scheduler.state_dict(), os.path.join(out_dir, "scheduler.pt"))


def train(args, train_dataset, model, tokenizer, loss_fnc=get_loss, eval_dataset=None):
    """
    Train the model.

    Args:
        args: Namespace providing train_batch_size, gradient_accumulation_steps,
            num_train_epochs, weight_decay, learning_rate, adam_epsilon,
            warmup_steps, max_grad_norm, save_steps and out_dir.
        train_dataset: Dataset yielding the batches consumed by ``loss_fnc``.
        model: Model to optimise; its token embeddings are resized to
            ``len(tokenizer)`` before training.
        tokenizer: Tokenizer saved alongside every checkpoint.
        loss_fnc: Callable ``(args, batch, model) -> loss tensor``; the mean of
            the returned tensor is back-propagated.
        eval_dataset: Accepted for interface compatibility; not used.

    Returns:
        ``(global_step, average_loss)`` where ``global_step`` is the number of
        optimizer updates and ``average_loss`` is the accumulated loss divided
        by it (``0.0`` when no update was performed).
    """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    # Number of optimizer updates the learning-rate schedule is stretched over
    t_total = (len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare optimizer and scheduler (linear warmup and decay);
    # biases and LayerNorm weights are excluded from weight decay
    no_decay = ["bias", "LayerNorm.weight"]
    named_params = list(model.named_parameters())
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in named_params if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in named_params if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    # Train
    global_step = 0
    tr_loss = 0.0

    # Make sure the embedding matrix covers every token the tokenizer knows
    model_to_resize = model.module if hasattr(model, "module") else model
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(0, int(args.num_train_epochs), desc="Epoch")

    for _ in train_iterator:
        model.train()
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            # Take the loss only for the part after the input (as in seq2seq architecture)
            loss = loss_fnc(args, batch, model).mean()

            # Scale so that accumulated gradients average over the virtual batch
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.item()

            # Only update the weights once enough micro-batches were accumulated
            if (step + 1) % args.gradient_accumulation_steps != 0:
                continue

            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args.save_steps > 0 and global_step % args.save_steps == 0:
                _save_checkpoint(args, model, tokenizer, optimizer, scheduler, global_step)

    # No update happens with an empty dataset, zero epochs, or fewer batches than
    # accumulation steps; report a zero average instead of dividing by zero
    average_loss = tr_loss / global_step if global_step else 0.0
    return global_step, average_loss

## PyTorch Custom Dataset


In [None]:
class EncoderDecoderTextDataset(Dataset):
    """Tokenized, padded encoder inputs (and decoder targets) read from a JSONL file.

    Items are dicts with the tensors ``inputs`` and ``input_mask``; training
    items additionally carry ``outputs`` and ``output_mask``. All tensors of
    one kind have the same length, so items can be stacked into batches.
    """

    def __init__(self, tokenizer, args, file_path, block_size=512, is_test = False):
        """Load ``file_path`` with ``load_data`` and tokenize every example.

        Args:
            tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
                and ``encode``.
            args: Namespace providing ``max_input_length`` and
                ``max_output_length``.
            file_path: JSONL file passed to ``load_data``.
            block_size: Accepted for interface compatibility; not used.
            is_test: When True only inputs are prepared.

        Raises:
            ValueError: If the file contains no examples.
        """
        # Load the JSONL file
        examples = load_data(file_path, task = "index_with_sep", is_test = is_test)
        if not examples:
            raise ValueError(f"No examples found in {file_path}")

        # Input Data information (for both train/test):
        current_dict = {}

        # Unpadded token counts decide the padding length and the attention masks
        unpadded_inputs = [
            tokenizer.convert_tokens_to_ids(tokenizer.tokenize(ex[0]))
            for ex in examples
        ]
        max_input_length = min(args.max_input_length, max([len(ex) for ex in unpadded_inputs]))

        current_dict["input_lengths"] = [min(len(ex), max_input_length) for ex in unpadded_inputs]
        current_dict["inputs"] = [tokenizer.encode(
            ex[0], add_special_tokens=False, max_length=max_input_length, pad_to_max_length=True)
            for ex in examples]

        # Output Data information (only for train, test will have nothing):
        if not is_test:
            # +1 for the "[orig]" token that is prepended to every target below
            unpadded_output_sizes = [1 + len(tokenizer.tokenize(ex[1])) for ex in examples]
            max_output_length = min(args.max_output_length, max(unpadded_output_sizes))

            current_dict["output_lengths"] = [
                min(size, max_output_length) for size in unpadded_output_sizes
            ]
            # Store the padded encodings: every target then has the same length and
            # starts with the same "[orig]" token
            current_dict["outputs"] = [tokenizer.encode(
                '[orig] ' + ex[1], add_special_tokens=False, max_length=max_output_length, pad_to_max_length=True)
                for ex in examples]

        self.examples = current_dict

    def __len__(self):
        """Number of examples."""
        return len(self.examples["input_lengths"])

    def __getitem__(self, item):
        """Return the tensors of example ``item`` together with their 1/0 padding masks."""
        # Input Data information (for both train/test):
        current_dict = {}
        inputs = torch.tensor(self.examples["inputs"][item])
        max_length = inputs.shape[0]
        input_lengths = self.examples["input_lengths"][item]
        # 1 for real tokens, 0 for the padding that follows them
        input_mask = torch.tensor([1] * input_lengths + [0] * (max_length - input_lengths))

        current_dict["inputs"] = inputs
        current_dict["input_mask"] = input_mask

        # Output Data information (only for train, test will have nothing):
        if self.examples.get("outputs") is not None:
            outputs = torch.tensor(self.examples["outputs"][item])
            max_length = outputs.shape[0]
            output_lengths = self.examples["output_lengths"][item]
            output_mask = torch.tensor([1] * output_lengths + [0] * (max_length - output_lengths))
            current_dict["outputs"] = outputs
            current_dict["output_mask"] = output_mask

        return current_dict

## Begin Training

In [None]:
# Initialize the tokenizer and the pretrained model named by args.model_name_or_path
tokenizer, model = init_model(args.model_name_or_path, device=device, do_lower_case=args.do_lower_case)

In [None]:
# Apply the special tokens specified in the paper
# Keep tokenizer values on args; get_loss reads args.pad_token_id
# NOTE(review): pad_token_id is read before tokenizer.pad_token is reassigned below —
# confirm both refer to the same token
args.pad_token_id = tokenizer.pad_token_id
args.block_size = tokenizer.max_len_single_sentence

# Marker tokens used in the texts built by load_data, plus one <S{i}> sentence-index
# token for every i below args.max_output_length
special_tokens = ["[shuffled]", "[orig]", "<eos>"]
extra_specials = [f"<S{i}>" for i in range(args.max_output_length)]
special_tokens += extra_specials
tokenizer.pad_token = "<pad>"
tokenizer.eos_token = "<eos>"
tokenizer.add_tokens(special_tokens)
# Resize the embedding matrix to the enlarged vocabulary
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Initialize the train PyTorch dataset from the train JSONL file written earlier
train_data = os.path.join(DATA_PATH, TRAIN_JSONL_NAME)
train_set = EncoderDecoderTextDataset(tokenizer, args, train_data)

In [None]:
# Training
model.to(device) # send to GPU if applicable

# Training is skipped entirely when args.do_train is falsy
if args.do_train:
    # train() returns the number of optimizer updates and the average loss per update
    global_step, tr_loss = train(
        args,
        train_set,
        model,
        tokenizer,
        loss_fnc=get_loss,
    )

    # Create output directory if needed
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # Save the following:
    #   - arguments
    #   - model
    #   - tokenizer
    # (model.module is used when the model is wrapped, e.g. by a parallel wrapper)
    model_to_save = model.module if hasattr(model, "module") else model
    model_to_save.save_pretrained(args.out_dir)
    tokenizer.save_pretrained(args.out_dir)

    # Good practice: save your training arguments together with the trained model
    torch.save(args, os.path.join(args.out_dir, "training_args.bin"))

# For Prediction (trained models)

In [None]:
# Reload the saved model and tokenizer from args.out_dir (the directory written by the training cell)
tokenizer, model = init_model(
    args.out_dir, device=device, do_lower_case=args.do_lower_case, args=args
)
args.block_size = tokenizer.max_len_single_sentence
# init_model already moves the model to `device`; this call repeats that
model.to(device)

In [None]:
# Prepare the test data (inputs only, since is_test=True)
test_data = os.path.join(DATA_PATH, TEST_JSONL_NAME)
test_set = EncoderDecoderTextDataset(tokenizer, args, test_data, is_test = True)
# batch_size=1 and no sampler: one example per batch, in file order
test_loader = DataLoader(test_set, batch_size=1)

In [None]:
# Generate an ordering for every test example; `outputs` collects one list of predicted positions each
outputs = []
for i, batch in tqdm(enumerate(test_loader)):
    # Send the inputs to gpu
    input_ids = batch["inputs"].to(device)
    # test_loader yields one example per batch, so [0] is the single generated sequence
    model_outputs = model.generate(input_ids=input_ids).tolist()[0]
    # Decode to get position vectors: the slice keeps generated token positions 1 through 5
    # and skips position 0
    decoded = tokenizer.decode(model_outputs[1:6])
    decoded_list = decoded.split(" ")
    pos_pred = []
    for item in decoded_list:
        # first element may be empty string
        if item != '':
            # NOTE(review): int() raises ValueError if a decoded piece is not a number
            pos_pred.append(int(item))
    outputs.append(pos_pred)

In [None]:
# Build the submission table: an id column (0 .. number of test batches - 1) followed by the predictions.
# NOTE(review): np.array(outputs) must be 2-D with 5 columns to match the column names below;
# this presumably holds only if every prediction decoded to exactly 5 integers — verify
df_to_submit = pd.DataFrame(np.concatenate([np.arange(len(test_loader))[:, None], np.array(outputs)], axis = 1))
df_to_submit.columns = ['id'] + [f"index_{i+1}"for i in range(5)]

# CHECK RESULTS
df_to_submit.head()

In [None]:
# SAVE SUBMISSION FOR KAGGLE (written to the project folder, without the DataFrame index)
df_to_submit.to_csv(os.path.join(FOLDER_PATH, "submission_mike.csv"), index = False)

In [None]:
# Submission should be: col. = id, index_1, ..., index_5
#   - double check if submission is in the same format
#   - load the provided sample submission and display it for a visual comparison
sample_submission = pd.read_csv(os.path.join(DATA_PATH, "sample_submission.csv"))
sample_submission