# Imports

In [None]:
!pip install datasets pandas torch transformers ml_things



In [None]:
from datasets import load_dataset
import pandas as pd
import torch
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import (set_seed,
                          TrainingArguments,
                          Trainer,
                          GPT2Config,
                          GPT2Tokenizer,
                          GPT2LMHeadModel,
                          AutoTokenizer,
                          AdamW,
                          get_linear_schedule_with_warmup,
                          GPT2ForSequenceClassification,
                          PreTrainedTokenizer)
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from ml_things import plot_dict, plot_confusion_matrix, fix_text
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import os
import json
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive; the corpus path and the checkpoint
# directories used in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

  and should_run_async(code)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Pre-train on domain-specific knowledge

Adapted from https://github.com/agrechnev/hugging_examples/blob/master/train_gpt2_torch1.py

In [None]:
# Corpus selection for the pre-training run.
subset = 1

# Name of the JSON key that holds the text of each record:
#   'text'            -> tiny pijama
#   'abstract'        -> semantic scholar raw
#   'medium_rephrase' -> semantic scholar rephrased
field = 'medium_rephrase'

# File with one JSON record per line, stored on the mounted Google Drive.
json_file_name = 'SS_merged.json'
json_file_path = f"/content/drive/My Drive/SNLP Group Project/SS_final_datasets/{json_file_name}"

  and should_run_async(code)


In [None]:
# Run configuration shared by the cells below.
TEXT_CORPUS = json_file_path  # path of the JSON-lines corpus selected in the previous cell
BLOCK_LEN = 1024  # passed as the tokenizer max_length when the dataset is built
# NOTE(review): presumably the GPT-2 end-of-text token id; not referenced in the visible cells — confirm it is still needed.
TOKEN_ENDOFTEXT = 50256
# NOTE(review): same value as BLOCK_LEN and not referenced in the visible cells — confirm it is still needed.
max_length = 1024
model_name_or_path = 'gpt2'  # checkpoint name handed to from_pretrained()
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  and should_run_async(code)


## Train and validate for one epoch

In [None]:
def train_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader, optimizer: torch.optim.Optimizer,
              device: "torch.device | None" = None):
    """Train ``model`` for one epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes the loss under the ``'loss'`` key.
        loader: Iterable of batches; each batch is a dict of tensors containing at
            least ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer that updates the parameters of ``model``.
        device: Device the batches are moved to. When ``None`` (default) the device
            of the model's first parameter is used, so the function does not depend
            on a notebook-level ``device`` variable; a model without parameters
            falls back to the CPU.

    Returns:
        ``np.mean`` of the per-batch loss values (``nan`` if ``loader`` is empty).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    losses = []
    for batch in tqdm(loader):
        # Build a new dict instead of overwriting the caller's batch in place.
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = out['loss']
        loss.backward()
        optimizer.step()
        # .item() stores a plain Python float, so the autograd graph is not kept alive.
        losses.append(loss.item())

    return np.mean(losses)

  and should_run_async(code)


In [None]:
def val_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader,
            device: "torch.device | None" = None):
    """Evaluate ``model`` over ``loader`` once and return the mean per-batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes the loss under the ``'loss'`` key.
        loader: Iterable of batches; each batch is a dict of tensors containing at
            least ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device: Device the batches are moved to. When ``None`` (default) the device
            of the model's first parameter is used, so the function does not depend
            on a notebook-level ``device`` variable; a model without parameters
            falls back to the CPU.

    Returns:
        ``np.mean`` of the per-batch loss values (``nan`` if ``loader`` is empty).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    losses = []
    # No gradients are needed for evaluation; disable autograd for the whole pass.
    with torch.no_grad():
        for batch in tqdm(loader):
            # Build a new dict instead of overwriting the caller's batch in place.
            batch = {k: v.to(device) for k, v in batch.items()}
            out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
            losses.append(out['loss'].item())

    return np.mean(losses)

  and should_run_async(code)


## Split the text into tokenized blocks

In [None]:
def break_text_to_pieces(text_path: str, tokenizer: "PreTrainedTokenizer", block_len: int = 512,
                         text_field: "str | None" = None):
    """Read a JSON-lines corpus and tokenize one text per record.

    Every non-blank line of ``text_path`` is parsed as a JSON object and the value
    stored under ``text_field`` is collected. The texts are then tokenized in a
    single call with ``padding='max_length'`` and ``truncation=True``, so each
    record yields exactly one block of ``block_len`` tokens (longer texts are cut
    off, not split into several blocks).

    Args:
        text_path: Path of the JSON-lines file to read.
        tokenizer: Callable tokenizer; it must have a pad token configured because
            padding is requested.
        block_len: Number of tokens per block (passed as ``max_length``).
        text_field: Key of the text inside each JSON record. When ``None`` (default)
            the notebook-level ``field`` variable is used.

    Returns:
        Whatever ``tokenizer(...)`` returns for ``return_tensors="pt"``; the rest of
        the notebook reads its ``'input_ids'`` and ``'attention_mask'`` entries.

    Raises:
        KeyError: If a record has no ``text_field`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    key = field if text_field is None else text_field
    # Read from the path that was passed in rather than from a notebook global.
    with open(text_path, 'r', encoding='utf-8') as json_file:
        # Blank lines (e.g. a trailing newline block) carry no record; skip them.
        collection = [json.loads(line)[key] for line in json_file if line.strip()]
    blocks = tokenizer(collection, return_tensors="pt", padding='max_length', truncation=True, max_length=block_len)
    return blocks

  and should_run_async(code)


In [None]:
class MyDset(torch.utils.data.Dataset):
    """In-memory dataset of tokenized examples for causal language-model training.

    Each item is a dict with ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
    ``'labels'`` is a copy of ``'input_ids'`` in which every position whose
    attention mask is 0 is replaced by ``IGNORE_INDEX``, so that padded positions
    are excluded from the loss instead of being learned as targets.
    """

    # Label value skipped by the cross-entropy loss (its default ``ignore_index``).
    IGNORE_INDEX = -100

    def __init__(self, data):
        """Split a batched tokenizer output into per-example dicts.

        Args:
            data: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries that
                can be indexed per example; each example is expected to be a
                ``torch.Tensor`` (e.g. a tokenizer output built with
                ``return_tensors="pt"``).
        """
        print("Constructing dataset...")
        self.data = []
        n = len(data['input_ids'])
        for i in tqdm(range(n)):
            input_ids = data['input_ids'][i]
            attention_mask = data['attention_mask'][i]
            # Mask by attention_mask rather than by token id: the notebook reuses
            # the EOS token as pad token, so the id alone cannot identify padding.
            labels = input_ids.masked_fill(attention_mask == 0, self.IGNORE_INDEX)
            self.data.append({'input_ids': input_ids,
                              'attention_mask': attention_mask,
                              'labels': labels})
        print("Done!")

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, idx: int):
        """Return the example dict stored at position ``idx``."""
        return self.data[idx]

  and should_run_async(code)


In [None]:
def prepare_dsets(text_path: str, tokenizer: PreTrainedTokenizer, block_len: int):
    """Tokenize the corpus at ``text_path`` and wrap the result in a ``MyDset``.

    ``tokenizer`` and ``block_len`` are forwarded unchanged to
    ``break_text_to_pieces``.
    """
    return MyDset(break_text_to_pieces(text_path, tokenizer, block_len))

  and should_run_async(code)


## Create the model and tokenizer and train

In [None]:
# Load the pretrained language model and its tokenizer from the same checkpoint name.
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Pad on the right and reuse the end-of-sequence token as the padding token,
# so that padding='max_length' can be requested when the corpus is tokenized.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

  and should_run_async(code)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Create the training dataset: tokenize the corpus into blocks of BLOCK_LEN tokens
# (the DataLoader itself is built in a separate cell).
dset_train = prepare_dsets(TEXT_CORPUS, tokenizer, BLOCK_LEN)

  and should_run_async(code)


Constructing dataset...


100%|██████████| 12000/12000 [00:00<00:00, 140133.61it/s]

Done!





In [None]:
# Feed the training set one example per step.
# NOTE(review): no shuffle argument is passed, so examples are visited in file order
# every epoch — confirm this is intended (shuffle=True is the usual choice for training).
loader_train = torch.utils.data.DataLoader(dset_train, batch_size=1)

  and should_run_async(code)


In [None]:
# Optimizer, device
# Move the model to the selected device before building the optimizer over its parameters.
model.to(device)
# NOTE(review): lr=1e-3 is much higher than the rates commonly used to fine-tune
# pretrained transformers — confirm this value is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Per-epoch loss history.
# NOTE(review): loss_val_list is never appended to in the visible cells (val_one is not called here).
loss_train_list = []
loss_val_list   = []

# Training loop
# Runs 4 epochs; after each one the model and tokenizer are saved to a Drive
# directory whose name contains the number of completed epochs.
for i_epoch in range(4):
    loss_train = train_one(model, loader_train, optimizer)
    loss_train_list.append(loss_train)
    print(f'{i_epoch} : loss_train={loss_train}')
    print("Saving model...")
    model_name_or_path_pretrained = f'/content/drive/My Drive/SNLP Group Project/SS_final_datasets/trained_model_rephrasing_{i_epoch+1}_epochs/'
    model.save_pretrained(model_name_or_path_pretrained)
    tokenizer.save_pretrained(model_name_or_path_pretrained)
    print("Done!")

  and should_run_async(code)
100%|██████████| 12000/12000 [17:24<00:00, 11.49it/s]


0 : loss_train=0.8952494959446291
Saving model...
Done!


100%|██████████| 12000/12000 [17:23<00:00, 11.50it/s]


1 : loss_train=0.7701165051438535
Saving model...
Done!


100%|██████████| 12000/12000 [17:23<00:00, 11.50it/s]


2 : loss_train=0.6909109307210892
Saving model...
Done!


100%|██████████| 12000/12000 [17:23<00:00, 11.50it/s]


3 : loss_train=0.6247986778058112
Saving model...
Done!
