In [44]:
import os
import copy
import random
import pandas as pd
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
from tqdm.auto import tqdm
from tqdm import tqdm
#import t5Utils
#from t5Utils import get_target_id_map

# Fix the global random seed through Lightning before anything stochastic runs.
pl.seed_everything(42)

# Data directory and the path of the training CSV inside it.
path = "feedback-prize-2021"
train_csv = os.path.join(path, "train.csv")

Global seed set to 42


In [30]:
def prepare_training_data_helper(path, tokenizer, df, train_ids):
    """Tokenize each essay and attach one BIO tag string per token.

    Args:
        path: data directory; essays are read from ``<path>/train/<id>.txt``.
        tokenizer: tokenizer whose ``encode_plus`` supports
            ``return_offsets_mapping=True``.
        df: DataFrame with the columns ``id``, ``discourse_start``,
            ``discourse_end`` and ``discourse_type``; the start/end values are
            used as character offsets into the essay text.
        train_ids: iterable of essay ids (strings) to process.

    Returns:
        A list with one dict per essay, holding ``id``, ``input_ids``,
        ``text``, ``offset_mapping`` and ``input_labels``. ``input_labels``
        has one entry per token: ``"O"`` by default, ``"B-<type>"`` on the
        first token of an annotated span and ``"I-<type>"`` on the rest.
        When annotations overlap, later rows overwrite earlier ones.
    """
    # Group the annotation rows once instead of re-filtering the whole frame
    # for every essay; row order inside each group is preserved.
    rows_by_id = {essay_id: rows for essay_id, rows in df.groupby("id", sort=False)}

    training_samples = []
    for idx in tqdm(train_ids):
        filename = os.path.join(path, "train", idx + ".txt")
        # Explicit encoding so the character offsets do not depend on the
        # platform's default locale.
        with open(filename, "r", encoding="utf-8") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        input_ids = encoded_text["input_ids"]
        offset_mapping = encoded_text["offset_mapping"]
        input_labels = ["O"] * len(input_ids)

        essay_rows = rows_by_id.get(idx)
        if essay_rows is not None:
            # Each row annotates one character span with one discourse type.
            for _, row in essay_rows.iterrows():
                discourse_start = int(row["discourse_start"])
                discourse_end = int(row["discourse_end"])
                prediction_label = row["discourse_type"]

                # A token belongs to the span when its character range
                # overlaps [discourse_start, discourse_end) and it is not
                # pure whitespace.
                target_idx = [
                    map_idx
                    for map_idx, (offset1, offset2) in enumerate(offset_mapping)
                    if max(offset1, discourse_start) < min(offset2, discourse_end)
                    and text[offset1:offset2].strip()
                ]
                if not target_idx:
                    # The annotation touches no token (e.g. offsets outside
                    # the text); skip it instead of raising IndexError.
                    continue

                targets_start = target_idx[0]
                targets_end = target_idx[-1]
                input_labels[targets_start] = "B-" + prediction_label
                input_labels[targets_start + 1 : targets_end + 1] = (
                    ["I-" + prediction_label] * (targets_end - targets_start)
                )

        training_samples.append(
            {
                "id": idx,
                "input_ids": input_ids,
                "text": text,
                "offset_mapping": offset_mapping,
                "input_labels": input_labels,
            }
        )
    return training_samples

# Tag -> integer id. Every discourse type takes two consecutive ids, first
# its "B-" (span start) tag and then its "I-" (span continuation) tag, in the
# order the types are listed below.
target_id_map = {
    f"{prefix}-{discourse_type}": 2 * position + offset
    for position, discourse_type in enumerate(
        (
            "Lead",
            "Position",
            "Evidence",
            "Claim",
            "Concluding Statement",
            "Counterclaim",
            "Rebuttal",
        )
    )
    for offset, prefix in enumerate("BI")
}
# Tag for tokens outside every annotated span, and the id used for padding.
target_id_map["O"] = 14
target_id_map["PAD"] = -100

# Reverse lookup: integer id -> tag.
id_target_map = dict(zip(target_id_map.values(), target_id_map.keys()))


In [37]:
## GLOBALS
# Training hyper-parameters. All three are first guesses, not tuned values.
N_EPOCHS = 3
BATCH_SIZE = 4
NUM_WORKERS = 4  # initially set to the same value as BATCH_SIZE

In [38]:
class EssayDataset(Dataset):
    """Map-style dataset turning prepared essay samples into fixed-length tensors.

    Each sample is a dict holding at least ``'text'`` (the raw essay) and
    ``'input_labels'`` (one tag string per token of the essay tokenized
    *without* special tokens).
    """

    def __init__(self, samples, tokenizer, max_len, label_map=None):
        """
        Args:
            samples: sequence of sample dicts (see the class docstring).
            tokenizer: tokenizer exposing ``encode_plus``.
            max_len: length of every tensor returned by ``__getitem__``.
            label_map: optional tag -> integer id mapping that contains the
                keys ``"O"`` and ``"PAD"``. Defaults to the module-level
                ``target_id_map``.
        """
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)
        self.label_map = target_id_map if label_map is None else label_map

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, index):
        """Encode one essay and align its token labels with the encoding.

        Returns:
            A dict of 1-D ``torch.long`` tensors of length ``max_len``:
            ``'input_ids'``, ``'mask'`` (attention mask), ``'token_type_ids'``
            and ``'input_labels'``.
        """
        sample = self.samples[index]
        other_label_id = self.label_map["O"]
        padding_label_id = self.label_map["PAD"]
        token_labels = iter([self.label_map[tag] for tag in sample['input_labels']])

        # The tokenizer truncates and pads (on its own padding side), so all
        # returned lists already have exactly `max_len` entries.
        encoded_text = self.tokenizer.encode_plus(
            sample['text'],
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
            return_special_tokens_mask=True,
        )
        input_ids = encoded_text['input_ids']
        mask = encoded_text['attention_mask']
        token_type_ids = encoded_text['token_type_ids']

        # Walk the encoding position by position so the labels stay aligned
        # whatever special tokens the tokenizer inserts:
        #   padding position     -> PAD label
        #   other special token  -> "O"
        #   ordinary token       -> next precomputed label
        input_labels = []
        for is_special, is_attended in zip(encoded_text['special_tokens_mask'], mask):
            if not is_attended:
                input_labels.append(padding_label_id)
            elif is_special:
                input_labels.append(other_label_id)
            else:
                # Fall back to "O" should the label list run out early.
                input_labels.append(next(token_labels, other_label_id))

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Class ids are integers; losses that index by class need long.
            'input_labels': torch.tensor(input_labels, dtype=torch.long),
        }
    

In [56]:
class EssayDataModule(pl.LightningDataModule):
    """LightningDataModule that wraps the train and held-out sample sets.

    Despite their names, ``train_df`` and ``test_df`` are handed unchanged to
    ``EssayDataset`` as its ``samples`` argument. The held-out set serves
    both the validation and the test dataloader.
    """

    def __init__(self,
                 train_df,
                 test_df,
                 tokenizer: T5Tokenizer,
                 batch_size: int,
                 max_len: int):
        """
        Args:
            train_df: samples used for training.
            test_df: samples used for validation and testing.
            tokenizer: tokenizer forwarded to ``EssayDataset``.
            batch_size: batch size of every dataloader.
            max_len: sequence length forwarded to ``EssayDataset``.
        """
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_len = max_len

    def setup(self, stage=None):
        """Build the train and held-out datasets."""
        self.train_dataset = EssayDataset(self.train_df, self.tokenizer, self.max_len)
        self.test_dataset = EssayDataset(self.test_df, self.tokenizer, self.max_len)

    def train_dataloader(self):
        """Return the training loader; batches are reshuffled every epoch."""
        # Shuffle so that each epoch sees the essays in a different order.
        return DataLoader(dataset=self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=NUM_WORKERS)

    def val_dataloader(self):
        """Return the validation loader (held-out set, fixed order)."""
        return DataLoader(dataset=self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS)

    def test_dataloader(self):
        """Return the test loader (same held-out set, fixed order)."""
        return DataLoader(dataset=self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS)
    

In [57]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
MODEL_NAME = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

class EssayT5Model(LightningModule):
    """LightningModule that fine-tunes a pretrained T5ForConditionalGeneration.

    Batches are dicts with the keys ``'input_ids'``, ``'mask'`` and
    ``'input_labels'``.
    """

    def __init__(self):
        # Must run before any sub-module is assigned, otherwise torch raises
        # "cannot assign module before Module.__init__() call".
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)#, return_dict=True)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and return ``(loss, logits)``.

        ``loss`` is whatever the wrapped model reports for the given
        ``labels`` (``None`` when no labels are passed).
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits

    def _shared_step(self, batch, log_name):
        """Compute the loss for one batch and log it under ``log_name``."""
        # NOTE(review): T5ForConditionalGeneration treats `labels` as decoder
        # target token ids, so the tag ids are scored against the vocabulary
        # logits - confirm this is the intended training objective.
        # The labels must be integer ids; the cast is a no-op for long tensors.
        labels = batch['input_labels'].long()
        loss, _ = self(
            input_ids=batch['input_ids'],
            attention_mask=batch['mask'],
            labels=labels,
        )
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss, logged as ``train_loss``."""
        return self._shared_step(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss, logged as ``val_loss``."""
        return self._shared_step(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss, logged as ``test_loss``."""
        return self._shared_step(batch, "test_loss")

    def configure_optimizers(self):
        """Optimize every parameter with AdamW at a learning rate of 1e-4."""
        return AdamW(self.parameters(), lr=0.0001)
    
    
    

In [35]:
# Read the annotation table, then build one labelled sample per distinct essay id.
df = pd.read_csv(train_csv)
essay_ids = df["id"].unique()
samples = prepare_training_data_helper(path, tokenizer, df, essay_ids)

  0%|                                                 | 0/15594 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (733 > 512). Running this sequence through the model will result in indexing errors
100%|█████████████████████████████████████| 15594/15594 [02:44<00:00, 94.55it/s]


In [58]:
# Hold out a reproducible 10% of the samples; the rest is used for training.
n = len(samples)
random.seed(42)
test_idxs = set(random.sample(range(n), int(n*0.1)))
test = [sample for idx, sample in enumerate(samples) if idx in test_idxs]
train = [sample for idx, sample in enumerate(samples) if idx not in test_idxs]
data_module = EssayDataModule(train, test, tokenizer, batch_size=BATCH_SIZE, max_len=1536)

# The training cell calls `trainer.fit(model, data_module)`, so the model has
# to exist. EssayT5Model takes no constructor arguments.
model = EssayT5Model()

In [36]:
# Load the TensorBoard notebook extension and point it at ./lightning_logs.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
# Set up training and checkpoint saving, then train the model.

# Keep only the single checkpoint with the lowest validation loss.
checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

logger = TensorBoardLogger('lightning_logs', name='essay_evaluation')

trainer = pl.Trainer(
    logger=logger,
    # Callback instances go in `callbacks`; `checkpoint_callback` is a flag.
    callbacks=[checkpoint_callback],
    max_epochs=N_EPOCHS,
    gpus=1,
    # NOTE(review): newer Lightning releases replace this argument with the
    # TQDMProgressBar(refresh_rate=...) callback - adjust on upgrade.
    progress_bar_refresh_rate=30
)

trainer.fit(model, data_module)
    

In [None]:
# Restore the model from the best checkpoint path recorded by the trainer.
trained_model = EssayT5Model.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)