In [1]:
import os
import copy
import random
import pandas as pd
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
from tqdm.auto import tqdm
from tqdm import tqdm
#import t5Utils
#from t5Utils import get_target_id_map

pl.seed_everything(42)

path = 'feedback-prize-2021'
train_csv = os.path.join(path, 'train.csv')

Global seed set to 42


In [2]:
def prepare_training_data_helper(path, tokenizer, df, train_ids):
    training_samples = []
    for idx in tqdm(train_ids):
        filename = os.path.join(path, "train", idx + ".txt")
        with open(filename, "r") as f:
            text = f.read()

        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        input_ids = encoded_text["input_ids"]
        input_labels = copy.deepcopy(input_ids)
        offset_mapping = encoded_text["offset_mapping"]

        for k in range(len(input_labels)):
            input_labels[k] = "O"

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "text": text,
            "offset_mapping": offset_mapping,
        }

        temp_df = df[df["id"] == idx]
        # each row of the df is going to be one sentence with one type
        for _, row in temp_df.iterrows():
            # id is just one document
            text_labels = [0] * len(text)
            discourse_start = int(row["discourse_start"])
            discourse_end = int(row["discourse_end"])
            prediction_label = row["discourse_type"]
            text_labels[discourse_start:discourse_end] = [1] * (discourse_end - discourse_start)
            target_idx = []
            # iterating over the offset mapping for the encoded text (tokenized text) so you are basically iterating over every single word (token)
            for map_idx, (offset1, offset2) in enumerate(encoded_text["offset_mapping"]):
                if sum(text_labels[offset1:offset2]) > 0:
                    if len(text[offset1:offset2].split()) > 0:
                        target_idx.append(map_idx)

            targets_start = target_idx[0]
            targets_end = target_idx[-1]
            pred_start = "B-" + prediction_label
            pred_end = "I-" + prediction_label
            input_labels[targets_start] = pred_start
            input_labels[targets_start + 1 : targets_end + 1] = [pred_end] * (targets_end - targets_start)

        sample["input_ids"] = input_ids
        sample["input_labels"] = input_labels
        training_samples.append(sample)
    return training_samples

target_id_map = {
        "B-Lead": 0,
        "I-Lead": 1,
        "B-Position": 2,
        "I-Position": 3,
        "B-Evidence": 4,
        "I-Evidence": 5,
        "B-Claim": 6,
        "I-Claim": 7,
        "B-Concluding Statement": 8,
        "I-Concluding Statement": 9,
        "B-Counterclaim": 10,
        "I-Counterclaim": 11,
        "B-Rebuttal": 12,
        "I-Rebuttal": 13,
        "O": 14,
        "PAD": -100,
    }

id_target_map = {v: k for k, v in target_id_map.items()}


In [3]:
## GLOBALS
N_EPOCHS = 3 # starting guess
BATCH_SIZE = 3 # also a starting guess 
NUM_WORKERS = 1 # start at the same as batch size

In [4]:
class EssayDataset(Dataset):
    
    def __init__(self, samples, tokenizer, max_len):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)
        return
    
    def __len__(self):
        return self.length
        
    
    def __getitem__(self, index):
        #input_ids = self.samples[index]['input_ids']
        input_labels = self.samples[index]['input_labels']
        # GET FROM ABISHEK THE target_id_map
        input_labels = [target_id_map[x] for x in input_labels]
        other_label_id = target_id_map["O"]
        padding_label_id = target_id_map["PAD"]
        
        text = self.samples[index]['text']
        
        encoded_text = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        input_ids = encoded_text['input_ids']
        mask = encoded_text['attention_mask']
        token_type_ids = encoded_text['token_type_ids']
        
        # add start token id to the input_ids
        #input_ids = [self.tokenizer.cls_token_id] + input_ids
        input_labels = [other_label_id] + input_labels

        if len(input_labels) > self.max_len - 1:
            print('chopped')
            #input_ids = input_ids[: self.max_len - 1]
            input_labels = input_labels[: self.max_len - 1]

        # add end token id to the input_ids
        #input_ids = input_ids + [self.tokenizer.sep_token_id]
        input_labels = input_labels + [other_label_id]
        
        padding_length = self.max_len - len(input_labels)
        if padding_length > 0:
            if self.tokenizer.padding_side == "right":
                #input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
                input_labels = input_labels + [padding_label_id] * padding_length
                #attention_mask = mask + [0] * padding_length
            else:
                #input_ids = [self.tokenizer.pad_token_id] * padding_length + input_ids
                input_labels = [padding_label_id] * padding_length + input_labels
                #attention_mask = [0] * padding_length + mask
        
        ret = {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'input_labels': torch.tensor(input_labels, dtype=torch.long)
        }
        return ret
    
    def collate_fn(self, batch):
        print(len(batch[0]['input_labels']))
        thing = {key: set([len(d[key]) for d in batch]) for key in batch[0]} 
        for key in thing.keys(): 
            if len(thing[key]) > 1:
                print('found a mistmatch', key)
        return
    

In [5]:
class EssayDataModule(pl.LightningDataModule):
    
    def __init__(self,
                 train_df,
                 test_df,
                 tokenizer: T5Tokenizer,
                 batch_size: int,
                 max_len: int):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_len = max_len
        return
    
    def setup(self, stage=None):
        self.train_dataset = EssayDataset(self.train_df, self.tokenizer, self.max_len)
        self.test_dataset = EssayDataset(self.test_df, self.tokenizer, self.max_len)
        return
    
    
    def train_dataloader(self):
        return DataLoader(dataset=self.train_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS)#, collate_fn = self.train_dataset.collate_fn)
    
    def val_dataloader(self):
        return DataLoader(dataset=self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS)#, collate_fn = self.test_dataset.collate_fn)
    
    def test_dataloader(self):
        return DataLoader(dataset=self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS)#, collate_fn = self.test_dataset.collate_fn)
    

In [6]:
MODEL_NAME = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

class EssayT5Model(LightningModule):
    
    def __init__(self):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        return
    
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['mask']
        labels = batch['input_labels']
        loss, outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['mask']
        labels = batch['input_labels']
        loss, outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['mask']
        labels = batch['input_labels']
        loss, outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001) 
    
    
    

In [7]:
# Run this cell only once, its the one that gets all the data
df = pd.read_csv(train_csv)
essay_ids = df['id'].unique()
samples = prepare_training_data_helper(path, tokenizer, df, essay_ids) 

  0%|                                                 | 0/15594 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (733 > 512). Running this sequence through the model will result in indexing errors
100%|████████████████████████████████████| 15594/15594 [02:22<00:00, 109.59it/s]


In [8]:
n = len(samples)
random.seed(42)
test_idxs = set(random.sample(range(n), int(n*0.1)))
test = [samples[idx] for idx in range(n) if idx in test_idxs]
train = [samples[idx] for idx in range(n) if idx not in test_idxs]
data_module = EssayDataModule(train, test, tokenizer, batch_size=BATCH_SIZE, max_len=1536)
model = EssayT5Model()
print(len(test))
print(len(train))

1559
14035


In [9]:
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

Reusing TensorBoard on port 6006 (pid 441400), started 3 days, 8:57:55 ago. (Use '!kill 441400' to kill it.)

In [10]:
# Setup training ang saving chekcpoints, train model.

checkpoint_callback = ModelCheckpoint(
    dirpath='checkpoints',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

logger = TensorBoardLogger('lightning_logs', name='essay_evaluation')

trainer = pl.Trainer(
    logger=logger,
    checkpoint_callback=checkpoint_callback,
    max_epochs=N_EPOCHS,
    gpus=1,
    progress_bar_refresh_rate=30
)

trainer.fit(model, data_module)
    

  rank_zero_deprecation(
  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  rank_zero_warn(
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Global seed set to 42
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


RuntimeError: CUDA out of memory. Tried to allocate 324.00 MiB (GPU 0; 7.91 GiB total capacity; 3.87 GiB already allocated; 327.75 MiB free; 3.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trained_model = EssayT5Mode.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)