In [34]:
import os
import pandas as pd
import torch 
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
)
from tqdm.auto import tqdm

pl.seed_everything(42)

train_csv = os.path.join('feedback-prize-2021', 'train.csv')

Global seed set to 42


In [35]:
#df = pd.read_csv

In [36]:
## GLOBALS
N_EPOCHS = 3 #TODO
BATCH_SIZE = 0 #TODO
NUM_WORKERS = 0 #TODO 

In [37]:
class EssayDataset(Dataset):
    
    def __init__(self, samples, tokenizer, max_len):
        self.samples = samples
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.length = len(samples)
        return
    
    def __len__(self):
        return self.length
        
    
    def __getitem__(self, index):
        #input_ids = self.samples[index]['input_ids']
        input_labels = self.samples[index]['input_labels']
        # GET FROM ABISHEK THE target_id_map
        input_labels = [target_id_map[x] for x in input_labels]
        other_label_id = target_id_map["O"]
        padding_label_id = target_id_map["PAD"]
        
        text = self.samples['text']
        
        encoded_text = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        input_ids = endcoded_text['input_ids']
        mask = encoded_text['attention_mask']
        token_type_id = encoded_text['token_type_ids']
        
        
        # add start token id to the input_ids
        #input_ids = [self.tokenizer.cls_token_id] + input_ids
        input_labels = [other_label_id] + input_labels

        if len(input_ids) > self.max_len - 1:
            input_ids = input_ids[: self.max_len - 1]
            input_labels = input_labels[: self.max_len - 1]

        # add end token id to the input_ids
        #input_ids = input_ids + [self.tokenizer.sep_token_id]
        input_labels = input_labels + [other_label_id]
        
        padding_length = self.max_len - len(input_ids)
        if padding_length > 0:
            if self.tokenizer.padding_side == "right":
                input_ids = input_ids + [self.tokenizer.pad_token_id] * padding_length
                input_labels = input_labels + [padding_label_id] * padding_length
                attention_mask = attention_mask + [0] * padding_length
            else:
                input_ids = [self.tokenizer.pad_token_id] * padding_length + input_ids
                input_labels = [padding_label_id] * padding_length + input_labels
                attention_mask = [0] * padding_length + attention_mask
        
        
        
        # add start token id 
        #input_ids = [self.tokenizer.decoder_start_token_ ] + input_ids
        
        ## !!! add other token to the labels !!!
        
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'input_labels': torch.tensor(input_labels, dtype=torch.float)
        }
        return 
    

In [38]:
class EssayDataModule(pl.LightningDataModule):
    
    def __init__(self,
                 train_df: pd.DataFrame,
                 test_df: pd.DataFrame,
                 tokenizer: T5Tokenizer,
                 batch_size: int,
                 max_len: int):
        super().__init__()
        self.train_df
        self.test_df
        self.batch_size
        self.tokenizer
        self.max_len
        return
    
    def setup(self, stage=None):
        self.train_dataset = EssayDataset(self.train_df, self.tokenizer, self.max_len)
        self.test_dataset = EssayDataset(self.test_df, self.tokenizer, self.max_len)
        return
    
    
    def train_dataloader(self):
        return DataLoader(dataset=self.train_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS)
    
    def val_dataloader(self):
        return DataLoader(dataset=self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS)
    
    def test_dataloader(self):
        return DataLoader(dataset=self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=NUM_WORKERS)
    

In [45]:
MODEL_NAME = 't5-base'

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

data_module = EssayDataModule(train_df, test_df, tokenizer, batch_size=BATCH_SIZE)

In [47]:
class EssayT5Model(LightningModule):
    
    def __init__(self):
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)#, return_dict=True)
        return
    
    def forward(self, input_ids, attention_mask, labels=None):
        ouput = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['mask']
        labels = batch['input_labels']
        loss, outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['mask']
        labels = batch['input_labels']
        loss, outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['mask']
        labels = batch['input_labels']
        loss, outputs = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    def configure_optimizers(self):
        return AdamW(self.parameter(), lr=0.0001) 
    
    
    

In [48]:
model = EssayT5Model()

Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

AttributeError: cannot assign module before Module.__init__() call

In [None]:
checkpoint_callback = ModelCheckpoint(
    #dirpath='checkpoints',
    #filename='best-checkpoint',
    #save_top_k=1,
    #verbose=True,
    #monitor='val_loss',
    #mode='min'
)

logger = TensorBoardLogger('lightning_logs', name='essay_evaluation')

trainer = pl.Trainer(
    #logger=logger,
    #checkpoint_callback=checkpoint_callback,
    #max_epochs=N_EPOCHS,
    #gpus=1,
    #progress_bar_regresh_rate=30
)
trainer.fit(model, data_module)
    

In [None]:
trained_model = EssayT5Mode.load_from_checkpoint(
    trainer.checkpoint_callback.best_model_path
)