In [1]:
# load packages
import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5EncoderModel
from transformers import AdamW, get_linear_schedule_with_warmup
import time
import datetime
import random
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

torch.cuda.amp.autocast(enabled=True)

<torch.cuda.amp.autocast_mode.autocast at 0x7bc65b5728c0>

In [2]:
# Run in colab during first execution and restart
!pip install SentencePiece

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.0-py3-none-any.whl (230 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m230.6/230.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.0 colorlog-6.8.0 optuna-3.5.0
Collecting SentencePiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K 

In [2]:
SEED = 15
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7bc63af6d690>

In [3]:
torch.backends.cudnn.deterministic = True

# tell pytorch to use cuda
device = torch.device("cuda")

In [6]:
# Run in colab during first execution
!gdown --folder https://drive.google.com/drive/folders/1CAbb3DjrOPBNm0ozVBfhvrEh9P9rAppc

Retrieving folder list
Processing file 1e_G-9a66AryHxBOwGWhriePYCCa4_29e subtaskA_dev_monolingual.jsonl
Processing file 123UQ92LxtHaVTbNYlmjnG1CWwD-x7wDL subtaskA_dev_multilingual.jsonl
Processing file 1HeCgnLuDoUHhP-2OsTSSC3FXRLVoI6OG subtaskA_train_monolingual.jsonl
Processing file 13-9-DakCeLFbPgCiVIU0v6_BCQx0ppz6 subtaskA_train_multilingual.jsonl
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1e_G-9a66AryHxBOwGWhriePYCCa4_29e
To: /content/SubtaskA/subtaskA_dev_monolingual.jsonl
100% 10.8M/10.8M [00:00<00:00, 52.7MB/s]
Downloading...
From: https://drive.google.com/uc?id=123UQ92LxtHaVTbNYlmjnG1CWwD-x7wDL
To: /content/SubtaskA/subtaskA_dev_multilingual.jsonl
100% 21.2M/21.2M [00:00<00:00, 76.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1HeCgnLuDoUHhP-2OsTSSC3FXRLVoI6OG
To: /content/SubtaskA/subtaskA_train_monolingual.jsonl
100% 347M/347M [00:02<00:00, 121MB/s] 
Downl

In [4]:
# Load data
import pandas as pd
import json
def load_data(filename: str):
    entries = []
    with open(filename, 'r') as file:
        for line in file:
            entry = json.loads(line)
            entries.append(entry)

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(entries)

In [7]:
training_english = load_data('/content/SubtaskA/subtaskA_train_monolingual.jsonl')
dev_english = load_data('/content/SubtaskA/subtaskA_dev_monolingual.jsonl')

In [8]:
all_train_text = training_english['text'] + " </s>"

all_dev_text = dev_english['text'] + " </s>"

In [9]:
mapping = {1: '1', 0: '0'}

In [10]:
all_train_labels = training_english['label']
all_dev_labels = dev_english['label']

all_train_labels.replace(mapping, inplace=True)
all_dev_labels.replace(mapping, inplace=True)

all_train_labels = all_train_labels + " </s>"
all_dev_labels = all_dev_labels + " </s>"

In [11]:
# instantiate T5 tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# tokenize the main text
def tokenize_corpus(df, tokenizer, max_len):
    # token ID storage
    input_ids = []
    # attension mask storage
    attention_masks = []
    # max len -- 512 is max
    max_len = max_len
    # for every document:
    for doc in df:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
                            doc,  # document to encode.
                            add_special_tokens=True,  # add tokens relative to model
                            max_length=max_len,  # set max length
                            truncation=True,  # truncate longer messages
                            pad_to_max_length=True,  # add padding
                            return_attention_mask=True,  # create attn. masks
                            return_tensors='pt'  # return pytorch tensors
                       )

        # add the tokenized sentence to the list
        input_ids.append(encoded_dict['input_ids'])

        # and its attention mask (differentiates padding from non-padding)
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


# create tokenized data
train_input_ids, train_attention_masks = tokenize_corpus(all_train_text, tokenizer, 512)
target_input_ids, target_attention_masks = tokenize_corpus(all_train_labels, tokenizer, 2)



In [13]:
dev_input_ids, dev_attention_masks = tokenize_corpus(all_dev_text, tokenizer, 512)
dev_target_input_ids, dev_target_attention_masks = tokenize_corpus(all_dev_labels, tokenizer, 2)

In [14]:
train_tensor_df = TensorDataset(train_input_ids, train_attention_masks, target_input_ids, target_attention_masks)
dev_tensor_df = TensorDataset(dev_input_ids, dev_attention_masks, dev_target_input_ids, dev_target_attention_masks)

In [26]:
class CustomT5Model(torch.nn.Module):
    def __init__(self, t5_model, dropout_rate=0.1):
        super(CustomT5Model, self).__init__()
        self.t5_encoder = T5EncoderModel.from_pretrained(t5_model)
        self.global_max_pooling = torch.nn.AdaptiveMaxPool2d((1, None))
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.classifier = torch.nn.Linear(self.t5_encoder.config.hidden_size, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask, labels, decoder_attention_mask):
        outputs = self.t5_encoder(input_ids=input_ids, attention_mask=attention_mask, labels=labels, decoder_attention_mask=decoder_attention_mask)
        hidden_states = outputs.last_hidden_state
        pooled_output = self.global_max_pooling(hidden_states.transpose(1, 2)).squeeze(-1)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        probabilities = self.sigmoid(logits)
        return outputs, probabilities

    def generate(self, input_ids, attention_mask):
        outputs = self.t5_encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        pooled_output = self.t5_encoder.generate
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        ids = self.sigmoid(logits)
        return ids

In [16]:
t5_model = 't5-small'

In [27]:
model = CustomT5Model(t5_model, 0.1).cuda()

In [35]:
def train(model, dataloader, optimizer):

    # capture time
    total_t0 = time.time()

    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    print('Training...')

    # reset total loss for epoch
    train_total_loss = 0
    total_train_f1 = 0

    # put model into traning mode
    model.train()

    # for each batch of training data...
    for step, batch in enumerate(dataloader):

        b_input_ids = batch[0].cuda()
        b_input_mask = batch[1].cuda()
        b_target_ids = batch[2].cuda()
        b_target_mask = batch[3].cuda()

        # clear previously calculated gradients
        optimizer.zero_grad()

        # runs the forward pass with autocasting.
        with autocast():
            # forward propagation (evaluate model on training batch)
            outputs = model(input_ids=b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_target_ids,
                            decoder_attention_mask=b_target_mask)

            loss, prediction_scores = outputs[:2]

            # sum the training loss over all batches for average loss at end
            train_total_loss += loss.item()

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # calculate the average loss over all of the batches
    avg_train_loss = train_total_loss / len(dataloader)

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'Train Loss': avg_train_loss
        }
    )

    # training time end
    training_time = format_time(time.time() - total_t0)

    # print result summaries
    print("")
    print("summary results")
    print("epoch | trn loss | trn time ")
    print(f"{epoch+1:5d} | {avg_train_loss:.5f} | {training_time:}")

    return training_stats

# time function
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [36]:
#model = T5ForConditionalGeneration.from_pretrained('t5-small').cuda()


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [37]:
optimizer = AdamW(model.parameters(),lr = 3e-5)

In [38]:
epochs = 3
scaler = GradScaler()

In [39]:
#Updating to avoid size issues due to batch size 
duplicate_train_tensor_df = TensorDataset(train_input_ids[:-1], train_attention_masks[:-1], target_input_ids[:-1], target_attention_masks[:-1])

In [40]:
def create_batches(dataset, batch_size=32, shuffle=True):
    # Create a DataLoader from the TensorDataset
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader

In [41]:
batch_size = 8
shuffle_data = True  # Set to True to shuffle the data while batching
train_batches = create_batches(duplicate_train_tensor_df, batch_size=batch_size, shuffle=shuffle_data)

In [42]:
dev_batches = create_batches(dev_tensor_df, batch_size=batch_size, shuffle=shuffle_data)

In [43]:
# create training result storage
training_stats = []
valid_stats = []
best_valid_loss = float('inf')

# for each epoch
for epoch in range(epochs):
    train(model, train_batches, optimizer)

model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained('./model_save/t5-classification/')  # transformers save
tokenizer.save_pretrained('./model_save/t5-classification/')  # transformers save


Training...

summary results
epoch | trn loss | trn time 
    1 | 0.11977 | 0:40:02

Training...

summary results
epoch | trn loss | trn time 
    2 | 0.04040 | 0:39:43

Training...

summary results
epoch | trn loss | trn time 
    3 | 0.02981 | 0:39:38


('./model_save/t5-classification/tokenizer_config.json',
 './model_save/t5-classification/special_tokens_map.json',
 './model_save/t5-classification/spiece.model',
 './model_save/t5-classification/added_tokens.json')

In [48]:
test_stats = []
def testing(model, dataloader):

    print("")
    print("Running Testing...")

    # measure training time
    t0 = time.time()

    # put the model in evaluation mode
    model.eval()

    # track variables

    total_test_loss = 0
    total_test_acc = 0
    total_test_f1 = 0
    predictions = []
    actuals = []

    # evaluate data for one epoch
    for step, batch in enumerate(dataloader):
        # progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(dataloader), elapsed))

        b_input_ids = batch[0].cuda()
        b_input_mask = batch[1].cuda()
        b_target_ids = batch[2].cuda()
        b_target_mask = batch[3].cuda()

        with torch.no_grad():

            generated_ids = model.generate(
                    input_ids=b_input_ids,
                    attention_mask=b_input_mask,
                    max_length=3
                    )

            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in b_target_ids]

            total_test_acc += accuracy_score(target, preds)
            total_test_f1 += f1_score(preds, target,
                                       average='weighted',
                                       labels=np.unique(preds))

            predictions.extend(preds)
            actuals.extend(target)

    avg_test_acc = total_test_acc / len(dataloader)

    avg_test_f1 = total_test_f1 / len(dataloader)

    # Record all statistics from this epoch.
    test_stats.append(
        {
            'Test Acc.': avg_test_acc,
            'Test F1': avg_test_f1,
        }
    )

    return test_stats

In [49]:
testing(model, dev_batches)


Running Testing...
  Batch    40  of    625.    Elapsed: 0:00:05.
  Batch    80  of    625.    Elapsed: 0:00:10.
  Batch   120  of    625.    Elapsed: 0:00:14.
  Batch   160  of    625.    Elapsed: 0:00:19.
  Batch   200  of    625.    Elapsed: 0:00:23.
  Batch   240  of    625.    Elapsed: 0:00:27.
  Batch   280  of    625.    Elapsed: 0:00:32.
  Batch   320  of    625.    Elapsed: 0:00:37.
  Batch   360  of    625.    Elapsed: 0:00:42.
  Batch   400  of    625.    Elapsed: 0:00:47.
  Batch   440  of    625.    Elapsed: 0:00:51.
  Batch   480  of    625.    Elapsed: 0:00:56.
  Batch   520  of    625.    Elapsed: 0:01:01.
  Batch   560  of    625.    Elapsed: 0:01:07.
  Batch   600  of    625.    Elapsed: 0:01:15.


[{'Test Acc.': 0.7294, 'Test F1': 0.7464598867798865}]