In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
from datasets import load_dataset
import torch
from torch.nn.functional import cross_entropy
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import logging
from torcheval.metrics.functional import multiclass_f1_score, multiclass_confusion_matrix, binary_f1_score
from copy import deepcopy, copy
import seaborn as sns
import pandas as pd
from huggingface_hub import notebook_login
from torch.utils.data import DataLoader, Dataset
from collections import defaultdict, deque
from sklearn.model_selection import train_test_split
import random
from sentence_transformers import SentenceTransformer, InputExample, losses, evaluation

import os 
while 'notebooks' in os.getcwd():
    os.chdir("..")

import re
from typing import List, Dict
    
from src.preprocessing.sequence_classification.dataset import get_n_shots_per_class
from src.model.mlm.pet_for_text_classification import train 
import warnings
warnings.filterwarnings("ignore")

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
!mkdir logs
model_name = "PET"
dataset_name = "RAFT-Ade-corpus"
logging.basicConfig(filename=f'logs/{model_name}_{dataset_name}.log', encoding='utf-8', level= logging.INFO)
device = "cuda" if torch.cuda.is_available() else "cpu"

mkdir: cannot create directory ‘logs’: File exists


## Loading dataset

In [4]:
ade_corpus_dataset = load_dataset(
    "ought/raft", 
    name="ade_corpus_v2" , 
    cache_dir= "/Data/pedro.silva"
)

In [5]:
train_dataset = ade_corpus_dataset['train'].add_column(
    "labels",
    list(map(lambda x: 0 if x == 2 else 1, ade_corpus_dataset['train']['Label']))
)

test_dataset = ade_corpus_dataset['test'].add_column(
    "labels",
    list(map(lambda x: 0 if x == 2 else 1, ade_corpus_dataset['test']['Label']))
)

In [6]:
class_counts = np.bincount(train_dataset['labels'])
num_classes = len(class_counts)
total_samples = len(train_dataset['labels'])

class_weights = []
for count in class_counts:
    weight = 1 / (count / total_samples)
    class_weights.append(weight)


In [7]:
class_weights, class_counts

([1.4285714285714286, 3.3333333333333335], array([35, 15]))

In [8]:
train_size = 0.7
def stratified_train_test_split(
    dataset : Dataset,
    classes : np.ndarray,
    train_size : float
):
    """Performs train test split keeping class distributions

    Args:
        dataset (Dataset): _description_
        classes (np.ndarray): _description_
        train_size (float): _description_

    Returns:
        _type_: _description_
    """    

    indexes_dict = {}
    for label in classes[0]:
        indexes_dict[label] = []

    for i in range(len(dataset['labels'])):
        label = dataset['labels'][i]
        text = dataset['Sentence'][i]
        indexes_dict[label].append(text)


    train_data = {
        'labels': [],
        'text': []
    }

    validation_data = {
        "labels" : [],
        "text": []
    }

    # generating train data
    for label in classes[0]:
        n = len(indexes_dict[label])
        size = int(train_size * n)

        train_data['text'] += indexes_dict[label][:size]
        train_data['labels'] += [label]*size
        
        validation_data['text'] +=indexes_dict[label][size:]
        validation_data['labels'] += [label]* (n-size)

    return train_data, validation_data

In [9]:
classes = np.unique(train_dataset['labels'], return_counts=True)
train_data, val_data = stratified_train_test_split(
    train_dataset,
    classes,
    0.6
)

In [10]:
def pattern1(text: str, tokenizer : AutoTokenizer):
    return f"{text} Are any there adverse drug effects ? {tokenizer.mask_token}"

In [11]:

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large",
                                          cache_dir = "/Data/pedro.silva")
idx2class = {
    0: "no",
    1: "yes"
}

class2idx = {
    "no": 0,
    "yes": 1
}

class_names = ['no', 'yes']

verbalizer = {idx : tokenizer.vocab[x.lower()] for idx, x in enumerate(class_names)}
inverse_verbalizer = {tokenizer.vocab[x.lower()] : idx for idx, x in enumerate(class_names)}

In [12]:
verbalizer

{0: 2362, 1: 10932}

In [13]:

def preprocess(text : List[str], labels : List[int]):

    processed_text = []
    processed_labels = []
    for idx in range(len(text)):
        label = idx2class[labels[idx]]
        text_ = text[idx]

        processed_text.append(pattern1(text_, tokenizer))
        processed_labels.append(label)

    return processed_text, processed_labels

In [14]:
train_text, train_labels = preprocess(train_data['text'], train_data['labels'])
val_text, val_labels = preprocess(val_data['text'], val_data['labels'])

In [15]:
train_text

['No regional side effects were noted. Are any there adverse drug effects ? <mask>',
 'We describe the case of a 10-year-old girl with two epileptic seizures and subcontinuous spike-waves during sleep, who presented unusual side-effects related to clobazam (CLB) monotherapy. Are any there adverse drug effects ? <mask>',
 'The INR should be monitored more frequently when bosentan is initiated, adjusted, or discontinued in patients taking warfarin. Are any there adverse drug effects ? <mask>',
 'As termination was not an option for the family, the patient was extensively counseled and treated with oral ganciclovir. Are any there adverse drug effects ? <mask>',
 'Pulses have been given for periods up to three years without evident toxicity. Are any there adverse drug effects ? <mask>',
 'CONCLUSION: Pancreatic enzyme intolerance, although rare, would be a major problem in the management of patients with CF. Are any there adverse drug effects ? <mask>',
 'The treatment of Toxoplasma enceph

In [16]:
train_labels

['no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes',
 'yes']

In [17]:
class PETDatasetForClassification(Dataset):
    def __init__(
        self, 
        processed_text : List[str], 
        labels : List[int],
        tokenizer : AutoTokenizer,
        device : str = "cuda"
    ) -> None:
        super().__init__()

        self.tokens = tokenizer(
            processed_text,
            return_tensors="pt",
            padding="max_length",
            truncation=True
        )
        self.encoded_labels = deepcopy(self.tokens['input_ids'])
        
        self.encoded_labels[self.encoded_labels != tokenizer.mask_token_id] = -100

        for idx, sentence in enumerate(self.encoded_labels):
            sentence[sentence == tokenizer.mask_token_id] = tokenizer.vocab[labels[idx].lower()]

        self.inputs : Dict[str, torch.Tensor] = self.tokens
        self.inputs['labels'] = self.encoded_labels

        for k,v in self.inputs.items():
            self.inputs[k] = v.to(device)

    def __getitem__(self, index):
        d : Dict = dict()
        for key in self.inputs.keys():
            d[key] = self.inputs[key][index]

        return d
    
    def __len__(self):
        return self.tokens['input_ids'].shape[0]

In [18]:
train_dataset = PETDatasetForClassification(
    train_text,
    train_labels,
    tokenizer
)

val_dataset = PETDatasetForClassification(
    val_text,
    val_labels,
    tokenizer
)

In [19]:
train_dataloader = DataLoader(train_dataset, shuffle=True)
val_dataloader =DataLoader(val_dataset)

In [20]:
from src.utils import FocalLoss

In [21]:
classes

(array([0, 1]), array([35, 15]))

In [25]:
model = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-large",
                                             cache_dir = "/Data/pedro.silva").to(device)
history, confusion_matrix, best_model = train(
    train_dataloader,
    val_dataloader,
    num_classes = 2,
    model = model,
    verbalizer = verbalizer,
    tokenizer=tokenizer,
    alpha = 1e-4,
    evaluation_fn= binary_f1_score,
    loss_fn=FocalLoss(class_weights, gamma=2),
    n_epochs=30
)

f1-score : 0.0
tensor([[14,  0],
        [ 6,  0]], device='cuda:0')


KeyboardInterrupt: 