In [None]:
!pip install huggingface-hub
!pip install transformers==4.36
!pip install datasets evaluate
!pip install accelerate -U

In [None]:
import pickle
import os
import pandas as pd
from datasets import Dataset
import json
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.metrics import confusion_matrix, classification_report

def import_data(importDirectory, train_or_valid, spacy_tokenizer_name):
    """Load every matching pickle file found directly inside a directory.

    A file is loaded only when all of the following hold:
      * its name contains at least one '.' and the text between the first and
        second '.' is exactly ``pkl``;
      * the text before the first '_' in its name equals ``train_or_valid``;
      * ``spacy_tokenizer_name`` occurs somewhere in its name.

    Args:
        importDirectory: Directory to scan (not recursive).
        train_or_valid: Required prefix of the file name, up to the first '_'.
        spacy_tokenizer_name: Substring that must appear in the file name.

    Returns:
        A list with one dict per loaded file, ordered by file name:
        ``{"data": <unpickled object>, "foldername": <file name up to the first '.'>}``.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
    """
    export_list = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the order of
    # the loaded files (and therefore of every row built from them) reproducible.
    for file in sorted(os.listdir(importDirectory)):
        filenameArr = file.split('.')
        if len(filenameArr) < 2 or filenameArr[1] != 'pkl':
            continue
        if file.split('_')[0] != train_or_valid:
            continue
        if spacy_tokenizer_name not in file:
            continue

        file_path = importDirectory + '/' + file
        print(file_path)
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # directories whose contents you produced yourself.
        with open(file_path, 'rb') as f:
            data = pickle.load(f)
        export_list.append({"data": data, "foldername": filenameArr[0]})
    return export_list

In [None]:
# TODO: set this to the path of the folder containing the output of BRAT_Parser.
# The value below is only a placeholder so that the cell parses.
import_path = "path/to/BRAT_Parser/output"

#Data is a list with a length equal to the number of folders that were iterated through.
#data[n] is a list of length x, where x is the number of files present in the nth folder.
#data[n][x] is a dictionary, with keys 'ADE_strings', 'noADE_strings', 'num_Multi_Token_ADE_Relatons


#Will work with data via Hugging Face Datasets library
#Should be a dataset dict: {"train": ["string", "label", "idx"], "verification":["string", "label", "idx"], "test":[...
#First, cast it to a pandas df of shape ["label"]["string" ....] 
def load_data(import_path, train_or_valid, spacy_tokenizer_name):
    """Build a labelled sentence DataFrame from the pickles found by import_data().

    For every loaded pickle, the records in ``data[0]`` are scanned and the
    ``'string'`` of each entry under ``'ADE_strings'`` / ``'noADE_strings'`` is
    collected together with the name of the file it came from.

    Args:
        import_path: Directory handed to import_data().
        train_or_valid: File-name prefix handed to import_data().
        spacy_tokenizer_name: File-name substring handed to import_data().

    Returns:
        A DataFrame with the columns, in this order:
          * ``string``   - the sentence text; all ADE sentences first, then all
            noADE sentences;
          * ``filename`` - second '_'-separated token of the pickle's base name;
          * ``label``    - 1 for ADE sentences, 0 for noADE sentences (integers).
    """
    data = import_data(import_path, train_or_valid, spacy_tokenizer_name)

    # File names are tracked per class so that, after the two classes are
    # concatenated, row i of 'filename' still belongs to row i of 'string'.
    ade_strings, ade_filenames = [], []
    no_ade_strings, no_ade_filenames = [], []

    for entry in data:
        foldername = entry['foldername'].split('_')[1]
        for record in entry['data'][0]:
            # `or []` tolerates records in which one of the two keys is missing.
            for item in record.get('ADE_strings') or []:
                ade_strings.append(item.get('string'))
                ade_filenames.append(foldername)
            for item in record.get('noADE_strings') or []:
                no_ade_strings.append(item.get('string'))
                no_ade_filenames.append(foldername)

    dataset = pd.DataFrame({
        'string': ade_strings + no_ade_strings,
        'filename': ade_filenames + no_ade_filenames,
        # 1 = ADE, 0 = noADE; built directly as integers.
        'label': [1] * len(ade_strings) + [0] * len(no_ade_strings),
    })
    return dataset

#note: train2/dev2 names represent files that only treat the Problem events in ADE tags as ADE sentences, not both Problem AND Drug

train_dataframe = load_data(import_path, 'train3', "en_core_sci_md")
# Labels must be integers; without this cast training fails with a dimensional mismatch:
# https://discuss.huggingface.co/t/valueerror-target-size-torch-size-8-must-be-the-same-as-input-size-torch-size-8-8/12133/2
train_dataframe['label'] = train_dataframe['label'].astype(int)
# Positional row id, kept so that predictions can be traced back to their row later on.
train_dataframe['Row_Number'] = train_dataframe.reset_index().index

valid_dataframe = load_data(import_path, 'test3', "en_core_sci_md")
valid_dataframe['label'] = valid_dataframe['label'].astype(int)
valid_dataframe['Row_Number'] = valid_dataframe.reset_index().index

display(train_dataframe)
display(valid_dataframe)

# value_counts() has no entry for a class that never occurs, so default to 0
# instead of printing None.
train_label_counts = train_dataframe['label'].value_counts()
print("Total length of train dataframe:")
print(len(train_dataframe))
print("Length of class 1, ADE")
print(train_label_counts.get(1, 0))
print("Length of  class 2, noADE")
print(train_label_counts.get(0, 0))


valid_label_counts = valid_dataframe['label'].value_counts()
print("Total length of test dataframe:")
print(len(valid_dataframe))
print("Length of class 1, ADE")
print(valid_label_counts.get(1, 0))
print("Length of  class 2, noADE")
print(valid_label_counts.get(0, 0))

# Pick the examples by label rather than by a hard-coded row position, so the
# cell keeps working (and keeps showing the right class) for any dataset size.
ade_example_strings = train_dataframe.loc[train_dataframe['label'] == 1, 'string']
no_ade_example_strings = train_dataframe.loc[train_dataframe['label'] == 0, 'string']

print("\n\nADE Example")
print(ade_example_strings.iloc[0] if len(ade_example_strings) else None)
print('--------')
print("noADE Example")
print(no_ade_example_strings.iloc[0] if len(no_ade_example_strings) else None)

In [None]:
# Row counts per class for the train and test splits.
# `df` is rebound at every step and ends up holding the noADE rows of the test split.
print("Total length of train dataframe")
print(train_dataframe.shape[0])

print("Total length of ADE sentences in train dataframe")
df = train_dataframe.loc[train_dataframe['label'] == 1]
print(df.shape[0])

print("Total length of noADE sentences in train dataframe")
df = train_dataframe.loc[train_dataframe['label'] == 0]
print(df.shape[0])

print("----------------")

print("Total length of test dataframe")
print(valid_dataframe.shape[0])

print("Total length of ADE sentences in test dataframe")
df = valid_dataframe.loc[valid_dataframe['label'] == 1]
print(df.shape[0])

print("Total length of noADE sentences in test dataframe")
df = valid_dataframe.loc[valid_dataframe['label'] == 0]
print(df.shape[0])

In [None]:
# Shows whatever `df` is currently bound to. When the notebook is run top to
# bottom that is the last assignment of the previous cell: the label == 0 rows
# of valid_dataframe.
display(df)


In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModel, DataCollatorForLanguageModeling, T5ForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# TODO: set this to the path of a locally saved instance of T5-large.
# The value below is only a placeholder so that the cell parses.
model_path = "path/to/t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_path)

#Using AutoModel results in the error "forward got unexpected kwarg: labels".
#A generic base model does not expect labels in the forward method (BERT example: https://github.com/huggingface/transformers/blob/7d9a33fb5cf40a87ff7fa9b4b8556b9bd4760461/src/transformers/models/bert/modeling_bert.py#L189)
#The sequence-classification class does, so that is what is loaded here.
model = T5ForSequenceClassification.from_pretrained(model_path, num_labels=2)

In [None]:
def tokenize_function(dataset):
    """Run the module-level ``tokenizer`` over the "string" entry of ``dataset``.

    The tokenizer is called with padding='max_length', truncation=True and
    max_length=512; its result is returned unchanged.
    """
    texts = dataset["string"]
    tokenizer_options = {"padding": 'max_length', "truncation": True, "max_length": 512}
    return tokenizer(texts, **tokenizer_options)

# Wrap both pandas frames as Hugging Face Datasets (train first, then validation).
train_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (train_dataframe, valid_dataframe)
)

# Tokenize each split in batches with tokenize_function.
tokenized_train_datasets, tokenized_valid_datasets = (
    split.map(tokenize_function, batched=True) for split in (train_dataset, valid_dataset)
)

In [None]:
# Drop the raw text, rename "label" to "labels" (the key the training and
# evaluation loops read from each batch) and make the dataset return torch tensors.
tokenized_train_datasets = (
    tokenized_train_datasets
    .remove_columns(["string"])
    .rename_column("label", "labels")
)
tokenized_train_datasets.set_format("torch")

# Same preparation for the validation split.
tokenized_valid_datasets = (
    tokenized_valid_datasets
    .remove_columns(["string"])
    .rename_column("label", "labels")
)
tokenized_valid_datasets.set_format("torch")

In [None]:
from torch.utils.data import DataLoader

# Training batches are drawn in a freshly shuffled order.
train_dataloader = DataLoader(tokenized_train_datasets, shuffle=True, batch_size=8)

# Evaluation gains nothing from shuffling. Iterating in dataset order keeps the
# collected predictions, file names and row numbers in the same order on every run.
valid_dataloader = DataLoader(tokenized_valid_datasets, shuffle=False, batch_size=8)

In [None]:
# Move the model to the selected device. The trailing ';' suppresses the cell's output.
model = model.to(device);
##NOTE: the model must be moved before the optimizer is initialised (https://stackoverflow.com/questions/66091226/runtimeerror-expected-all-tensors-to-be-on-the-same-device-but-found-at-least)

In [None]:
# Disabled scratch/debug snippets kept as a bare string literal: nothing inside
# the quotes is executed, and the trailing ';' suppresses the echo of the string.
"""
print(tokenized_train_datasets)
print((tokenized_train_datasets['labels'][0]))
print(type(tokenized_train_datasets['input_ids'][0]))
print(type(tokenized_train_datasets['token_type_ids'][0]))
print(type(tokenized_train_datasets['attention_mask'][0]))

for i in tokenized_train_datasets['labels']:
    print(type(i))
    break

print(tokenized_train_datasets)
print(tokenized_train_datasets['input_ids'].to(device))

for i in tokenized_train_datasets['input_ids']:
    print(tokenizer.decode(i))
    break

for i in range(torch.cuda.device_count()):
    print(torch.cuda.get_device_properties(i).name)
""";

In [None]:
# True: fine-tune the model in this cell. False: load a previously saved model instead.
retrain = False


optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 5
num_training_steps = num_epochs * len(train_dataloader)

# Linear learning-rate schedule over all training steps, without warm-up.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

if retrain:
    progress_bar = tqdm(range(num_training_steps))
    loss_values = []  # one entry per optimisation step, not per epoch
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        true_labels = []
        predicted_labels = []

        for batch in train_dataloader:
            # Only the tensors the model consumes are moved to the device; the
            # batch also carries other columns (e.g. 'filename') that are not.
            labels = batch['labels'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(labels=labels, input_ids=input_ids, attention_mask=attention_mask)
            loss = outputs.loss

            step_loss = loss.item()
            loss_values.append(step_loss)
            total_loss += step_loss

            predictions = torch.argmax(outputs.logits, dim=1)
            true_labels.extend(labels.cpu().numpy())
            predicted_labels.extend(predictions.cpu().numpy())

            # Update the weights first and advance the schedule afterwards;
            # stepping the scheduler before the optimizer skips the first
            # learning-rate value of the schedule.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            progress_bar.update(1)

        average_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")

        cm = classification_report(true_labels, predicted_labels)
        print("Classification Report:")
        print(cm)
    # Disabled plotting snippet. NOTE: loss_values holds one value per step, so
    # the x values below would have to be adapted before re-enabling it.
    """
    plt.plot(range(1, epoch + 1), loss_values, label='Training Loss')
    plt.title('Training Loss Curve')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()
    """
else:
    # TODO: set this to the path of a locally saved T5-large model.
    # The value below is only a placeholder so that the cell parses.
    saved_model_path = "path/to/saved-t5-large"
    model = model.from_pretrained(saved_model_path)
    # Use the device selected earlier instead of a hard-coded 'cuda:0', so the
    # cell also runs on CPU-only machines.
    model.to(device)

In [None]:
if retrain:
    # Save the fine-tuned model so that a later run can load it instead of retraining.
    # TODO: set this to the directory the T5-large model should be saved to locally.
    # The value below is only a placeholder so that the cell parses.
    model_save_path = "path/to/saved-t5-large"
    # NOTE(review): the original call ended in `from_pt=True`. That looks like a
    # from_pretrained() loading flag rather than a save_pretrained() option, so
    # it is not passed here - confirm.
    model.save_pretrained(model_save_path)


In [None]:
import evaluate

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

model.eval()
all_predictions = []
all_references = []
all_filenames = []
all_input_ids = []
all_row_numbers = []

for batch in valid_dataloader:
    # Individually move everything the model consumes to the device; the
    # 'filename' and 'Row_Number' entries of the batch are left where they are.
    labels = batch['labels'].to(device)
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(labels=labels, input_ids=input_ids, attention_mask=attention_mask)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    clf_metrics.add_batch(predictions=predictions, references=labels)
    all_predictions.extend(predictions.cpu().numpy())
    all_references.extend(labels.cpu().numpy())
    all_filenames.extend(batch["filename"])
    # Store host copies of the token ids: keeping device tensors here would pin
    # accelerator memory for the whole run and end up in the exported pickle.
    all_input_ids.extend(input_ids.cpu())
    all_row_numbers.extend(batch['Row_Number'])

# Everything collected per evaluated row, in iteration order.
result_dict = {
    "predictions": all_predictions,
    "references": all_references,
    "filenames": all_filenames,
    'input_ids': all_input_ids,
    'row_numbers': all_row_numbers
}

cr = classification_report(all_references, all_predictions, digits=3)
print(cr)


clf_metrics.compute()

In [None]:
# Tally the predicted classes: count1 counts predictions equal to 0, count2
# counts predictions equal to 1; any other value is printed as it is met.
print(len(all_predictions))
count1 = count2 = 0
for i in all_predictions:
    if i == 1:
        count2 += 1
    elif i == 0:
        count1 += 1
    else:
        print(i)
print(count1)
print(count2)

In [None]:
import pandas as pd

# One row per evaluated sentence: source file, predicted label and reference label.
df = pd.DataFrame({
    'filename': result_dict["filenames"],
    'prediction': result_dict["predictions"],
    'reference': result_dict["references"]
})

# Identify correct and incorrect predictions
df['correct'] = (df['prediction'] == df['reference'])

# Per file: number of correct predictions and total number of predictions.
# Aggregating this way (rather than unstacking value_counts) also works when
# every prediction is correct, or every prediction is incorrect, in which case
# only one of the two count columns would exist.
summary_df = (
    df.groupby('filename')['correct']
    .agg(correct_predictions='sum', total_predicts='count')
    .reset_index()
)
summary_df['correct_predictions'] = summary_df['correct_predictions'].astype(int)
summary_df['incorrect_predictions'] = (
    summary_df['total_predicts'] - summary_df['correct_predictions']
)

# Same column order as before: filename, incorrect, correct, total.
summary_df = summary_df[
    ['filename', 'incorrect_predictions', 'correct_predictions', 'total_predicts']
]

# Print the DataFrame
print(summary_df)


In [None]:
# Turn every stored token-id sequence back into text (special tokens removed)
# and attach the result to result_dict.
decoded_sentences = []
for token_ids in result_dict['input_ids']:
    decoded_sentences.append(tokenizer.decode(token_ids, skip_special_tokens=True))
result_dict['decoded_sentences'] = decoded_sentences

def export_data(data, path):
    """Pickle ``data`` to the file ``path + ".pkl"``.

    The file is created if it does not exist and overwritten if it does.

    Args:
        data: Any picklable object.
        path: Target path without the ".pkl" extension.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # "wb" already creates/truncates the file, so no separate text-mode open is
    # needed; `with` closes the handle even if pickling fails.
    with open(path + ".pkl", "wb") as f:
        pickle.dump(data, f)

# Pickle result_dict; export_data appends ".pkl", and the path is relative, so
# the file is written relative to the current working directory.
export_data(result_dict, "PT_T5_Classifier_results_dict_TEST_SET_v1")