In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset

from accelerate import Accelerator

import torch
from torch.utils.data import DataLoader

import numpy as np

import os
import sys
sys.path.append(os.getcwd()+"/../..")

from src import paths

import tqdm

import evaluate 

In [2]:
# Load data
data_files = {"train": "ms-diag_clean_train.csv", "validation": "ms-diag_clean_val.csv", "test": "ms-diag_clean_test.csv"}
df = load_dataset(os.path.join(paths.DATA_PATH_PREPROCESSED,'ms-diag'), data_files = data_files)

# Number of labels
num_labels = len(set(df['train']['labels']))

# Label to id
label2id = {label: i for i, label in enumerate(set(df['train']['labels']))}
id2label = {i: label for i, label in enumerate(set(df['train']['labels']))}

In [3]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(paths.MODEL_PATH/'medbert')

# Load model
model = AutoModelForSequenceClassification.from_pretrained(paths.MODEL_PATH/'medbert', num_labels=num_labels)
model.load_state_dict(torch.load(paths.MODEL_PATH/'ms_diag_medbert.pt'))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /mnt/c/Users/marc_/OneDrive/ETH/MSC_Thesis/inf-extr/resources/models/medbert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [4]:
def prepare_data(data):
    
    # Label
    data['labels'] = [label2id[label] for label in data['labels']]

    # Tokenize
    data = tokenizer(data['text'], padding=True, truncation=True, return_tensors='pt')

    return data

# Tokenize dataset
dataset = df.map(prepare_data, batched=True, remove_columns=['rid', 'text', 'date'], batch_size=512)

Map:   0%|          | 0/143 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

In [11]:
BATCH_SIZE = 16

# Collator
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors='pt')

# Dataloader
# train_loader = DataLoader(dataset['train'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
val_loader = DataLoader(dataset['validation'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False)
test_loader = DataLoader(dataset['test'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False)

# Accelerator
accelerator = Accelerator(mixed_precision='fp16')

f1_metric = evaluate.load("f1")

# Prepare with accelerator
model, test_loader, val_loader = accelerator.prepare(
    model, test_loader, val_loader
)

In [12]:
# Predict
for batch in tqdm.tqdm(val_loader):
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        f1_metric.add_batch(predictions=predictions, references=batch['labels'])

f1_score = f1_metric.compute(average='weighted')
print("F1", f1_score)
        

100%|██████████| 1/1 [00:01<00:00,  1.72s/it]

F1 {'f1': 0.041666666666666664}



