In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, DataCollatorWithPadding, get_scheduler
from datasets import load_dataset, concatenate_datasets, Dataset, ClassLabel

from accelerate import Accelerator

import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

import numpy as np
import pandas as pd 

import os
import sys
sys.path.append(os.getcwd()+"/../..")

from src import paths

import tqdm

import evaluate 

from sklearn.metrics import f1_score, precision_score, recall_score, ConfusionMatrixDisplay


In [2]:
def get_artifical_data_for_label(label:str):
    label_dict = {
        "rrms": "relapsing_remitting_multiple_sclerosis",
        "ppms": "primary_progressive_multiple_sclerosis",
        "spms": "secondary_progressive_multiple_sclerosis"
    }
    generated_data = pd.read_csv(paths.DATA_PATH_PREPROCESSED/f'ms-diag/artificial_{label}.csv')
    generated_data["labels"] = label_dict[label]
    generated_data = generated_data[["0", "labels"]].rename(columns = {"0":"text"})

    return generated_data

def get_artifical_data_all():
    artifical_data = []
    for label in ["rrms", "ppms", "spms"]:
        try: 
            artifical_data.append(get_artifical_data_for_label(label))
        except:
            print(f"Could not find data for {label}")
    artifical_data = pd.concat(artifical_data)
    artifical_data = Dataset.from_pandas(artifical_data).remove_columns('__index_level_0__')
    return artifical_data

In [3]:
# # Load data
data_files = {"train": "ms-diag_clean_train.csv", "validation": "ms-diag_clean_val.csv", "test": "ms-diag_clean_test.csv", "augmented": "ms-diag_augmented.csv"}
df = load_dataset(os.path.join(paths.DATA_PATH_PREPROCESSED,'ms-diag'), data_files = data_files)
df_augmented = df["augmented"]
# df["train"] = concatenate_datasets([df["augmented"], df["train"]])
# Load corrected data
df = load_dataset("csv", data_files = os.path.join(paths.DATA_PATH_PREPROCESSED, "ms-diag/ms-diag_content_annotated.csv"))
df = concatenate_datasets([df["train"], df_augmented])
new_features = df.features.copy()
new_features["labels"] = ClassLabel(names=["primary_progressive_multiple_sclerosis", "relapsing_remitting_multiple_sclerosis", "secondary_progressive_multiple_sclerosis"])
df = df.cast(new_features)
df= df.filter(lambda x: x["contains_dm"] == True)


df = df.train_test_split(test_size=0.1, seed=42, stratify_by_column="labels"
                         )

# Number of labels
num_labels = len(set(df['train']['labels']))

# Label to id
label2id = {'primary_progressive_multiple_sclerosis': 0,
            'relapsing_remitting_multiple_sclerosis': 1,
            'secondary_progressive_multiple_sclerosis': 2}
id2label = {v:k for k,v in label2id.items()}

In [4]:
# # Run this cell if you want to download and fine-tune the model
# from huggingface_hub import notebook_login

# # Login to Hugging Face Hub as model is gated
# notebook_login()

# # Checkpoint
# checkpoint = "GerMedBERT/medbert-512"

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# # Save tokenizer
# tokenizer.save_pretrained(paths.MODEL_PATH/'medbert')

# # Load model for embedding
# model = AutoModel.from_pretrained(checkpoint)

# # Save model
# model.save_pretrained(paths.MODEL_PATH/'medbert')

In [5]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(paths.MODEL_PATH/'medbert')

# Load model
model = AutoModelForSequenceClassification.from_pretrained(paths.MODEL_PATH/'medbert', num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /mnt/c/Users/marc_/OneDrive/ETH/MSC_Thesis/inf-extr/resources/models/medbert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def prepare_data(data):
    
    # Label
    # data['labels'] = [label2id[label] for label in data['labels']]

    # Tokenize
    # data["text"] = [text[:256] for text in data["text"]]
    data = tokenizer(data['text'][:256], padding=True, truncation=True, return_tensors='pt', max_length = 512)

    return data

# Tokenize dataset
dataset = df.map(prepare_data, batched=True, batch_size=512, remove_columns=list(df['train'].features.keys())[1:])

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [7]:
# Training arguments
EPOCHS = 12
BATCH_SIZE = 2
TRAIN_STEPS = EPOCHS * len(dataset['train']) // BATCH_SIZE
NUM_GRADIENT_ACCUMULATION_STEPS = 4
LEARNING_RATE = 1e-5

# Collator
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding=True, return_tensors='pt')

# Dataloader
train_loader = DataLoader(dataset['train'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=True)
# val_loader = DataLoader(dataset['validation'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False)
test_loader = DataLoader(dataset['test'], batch_size=BATCH_SIZE, collate_fn=collate_fn, shuffle=False)

# Accelerator
accelerator = Accelerator(mixed_precision='fp16')

# Optimizer and scheduler
optim = AdamW(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = get_scheduler(
    "linear",
    optim,
    num_warmup_steps=0,
    num_training_steps=TRAIN_STEPS
)

# Prepare with accelerator #I Removed val loader
model, optim, train_loader, test_loader = accelerator.prepare(
    model, optim, train_loader, test_loader
)

In [8]:
# Training
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
progress_bar = tqdm.tqdm(range(TRAIN_STEPS))

for epoch in range(EPOCHS):
    model.train()
    for step, batch in enumerate(train_loader):
        optim.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss/NUM_GRADIENT_ACCUMULATION_STEPS
        accelerator.backward(loss)
        if step % NUM_GRADIENT_ACCUMULATION_STEPS == 0:
            optim.step()
            lr_scheduler.step()
        progress_bar.update(1)

    model.eval()
    with torch.no_grad():
        preds = []
        labels = []
        val_loss = 0
        acc = 0
        for batch in test_loader:
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            
            val_loss += outputs.loss
            acc += (predictions == batch['labels']).sum().item()
            
            preds.extend(predictions.tolist())
            labels.extend(batch['labels'].tolist())

    f1 = f1_score(labels, preds, average='macro')
    # if epoch == 0:
    #         min_val_loss = val_loss
    #         largest_f1 = f1
    # elif (val_loss < min_val_loss):
    #     min_val_loss = val_loss
    #     model.save_pretrained(paths.MODEL_PATH/'ms_diag_medbert_valloss')
    # elif (largest_f1 < f1):
    #     largest_f1 = f1
    #     model.save_pretrained(paths.MODEL_PATH/'ms_diag_medbert_f1')

    print(f"Epoch {epoch+1}: F1 score: {f1} Loss: {val_loss/len(test_loader)} Accuracy: {acc/len(dataset['test'])}")

  0%|          | 0/276 [00:00<?, ?it/s]You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  9%|▊         | 24/276 [00:04<00:53,  4.74it/s]

Epoch 1: F1 score: 0.3571428571428572 Loss: 1.0400390625 Accuracy: 0.5


 17%|█▋        | 47/276 [00:07<00:48,  4.77it/s]

Epoch 2: F1 score: 0.2222222222222222 Loss: 1.0548503398895264 Accuracy: 0.5


 25%|██▌       | 70/276 [00:11<00:43,  4.72it/s]

Epoch 3: F1 score: 0.2222222222222222 Loss: 1.0928955078125 Accuracy: 0.5


 34%|███▎      | 93/276 [00:15<00:38,  4.73it/s]

Epoch 4: F1 score: 0.2222222222222222 Loss: 1.1230876445770264 Accuracy: 0.5


 42%|████▏     | 116/276 [00:18<00:34,  4.68it/s]

Epoch 5: F1 score: 0.2222222222222222 Loss: 1.121337890625 Accuracy: 0.5


 50%|█████     | 139/276 [00:22<00:29,  4.64it/s]

Epoch 6: F1 score: 0.2222222222222222 Loss: 1.0657145977020264 Accuracy: 0.5


 59%|█████▊    | 162/276 [00:26<00:24,  4.58it/s]

Epoch 7: F1 score: 0.2222222222222222 Loss: 1.030517578125 Accuracy: 0.5


 67%|██████▋   | 185/276 [00:29<00:19,  4.67it/s]

Epoch 8: F1 score: 0.2222222222222222 Loss: 1.0249837636947632 Accuracy: 0.5


 75%|███████▌  | 208/276 [00:33<00:14,  4.63it/s]

Epoch 9: F1 score: 0.2222222222222222 Loss: 1.0288900136947632 Accuracy: 0.5


 84%|████████▎ | 231/276 [00:37<00:09,  4.73it/s]

Epoch 10: F1 score: 0.2222222222222222 Loss: 1.0257161855697632 Accuracy: 0.5


 92%|█████████▏| 254/276 [00:41<00:04,  4.73it/s]

Epoch 11: F1 score: 0.2222222222222222 Loss: 1.0183919668197632 Accuracy: 0.5


100%|██████████| 276/276 [00:44<00:00,  7.07it/s]

Epoch 12: F1 score: 0.2222222222222222 Loss: 1.0264079570770264 Accuracy: 0.5


In [9]:
preds

[1, 1, 1, 1, 1, 1]