In [1]:
# %%
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from collections import Counter
import evaluate
import numpy as np
from transformers import DataCollatorWithPadding
from utils.logging import logger
import mlflow
from matplotlib import pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


## Loading Model

In [2]:

model_name = "emilyalsentzer/Bio_ClinicalBERT"
max_length = 512

tokenizer = AutoTokenizer.from_pretrained(model_name,batch_size=32)


In [3]:
tokenizer.is_fast

True

## Loading Data

In [4]:

labels = []
texts = []
with open("./data/medical_data/train.dat", "r") as fh:
    train_lines = fh.readlines()
for line in train_lines:
    splitline = line.split('\t')
    labels.append(splitline[0])
    texts.append(splitline[1])
# %%


In [5]:
# convert to int, labels should be ints from 0 to n_labels 
labels = [eval(i)-1 for i in labels]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(texts[:2000], labels[:2000], test_size=0.20)

In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=max_length)

In [8]:
# preprocessing input data

from datasets import Dataset
def preprocessing_function(examples):
    return tokenizer(examples['text'],truncation=True, max_length=max_length,padding='max_length')

train_data  = Dataset.from_dict({'text':X_train,'label':y_train})
train_data =  train_data.map(preprocessing_function)

test_data  = Dataset.from_dict({'text':X_test,'label':y_test})
test_data =  test_data.map(preprocessing_function)
# test_dataset = Dataset.from_dict({'text':X_test,'labels':y_test})


                                                                 

In [9]:
logger.info(len(test_data['input_ids'][0]))
logger.info(len(test_data['input_ids'][1]))
logger.info(test_data['label'][0])
logger.info(Counter(labels))

512
512
0
Counter({4: 4805, 0: 3163, 3: 3051, 2: 1925, 1: 1494})


In [10]:
from datasets import DatasetDict
# create a dataset dictionary, with each element being a Dataset
dev_data = DatasetDict({'train':train_data, 'test':test_data})

In [11]:
# provide mapping between label ids and names
id2label = {0: "digestive system diseases", 1: "cardiovascular diseases",2:'neoplasms',3:'nervous system diseases',4:'general pathological conditions'}

label2id = {v:k for k,v in id2label.items()}
# instantiate model 
model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=len(id2label.keys()),id2label=id2label, label2id=label2id)

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

## Configure Training Arguments

In [12]:
import os
os.environ["MLFLOW_EXPERIMENT_NAME"] = "trainer-mlflow-demo"
os.environ["MLFLOW_FLATTEN_PARAMS"] = "1"

In [31]:

# %%
batch_size = 32

args = TrainingArguments(
    evaluation_strategy="steps",
    save_strategy="epoch",
    logging_strategy="steps",
    eval_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    report_to="none",
    weight_decay=0.01,
    output_dir='./medical_text/',
    metric_for_best_model='accuracy',
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True)

# Configure metrics to be computed

In [32]:

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(eval_pred):

    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    results = {accuracy.compute(predictions=predictions, references=labels),
               precision.compute(predictions=predictions, references=labels,average='macro'),
               recall.compute(predictions=predictions, references=labels,average='macro')
               }
    return  results

## Create a trainer object

In [33]:

# %%
from transformers.integrations import MLflowCallback, TrainerCallback, ProgressCallback

trainer = Trainer(
    model,
    args,
    train_dataset=dev_data['train'],
    eval_dataset=dev_data['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # callbacks=[MLflowCallback,TrainerCallback]
    callbacks = [ProgressCallback]

)


## Train model

In [34]:
with mlflow.start_run() as r:
   logger.info(r.info.run_id)
   train_results = trainer.train()
mlflow.end_run()

a2eb2bd704b34346af2f52c7b2308b62
  4%|▍         | 1/24 [00:02<01:08,  2.97s/it]

Step,Training Loss,Validation Loss


TypeError: unhashable type: 'dict'

In [30]:
trainer.save_metrics()

TypeError: save_metrics() missing 2 required positional arguments: 'split' and 'metrics'

## To be continued:
* add mlflow integration (log metrics, save model, etc)
* plot loss and metrics computed per step

In [None]:
# https://gitlab.com/juliensimon/huggingface-demos/-/blob/main/mlflow/MLflow%20and%20Transformers.ipynb