# Simple Transformers for Site of Mets Model (Multi-Label Classification)
This script fine-tunes a BioBERT model for multi-label classification of sites of metastases (mets) using the simpletransformers library (https://simpletransformers.ai/). You may replace BioBERT with any other BERT model supported by simpletransformers for experimentation. For offline training, download the required BERT models from Hugging Face and place them in a local folder.

Data source: radiology report

Text column: conclusion section

Label column: true_site_of_mets (list of metastatic sites, eg ["site1","site2"])

## Set-up environment
First, we install the libraries which we'll use: !pip install simpletransformers

## Import libraries

In [None]:
import ast
import datetime
import json
import logging
import time

import numpy as np
import pandas as pd
from simpletransformers.classification import (MultiLabelClassificationArgs, MultiLabelClassificationModel)
from sklearn.metrics import accuracy_score, hamming_loss, classification_report, roc_auc_score, f1_score, precision_score,recall_score
from sklearn.preprocessing import MultiLabelBinarizer


## Import data

In [None]:
# Refer to example.csv for the expected data format.
# true_site_of_mets stores a list literal per row, e.g. ["breast","bone"]; the
# converter parses it back into a Python list (without it the column is read as str).
# ast.literal_eval is used instead of eval so that a CSV cell can only ever
# produce a literal value and can never execute arbitrary code.
# One CSV file per split: train / dev / test.
csv_options = {
    "usecols": ["report_id", "study_id", "conclusion", "true_site_of_mets"],
    "converters": {"true_site_of_mets": ast.literal_eval},
}

train_data = pd.read_csv(r'./data/train.csv', **csv_options)
dev_data = pd.read_csv(r'./data/dev.csv', **csv_options)
test_data = pd.read_csv(r'./data/test.csv', **csv_options)

train_data.shape, dev_data.shape, test_data.shape

In [None]:
train_data.sample(3)

In [None]:
train_data["true_site_of_mets"][0]

In [None]:
train_data.isnull().sum()

In [None]:
# Work on copies so the frames loaded from disk stay untouched
train_df, dev_df, test_df = (
    loaded.copy() for loaded in (train_data, dev_data, test_data)
)

## Data preprocessing

### Multi-hot encoding for train data

In [None]:
# Fit the binarizer on the training labels and build the multi-hot matrix in
# a single pass (the binarizer was previously fitted twice on the same data).
mlb = MultiLabelBinarizer()
train_multi_hot = mlb.fit_transform(train_df['true_site_of_mets'])

# Label names, in the column order used by the multi-hot encoding
cols = [str(c) for c in mlb.classes_]
num_labels = len(cols)
print(num_labels)

# One indicator column per label
df = pd.DataFrame(train_multi_hot, columns=mlb.classes_)
df.head()

In [None]:
cols

In [None]:
# Merge original text with multi-hot encodings
# (position 0 is the text, positions 1..count are the label indicator columns)
train_df_wlabels = pd.concat([train_df[['conclusion']], df], axis=1)
train_df_columns = train_df_wlabels.columns

# Collapse the indicator columns into one tuple per report.
# The whole label block is sliced at once instead of reading every cell with
# .iloc[i].iloc[j]: that loop used the index label as a row position and rebuilt
# a row Series for each cell. .tolist() yields plain Python ints, so the tuple
# text written by to_csv stays a literal that can be parsed back.
count = len(cols)
label_rows = train_df_wlabels.iloc[:, 1:count + 1].to_numpy().tolist()
train_df_wlabels['labels'] = pd.Series(
    [tuple(row) for row in label_rows],
    index=train_df_wlabels.index,
    dtype=object,
)

# output individual label columns also
#train_df_wlabels = train_df_wlabels[['conclusion', 'labels']]

print(train_df_wlabels.head(1))

In [None]:
len(train_df_wlabels['labels'][0])

### Multi-hot encoding for dev data

In [None]:
# Encode the dev labels with the binarizer as fitted on the training data
# (transform only, no refit)
dev_multi_hot = mlb.transform(dev_df['true_site_of_mets'])
df2 = pd.DataFrame(dev_multi_hot, columns=mlb.classes_)
print(df2.columns)

# Put each report's text next to its multi-hot columns
dev_df_wlabels = pd.concat([dev_df[['conclusion']], df2], axis=1)
dev_df_columns = dev_df_wlabels.columns

# Number of label columns, plus a placeholder 'labels' column to be filled in
count = df2.shape[1]
dev_df_wlabels['labels'] = ''

In [None]:
# Collapse the indicator columns (positions 1..count; position 0 is the text)
# into one tuple per report. Slicing the block once replaces the per-cell
# .iloc[i].iloc[j] loop, which used the index label as a row position and
# rebuilt a row Series for each cell. .tolist() yields plain Python ints.
label_rows = dev_df_wlabels.iloc[:, 1:count + 1].to_numpy().tolist()
dev_df_wlabels['labels'] = pd.Series(
    [tuple(row) for row in label_rows],
    index=dev_df_wlabels.index,
    dtype=object,
)

# output individual label columns also
#dev_df_wlabels = dev_df_wlabels[['conclusion', 'labels']]

print(dev_df_wlabels.head(1))

### Multi-hot encoding for test data

In [None]:
# Encode the test labels with the binarizer as fitted on the training data
# (transform only, no refit)
test_multi_hot = mlb.transform(test_df['true_site_of_mets'])
df3 = pd.DataFrame(test_multi_hot, columns=mlb.classes_)
print(df3.columns)

# Put each report's text next to its multi-hot columns
test_df_wlabels = pd.concat([test_df[['conclusion']], df3], axis=1)
test_df_columns = test_df_wlabels.columns

# Number of label columns, plus a placeholder 'labels' column to be filled in
count = df3.shape[1]
test_df_wlabels['labels'] = ''

In [None]:
# Collapse the indicator columns (positions 1..count; position 0 is the text)
# into one tuple per report. Slicing the block once replaces the per-cell
# .iloc[i].iloc[j] loop, which used the index label as a row position and
# rebuilt a row Series for each cell. .tolist() yields plain Python ints.
label_rows = test_df_wlabels.iloc[:, 1:count + 1].to_numpy().tolist()
test_df_wlabels['labels'] = pd.Series(
    [tuple(row) for row in label_rows],
    index=test_df_wlabels.index,
    dtype=object,
)

# output individual label columns also
#test_df_wlabels = test_df_wlabels[['conclusion', 'labels']]

print(test_df_wlabels.head(1))

In [None]:
# Persist each labelled split next to the raw data, without the index column
for labelled_split, csv_path in (
    (train_df_wlabels, "./data/train_wlabels.csv"),
    (dev_df_wlabels, "./data/dev_wlabels.csv"),
    (test_df_wlabels, "./data/test_wlabels.csv"),
):
    labelled_split.to_csv(csv_path, index=False)

## BioBert
### Set `filename_prefix` to the directory and file prefix where the output files should be written
### change use_cuda=True for GPU

In [None]:
# Keep only the text column and the tuple-of-labels column for training/evaluation
model_input_columns = ['conclusion', 'labels']
train_df_wlabels = train_df_wlabels[model_input_columns]
dev_df_wlabels = dev_df_wlabels[model_input_columns]

In [None]:
train_df_wlabels.shape, dev_df_wlabels.shape

In [None]:
from simpletransformers.classification import (
    MultiLabelClassificationModel, MultiLabelClassificationArgs
)

In [None]:
print(num_labels)

In [None]:
# baseline model configuration
model_args = MultiLabelClassificationArgs()
model_args.num_train_epochs = 10 #10
model_args.learning_rate = 4e-5
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.evaluate_during_training = False
model_args.manual_seed = 4
model_args.use_multiprocessing = True
model_args.train_batch_size = 4
model_args.eval_batch_size = 4
model_args.max_seq_length = 512
model_args.threshold = 0.5 

model_args.n_gpu = 1

#28-jun-2023 test1 earlystopping to prevent model overfitting
# NOTE(review): evaluate_during_training is False above and train_model() below
# receives no eval_df, so these early-stopping settings are presumably inert;
# to activate them, enable evaluation during training and pass the dev frame
# as eval_df — confirm against the simpletransformers documentation.
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.01
model_args.early_stopping_metric = "eval_loss"
# eval_loss improves when it goes DOWN, so the metric must be minimized
# (this flag was previously False, which treats a rising loss as improvement).
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_patience = 5
model_args.evaluate_during_training_steps = 500

model_type = "bert"
# Placeholder path: point this at the local folder holding the downloaded model
model_name=r"path\to\yourlocalfolder\biobert-base-cased-v1.2"

# Create a MultiLabelClassificationModel
model = MultiLabelClassificationModel(
    model_type,
    model_name,
    num_labels=num_labels,
    args=model_args,
)

train_df_wlabels.columns = ["conclusion", "labels"]
dev_df_wlabels.columns = ["conclusion", "labels"]

# Train the model
model.train_model(train_df_wlabels)



In [None]:
# Evaluate the model on the dev split.
# eval_model returns three values, unpacked here as the metrics (result),
# the model outputs, and the wrongly predicted examples.
result, model_outputs, wrong_predictions = model.eval_model(dev_df_wlabels)

In [None]:
result

In [None]:
# Run the model over the dev report texts
dev_texts = dev_df_wlabels['conclusion'].values.tolist()
prediction, outputs = model.predict(dev_texts)

# One column per label, for both values returned by predict
outputs_df = pd.DataFrame(outputs, columns=cols)
prediction_df = pd.DataFrame(prediction, columns=cols)

# Write the outputs frame to csv
filename_prefix = "./outputs/" + "biobert_outputs_df"
filename = filename_prefix + ".csv"
outputs_df.to_csv(filename)

In [None]:
prediction_df.head(1)

In [None]:
# Place the dev text/true labels side by side with the predicted label columns
# and write the result (including the index column) to csv
frames_side_by_side = [dev_df_wlabels, prediction_df]
combined_cols_df = pd.concat(frames_side_by_side, axis=1)
filename_prefix = "./outputs/" + "biobert_combined_cols_df"
filename = filename_prefix + ".csv"
combined_cols_df.to_csv(filename)

## get model metrics (DEV_DF)

In [None]:
# Read the dev prediction csv back in. The "labels" column holds the text of a
# tuple; ast.literal_eval parses it into a tuple without executing arbitrary
# code the way eval would.
combined_cols_df = pd.read_csv(
    "./outputs/biobert_combined_cols_df.csv",
    converters={"labels": ast.literal_eval},
)

In [None]:
combined_cols_df.head(1)

In [None]:
# Label names are every column from position 3 onwards: the csv was written
# with its index, so the first three columns read back are the saved index,
# "conclusion" and "labels", followed by one predicted column per label.
cols = combined_cols_df.columns[3:]
cols

In [None]:
# Calculate individual label accuracies
prediction_df = combined_cols_df.copy()

# Add the predicted labels as one tuple per report.
# The predicted indicator columns start at position 3 and span len(cols)
# columns; slicing them once replaces the per-cell .iloc[i].iloc[j] loop, which
# used the index label as a row position and rebuilt a row Series for each cell.
first_label_pos = 3
label_rows = (
    prediction_df.iloc[:, first_label_pos:first_label_pos + len(cols)]
    .to_numpy()
    .tolist()
)
prediction_df["biobert_labels"] = pd.Series(
    [tuple(row) for row in label_rows],
    index=prediction_df.index,
    dtype=object,
)

prediction_df.head(1)

In [None]:
# "labels" must hold tuples here: when the csv is read without a converter the
# column contains strings, which would first have to be parsed into tuples.
y_true = prediction_df["labels"].tolist()

# "biobert_labels" is already a column of tuples
y_pred = prediction_df["biobert_labels"].tolist()
len(y_true), len(y_pred)

In [None]:
# Label ranking average precision on the dev split.
# NOTE(review): y_pred is built from the predicted label columns, not from the
# model outputs; this metric is normally given per-label scores, so confirm
# that passing label predictions is intended.
from sklearn.metrics import label_ranking_average_precision_score
label_ranking_average_precision_score(y_true, y_pred)

In [None]:
# get classification report
# One row per label (named via cols), values printed with 4 decimal places
print(classification_report(y_true,y_pred, target_names=cols, digits=4))

In [None]:
# Accuracy over whole label tuples plus micro-averaged precision / recall / F1
micro = {"average": 'micro'}
overall_accuracy_score = accuracy_score(y_true, y_pred)
micro_precision_score = precision_score(y_true, y_pred, **micro)
micro_recall_score = recall_score(y_true, y_pred, **micro)
micro_f1_score = f1_score(y_true, y_pred, **micro)
for caption, value in (
    ("overall_accuracy_score: ", overall_accuracy_score),
    ("micro_precision_score: ", micro_precision_score),
    ("micro_recall_score: ", micro_recall_score),
    ("micro_f1_score: ", micro_f1_score),
):
    print(caption, value)

# The same three scores, averaged per sample instead
per_sample = {"average": 'samples'}
sample_precision_score = precision_score(y_true, y_pred, **per_sample)
sample_recall_score = recall_score(y_true, y_pred, **per_sample)
sample_f1_score = f1_score(y_true, y_pred, **per_sample)
for caption, value in (
    ("sample_precision_score: ", sample_precision_score),
    ("sample_recall_score: ", sample_recall_score),
    ("sample_f1_score: ", sample_f1_score),
):
    print(caption, value)

In [None]:
y_true[0][1], cols[1]

In [None]:
# Per-label metrics: treat each label position as its own binary problem and
# collect accuracy / precision / recall / F1 / AUROC into parallel lists.
count = len(cols)
colnames = []
accuracies = []
precision = []
recall = []
f1 = []
auroc = []

for i, col in enumerate(cols):
    print("col: ", col)
    # extract position i from every true / predicted tuple
    actualValue = [x[i] for x in y_true]
    predictedValue = [x[i] for x in y_pred]
    acc = accuracy_score(actualValue, predictedValue)
    prec = precision_score(actualValue, predictedValue)
    rc = recall_score(actualValue, predictedValue)
    f = f1_score(actualValue, predictedValue)

    print("***accuracy: ", acc)
    # temporary fix, try-except block will be removed in the future with a more balanced dataset
    try:
        auroc_score = roc_auc_score(actualValue, predictedValue)
        print("***auroc_score: ", auroc_score)
    except ValueError:
        # AUROC could not be computed for this label. Record NaN rather than
        # 0: a 0 would read as a genuine worst-possible score and would skew
        # any average taken over the column.
        auroc_score = float("nan")
    colnames.append(col)
    accuracies.append(acc)
    precision.append(prec)
    recall.append(rc)
    f1.append(f)
    auroc.append(auroc_score)

In [None]:
# Assemble the per-label metric lists into one table, one row per label
accuracy_auroc_df = pd.DataFrame(
    {
        'biobert_labels': colnames,
        'accuracy': accuracies,
        'auc_roc_score': auroc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
)
accuracy_auroc_df.head(30)

In [None]:
## get biobert inverse labels
# Rebuild the binarizer from the training labels so that multi-hot tuples can
# be mapped back to site names
mlb = MultiLabelBinarizer()
mlb.fit(train_df['true_site_of_mets'])
cols = [str(site) for site in mlb.classes_]
print(len(cols))

# Sanity check: decode the first predicted tuple
first_prediction = prediction_df["biobert_labels"].iloc[0]
mlb.inverse_transform(np.asarray([first_prediction]))


In [None]:
def _decode_multi_hot(multi_hot):
    """Map one multi-hot tuple back to label names via the fitted binarizer."""
    return mlb.inverse_transform(np.asarray([multi_hot]))


# Decode both the predicted and the true tuples into label names
prediction_df["biobert_labels_inverse"] = prediction_df["biobert_labels"].apply(_decode_multi_hot)
prediction_df["labels_inverse"] = prediction_df["labels"].apply(_decode_multi_hot)
prediction_df.head(1)

In [None]:
prediction_df.to_csv("./outputs/biobert_dev_prediction_df.csv")

## Load best checkpoint

In [None]:
# Replace the in-memory model with one loaded from a saved checkpoint folder.
# The folder name is hard-coded; it must match a checkpoint directory that
# actually exists under ./outputs, so adjust it after each training run.
output_dir = "./outputs/checkpoint-5880-epoch-10"
model = MultiLabelClassificationModel("bert",output_dir)

## get model metrics (TEST_DF)

In [None]:
# Run the model over the test report texts
test_texts = test_df_wlabels['conclusion'].values.tolist()
prediction, outputs = model.predict(test_texts)

# One column per label, for both values returned by predict
outputs_df = pd.DataFrame(outputs, columns=cols)
prediction_df = pd.DataFrame(prediction, columns=cols)

# Write the outputs frame to csv
filename_prefix = "./outputs/" + "biobert_test_prediction_df"
filename = filename_prefix + ".csv"
outputs_df.to_csv(filename)

In [None]:
prediction_df.head(1)

In [None]:
# Add the predicted labels as one tuple per report.
# The first len(cols) columns are the predicted indicator columns; slicing them
# once replaces the per-cell .iloc[i].iloc[j] loop, which used the index label
# as a row position and rebuilt a row Series for each cell.
label_rows = prediction_df.iloc[:, :len(cols)].to_numpy().tolist()
prediction_df["biobert_labels"] = pd.Series(
    [tuple(row) for row in label_rows],
    index=prediction_df.index,
    dtype=object,
)

prediction_df.head(1)

In [None]:
test_df.head(1)

In [None]:
# True tuples come from the labelled test frame, predicted tuples from the
# prediction frame; show both lengths as a quick check
y_true = test_df_wlabels["labels"].tolist()
y_pred = prediction_df["biobert_labels"].tolist()
len(y_true), len(y_pred)

In [None]:
# Label ranking average precision on the test split.
# NOTE(review): y_pred is built from the predicted label columns, not from the
# model outputs; this metric is normally given per-label scores, so confirm
# that passing label predictions is intended.
from sklearn.metrics import label_ranking_average_precision_score
label_ranking_average_precision_score(y_true, y_pred)

In [None]:
# get classification report
# One row per label (named via cols), values printed with 4 decimal places
print(classification_report(y_true,y_pred, target_names=cols, digits=4))

In [None]:
# Accuracy over whole label tuples plus micro-averaged precision / recall / F1
micro = {"average": 'micro'}
overall_accuracy_score = accuracy_score(y_true, y_pred)
micro_precision_score = precision_score(y_true, y_pred, **micro)
micro_recall_score = recall_score(y_true, y_pred, **micro)
micro_f1_score = f1_score(y_true, y_pred, **micro)
for caption, value in (
    ("overall_accuracy_score: ", overall_accuracy_score),
    ("micro_precision_score: ", micro_precision_score),
    ("micro_recall_score: ", micro_recall_score),
    ("micro_f1_score: ", micro_f1_score),
):
    print(caption, value)

# The same three scores, averaged per sample instead
per_sample = {"average": 'samples'}
sample_precision_score = precision_score(y_true, y_pred, **per_sample)
sample_recall_score = recall_score(y_true, y_pred, **per_sample)
sample_f1_score = f1_score(y_true, y_pred, **per_sample)
for caption, value in (
    ("sample_precision_score: ", sample_precision_score),
    ("sample_recall_score: ", sample_recall_score),
    ("sample_f1_score: ", sample_f1_score),
):
    print(caption, value)

In [None]:
# Decode each predicted multi-hot tuple back to label names
# (inverse_transform is given a single row, so it returns a one-element list)
prediction_df["biobert_labels_inverse"] = prediction_df["biobert_labels"].apply(lambda x: mlb.inverse_transform(np.asarray([x])))

# Flatten the decoded result into a plain set of label names.
# The names are collected directly instead of being joined into a
# comma-separated string and split again: that round trip produced {''} for a
# report with no predicted label and broke names containing "," or "'".
prediction_df["biobert_labels_inverse_clean"] = prediction_df["biobert_labels_inverse"].apply(
    lambda decoded: {str(site) for sites in decoded for site in sites}
)
prediction_df.head(1)

In [None]:
# Attach the cleaned predicted label sets to the original test rows and write
# the result (including the index column) to csv
predicted_sites = prediction_df["biobert_labels_inverse_clean"]
combined_cols_df = pd.concat([test_df, predicted_sites], axis=1)
filename_prefix = "./outputs/" + "biobert_test_prediction_df_clean"
filename = filename_prefix + ".csv"
combined_cols_df.to_csv(filename)

## Hyperparameter Tuning