In [1]:
import sys
from pathlib import Path

# Add the src directory to the system path
project_root = Path().resolve().parent  # Adjust if necessary
sys.path.append(str(project_root))
print(f"Project root added to sys.path: {project_root}")

# Import necessary modules
from src.models import biobert_goldhamster as goldh

# Setup reloading all changed modules every time before executing a cell
%load_ext autoreload
%autoreload 2

Project root added to sys.path: /Users/thiloweber/Code/3r-monitoring


  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()
All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [15]:
import os
import pandas as pd
import json

papers_dir = project_root / "data" / "goldhamster" / "papers"
labels_dir = project_root / "data" / "goldhamster" / "labels"

# labels
all_labels = ['in_silico','organs','other','human','in_vivo','invertebrate','primary_cells','immortal_cell_line']

#######################################
### --------- Import text --------- ###
def read_text(pmid, docs_dir):
	# file = os.path.join(docs_dir, pmid+".txt")
	# with open(txt_file, "r") as text_file:
	# 	text = text_file.read()
	# 	text = text.replace("\n"," ")
	# 	text = text.replace("\t"," ")

	file = docs_dir / f"{pmid}.json"
	paper_dict = json.loads(file.read_text())
	title = paper_dict["title"]
	abstract = paper_dict["abstract"]
	mesh_terms = paper_dict["mesh_terms"]

	# return f"{title} {abstract}".replace("\n"," ").replace("\t"," ")
	return f"{abstract}"

#######################################
### -------- Import Splits -------- ###
def import_splits(docs_dir,train_dev_test_dir,filename):
	tsv_file = filename[0:-5]+".tsv"
	with open(os.path.join(tsv_file), "w") as writer:
		line = "PMID"
		for label in all_labels:
			line += "\t"+label
		line += "\tTEXT\n"
		writer.write(line)
		skipped = []
		doc_index = 0
		with open(os.path.join(train_dev_test_dir,filename), "r") as reader:
			lines = reader.readlines()
			for line in lines:
				pmid, str_labels = line.strip().split("\t")
				labels = str_labels.split(",")
				text = read_text(pmid,docs_dir)
				# exclude documents w/o text
				if len(text)==0:
					skipped.append(doc_index)
					continue
				#print(pmid,labels,text)
				line = pmid
				for label in all_labels:
					if label in labels:
						line += "\t1"
					else:
						line += "\t0"
				line += "\t"+text+"\n"
				writer.write(line)
				doc_index += 1
	writer.close()
	return tsv_file, skipped

############################################
### --------- Pre-process data --------- ###
def pre_process_data(docs_dir,train_dev_test_dir,filename):
	# Import splits
	tsv_file, skipped = import_splits(docs_dir,train_dev_test_dir,filename)
	# Import data from tsv
	data = pd.read_csv(tsv_file,sep='\t')
	# # Select required columns
	# filters = []
	# for label in all_labels:
	# 	filters.append(label)
	# filters.append('TEXT')
	# data = data[filters]
	# display(data)
	# Set your model output as categorical and save in new label col
	for label in all_labels:
		if label in data:
			data[label+'_label'] = pd.Categorical(data[label])
	# Transform your output to numeric
	for label in all_labels:
		if label in data:
			data[label] = data[label+'_label'].cat.codes
	return data, tsv_file, skipped

data_test, tsv_file_test, skipped_test = pre_process_data(papers_dir, labels_dir, "test0.txt")
display(data_test)

Unnamed: 0,PMID,in_silico,organs,other,human,in_vivo,invertebrate,primary_cells,immortal_cell_line,TEXT,in_silico_label,organs_label,other_label,human_label,in_vivo_label,invertebrate_label,primary_cells_label,immortal_cell_line_label
0,31705587,0,0,0,0,0,0,0,0,Neurodegeneration refers to the complex proces...,0,0,0,0,0,0,0,0
1,30807815,0,0,0,1,1,0,1,1,The present study was aimed at broadening the ...,0,0,0,1,1,0,1,1
2,30594027,0,0,0,1,0,1,0,1,The increase of opportunistic fungal infection...,0,0,0,1,0,1,0,1
3,30879321,0,0,0,0,0,0,0,1,The phospholipase C (PLC) is a family of kinas...,0,0,0,0,0,0,0,1
4,30818821,0,0,0,0,0,0,0,1,Mycoplasma gallisepticum (MG) mainly infects c...,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66,30074186,0,0,0,0,1,0,0,0,This study was designed to evaluate the effect...,0,0,0,0,1,0,0,0
67,31150628,0,0,0,0,1,0,0,0,The dose-dependent neuroprotective role of lic...,0,0,0,0,1,0,0,0
68,31104401,0,0,0,0,1,0,0,0,Both Plasmodium spp. and Toxoplasma gondii are...,0,0,0,0,1,0,0,0
69,29390112,0,0,0,0,1,0,0,0,Most animals display retarded growth in advers...,0,0,0,0,1,0,0,0


In [16]:
data_test[['PMID', 'TEXT']].to_numpy()

array([[31705587,
        'Neurodegeneration refers to the complex process of progressive degeneration or neuronal apoptosis leading to a set of incurable and debilitating conditions. Physiologically, apoptosis is important in proper growth and development. However, aberrant and unrestricted apoptosis can lead to a variety of degenerative conditions including neurodegenerative diseases. Although dysregulated apoptosis has been implicated in various neurodegenerative disorders, the triggers and molecular mechanisms underlying such untimely and faulty apoptosis are still unknown. Hippo signaling pathway is one such apoptosis-regulating mechanism that has remained evolutionarily conserved from Drosophila to mammals. This pathway has gained a lot of attention for its tumor-suppressing task, but recent studies have emphasized the soaring role of this pathway in inflaming neurodegeneration. In addition, strategies promoting inactivation of this pathway have aided in the rescue of neurons fro

In [17]:
from transformers import TFBertModel, BertTokenizerFast, BertModel
from tensorflow.keras.utils import to_categorical

def setup_biobert():
	# Max length of tokens
#	max_length = 128
	max_length = 256
	# Load BioBERT tokenizer
	tokenizer = BertTokenizerFast.from_pretrained('dmis-lab/biobert-v1.1')
	# Load the Transformers BERT model
	transformer_model = TFBertModel.from_pretrained('dmis-lab/biobert-v1.1', from_pt=True)
	return transformer_model, transformer_model.config, max_length, tokenizer

transformer_model, config, max_length, tokenizer = setup_biobert()

def prepare_x_y_dev_test(data,max_length,tokenizer):
	test_ys = {}
	for label in all_labels:
		test_y = to_categorical(data[label])	
		test_ys[label] = test_y
	test_x = tokenizer(
	    text=data['TEXT'].to_list(),
    	add_special_tokens=True,
	    max_length=max_length,
	    truncation=True,
	    padding=True, 
	    return_tensors='tf',
	    return_token_type_ids = False,
	    return_attention_mask = False,
	    verbose = True)
	x={'input_ids': test_x['input_ids']}
	y = {}
	for label in all_labels:
		y[label] = test_ys[label]
	return x, y

x, y = prepare_x_y_dev_test(data_test, max_length, tokenizer)

display(x)
display(y)

All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


{'input_ids': <tf.Tensor: shape=(71, 256), dtype=int32, numpy=
 array([[  101,   151,  8816, ...,  1954,  3189,   102],
        [  101,  1109,  1675, ...,   185,  7111,   102],
        [  101,  1109,  2773, ...,  3382,  1233,   102],
        ...,
        [  101,  2695,   153, ...,   157,   119,   102],
        [  101,  2082,  3551, ...,  1830, 26503,   102],
        [  101,  1284,  3033, ...,  1115,  5844,   102]], dtype=int32)>}

{'in_silico': array([[1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [0., 1.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.],
        [1., 0.

In [18]:
from tensorflow.keras.models import Model, load_model

model_path = project_root / "models/goldhamster/goldhamster_model.h5"


# Load model
model = load_model(model_path)
# Run predictions
predictions = model.predict(x)
predictions





{'in_silico': array([[-1.2821354 ,  1.5166415 ],
        [ 3.4218984 , -3.0335653 ],
        [ 2.2962332 , -2.1601543 ],
        [ 3.0511847 , -2.9212089 ],
        [ 2.7034705 , -2.6561596 ],
        [ 3.0328457 , -2.7379212 ],
        [ 2.977708  , -2.869934  ],
        [ 3.1903777 , -2.926043  ],
        [ 2.8473723 , -2.6833549 ],
        [ 3.0378098 , -2.7981951 ],
        [ 3.2752156 , -3.1302323 ],
        [ 3.3260486 , -3.0978427 ],
        [ 1.9684745 , -1.8327421 ],
        [ 3.0562363 , -2.7842572 ],
        [ 3.3054936 , -3.0800784 ],
        [ 3.0698197 , -2.8746367 ],
        [ 3.15983   , -2.9699674 ],
        [ 3.4055595 , -3.2769353 ],
        [ 3.228362  , -2.8728113 ],
        [ 2.1720924 , -2.0164433 ],
        [ 2.5650463 , -2.4978714 ],
        [ 2.7791328 , -2.623496  ],
        [ 2.073139  , -2.0612695 ],
        [ 3.5107555 , -3.2870429 ],
        [ 2.4913619 , -2.377628  ],
        [ 3.1075323 , -3.05359   ],
        [ 3.3169994 , -3.0570366 ],
        [ 3.784

In [19]:
def print_predictions(predictions, train_dev_test_dir, test_file, out_file, skipped_test):
	#print(len(predictions[label]))	
	with open(os.path.join(train_dev_test_dir,out_file), "w") as writer:
		with open(os.path.join(train_dev_test_dir,test_file), "r") as reader:
			lines = reader.readlines()
			doc_index = 0
			for line in lines:
				pmid, str_labels = line.strip().split("\t")
				list_labels = []
				if doc_index not in skipped_test:
					for label in predictions:
						arr_pred = predictions[label][doc_index]
						#print(arr_pred)
						if arr_pred[1]>arr_pred[0]:
							list_labels.append(label)
					doc_index += 1
				#print(pmid,list_labels)
				writer.write(pmid+"\t"+','.join(list_labels)+"\n")
		writer.close()

print_predictions(predictions, labels_dir, "test0.txt", 'preds_0.txt', skipped_test)

In [21]:
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model, load_model
# from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.initializers import TruncatedNormal


save_model_path = project_root / "models/goldhamster_custom/goldhamster_model.h5"

# hyperparameters
learning_rate = 1e-04
batch_size = 32
epochs = 10

#######################################
### ------- Build the model ------- ###
def build_model(transformer_model,config,max_length,data):
	# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model# Load the MainLayer
	bert = transformer_model.layers[0]
	# Build your model input
	input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
	inputs = {'input_ids': input_ids}
	# Load the Transformers BERT model as a layer in a Keras model
	bert_model = bert(inputs)[1]
	dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
	pooled_output = dropout(bert_model, training=False)
	# Then build your model output
	# one per output
	outputs = {}
	for label in all_labels:
		label_output = Dense(units=len(data[label+'_label'].value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name=label)(pooled_output)
		outputs[label] = label_output
	# And combine it all in a model object
	model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiLabel_MultiClass')
	# Take a look at the model
	model.summary()
	return model

#######################################
### ------- Train the model ------- ###
def train_model(model,data_train,data_dev,max_length,tokenizer,learning_rate,batch_size,epochs):
	# Set an optimizer
	optimizer = Adam(
    	learning_rate,
    	epsilon=1e-08,
    	decay=0.01,
    	clipnorm=1.0)
	# Set loss and metrics
	loss = {}
	for label in all_labels:
		loss[label] = CategoricalCrossentropy(from_logits = True)
	metric = {}
	for label in all_labels:
		metric[label] = CategoricalAccuracy('accuracy')
	# Compile the model
	model.compile(
	    optimizer = optimizer,
	    loss = loss, 
	    metrics = metric)
	# trainig data
	x_train, y_train = prepare_x_y_dev_test(data_train,max_length,tokenizer)
	# validation data
	x_val, y_val = prepare_x_y_dev_test(data_dev,max_length,tokenizer)
	# Fit the model
	history = model.fit(
		x_train, y_train,
		validation_data=(x_val,y_val),
	    batch_size=batch_size,
	    epochs=epochs
	)
	model.save(model_path)

def train_bert_goldhamster2(docs_dir,train_dev_test_dir,train_file,dev_file):
	# import data
	data_train, tsv_file_train, skipped_train = pre_process_data(docs_dir,train_dev_test_dir,train_file)
	data_dev, tsv_file_dev, skipped_dev = pre_process_data(docs_dir,train_dev_test_dir,dev_file)
	# set up model
	transformer_model, config, max_length, tokenizer = setup_biobert() 
	# build model
	model = build_model(transformer_model,config,max_length,data_train)
	# train model
	train_model(model,data_train,data_dev,max_length,tokenizer,learning_rate,batch_size,epochs)


def predict_with_model(docs_dir,train_dev_test_dir,test_file,out_file):
	# import data
	data_test, tsv_file_test, skipped_test = pre_process_data(docs_dir,train_dev_test_dir,test_file)
	# set up model
	transformer_model, config, max_length, tokenizer = setup_biobert()
	# prepare test data
	x, y = prepare_x_y_dev_test(data_test,max_length,tokenizer)
	# Load model
	model = load_model('model.h5')
	# Run predictions
	predictions = model.predict(x)
	print(model.summary())
	print_predictions(predictions,train_dev_test_dir,test_file,out_file,skipped_test)
	

split = 1
name = "goldhamster"
train_bert_goldhamster2(papers_dir, labels_dir, 'train'+str(split)+'.txt', 'dev'+str(split)+'.txt')
predict_with_model(papers_dir, labels_dir, 'test'+str(split)+'.txt', 'preds_'+str(split)+'_'+name+'.txt')

All PyTorch model weights were used when initializing TFBertModel.

All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "BERT_MultiLabel_MultiClass"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1083102   ['input_ids[0][0]']           
                             ngAndCrossAttentions(last_   72                                      
                             hidden_state=(None, 256, 7                                           
                             68),                                                                 
                              pooler_output=(None, 768)                                           
                             , past_key_values=None, hi                  

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]