# Train Mondo Annotations

In [18]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import numpy as np
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from fetch import *
import pandas as pd

## Load model and tokenizer

In [5]:
def split_data(embeddings, labels):
    # Split data into 80% training and 20% test
    embeddings_train, embeddings_temp, labels_train, labels_temp = train_test_split(
        embeddings, labels, test_size=0.20, random_state=42)

    # Split the 20% test into 15% validation and 5% test
    embeddings_val, embeddings_test, labels_val, labels_test = train_test_split(
        embeddings_temp, labels_temp, test_size=0.25, random_state=42)  # 0.25 * 0.20 = 0.05

    return embeddings_train, labels_train, embeddings_val, labels_val, embeddings_test, labels_test

## Prepare dataset

In [6]:
class ProteinDataset(Dataset):
    def __init__(self, embeddings, labels):
        self.embeddings = embeddings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embeddings[idx], self.labels[idx]

## Create a classifier

In [7]:
class ProteinTaggingModel(nn.Module):
    def __init__(self, embedding_size, num_labels, dropout_rate=0.1):
        super(ProteinTaggingModel, self).__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(embedding_size, embedding_size)  # Adjust this to your combined embedding size
        self.relu = nn.ReLU()
        self.norm1 = nn.LayerNorm(embedding_size)
        self.fc2 = nn.Linear(embedding_size, embedding_size)
        self.norm2 = nn.LayerNorm(embedding_size)
        self.classifier = nn.Linear(embedding_size, num_labels)
        self.output_fc = nn.Linear(num_labels, num_labels)
        self.sigmoid = nn.Sigmoid()

    def forward(self, embeddings):
        x = self.dropout(embeddings)
        x = self.relu(self.fc1(x))
        x = self.norm1(x + embeddings)  # Ensure `embeddings` is broadcastable with `x`
        x = self.relu(self.fc2(x))
        x = self.norm2(x + embeddings)  # Ensure `embeddings` is broadcastable with `x`
        logits = self.classifier(x)
        logits = self.output_fc(logits)
        predictions = self.sigmoid(logits)
        return predictions

## Train model

In [8]:
# load dataset
embedding_types = ['func_embedding']
combine_method = 'concat'
embeddings, labels, annotations_vocab = fetch_data_multi(embedding_types, combine_method, include_empty=True)

# split data
embeddings_train, labels_train, embeddings_val, labels_val, embeddings_test, labels_test = split_data(embeddings, labels)

# dataloaders
train_dataset = ProteinDataset(embeddings_train, labels_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

val_dataset = ProteinDataset(embeddings_val, labels_val)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [9]:
# Setup DataLoader, Model, Loss, and Optimizer
dataset = ProteinDataset(embeddings, labels)

model = ProteinTaggingModel(embedding_size=embeddings.shape[1], num_labels=labels.shape[1])

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_and_validate(model, train_loader, val_loader, criterion, optimizer, epochs):
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        for embeddings, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for embeddings, labels in val_loader:
                outputs = model(embeddings)
                loss = criterion(outputs, labels.float())
                total_val_loss += loss.item()

        print(f'Epoch {epoch+1}, Training Loss: {total_train_loss / len(train_loader)}, Validation Loss: {total_val_loss / len(val_loader)}')

In [10]:
# Start training
train_and_validate(model, train_loader, val_loader, criterion, optimizer, epochs=60)

Epoch 1, Training Loss: 0.01277017981608616, Validation Loss: 0.007581576297910798
Epoch 2, Training Loss: 0.00575825486700537, Validation Loss: 0.0050104453863687075
Epoch 3, Training Loss: 0.004455271648446941, Validation Loss: 0.004041963022522974
Epoch 4, Training Loss: 0.003616916785330572, Validation Loss: 0.0035658496183756465
Epoch 5, Training Loss: 0.0033500810700818544, Validation Loss: 0.0033940823235246815
Epoch 6, Training Loss: 0.002844545281166674, Validation Loss: 0.0030289315202254924
Epoch 7, Training Loss: 0.002681824909922764, Validation Loss: 0.0029594953000755203
Epoch 8, Training Loss: 0.0024587505261601406, Validation Loss: 0.0028758397123881227
Epoch 9, Training Loss: 0.002422463717442979, Validation Loss: 0.002873670052280588
Epoch 10, Training Loss: 0.0022744013037676205, Validation Loss: 0.004388622046041222
Epoch 11, Training Loss: 0.0022232396643526563, Validation Loss: 0.002740716498456155
Epoch 12, Training Loss: 0.002185225826444855, Validation Loss: 0.

## Evaluation

In [11]:
def evaluate_model(model, test_loader):
    model.eval()
    predictions = []
    truths = []
    with torch.no_grad():
        for embeddings, labels in test_loader:
            outputs = model(embeddings)
            predicted = torch.round(outputs)
            predictions.extend(predicted.cpu().numpy())
            truths.extend(labels.cpu().numpy())

    accuracy = accuracy_score(truths, predictions)
    precision = precision_score(truths, predictions, average='macro', zero_division=1)
    recall = recall_score(truths, predictions, average='macro', zero_division=1)
    f1 = f1_score(truths, predictions, average='macro', zero_division=1)

    return accuracy, precision, recall, f1

test_dataset = ProteinDataset(embeddings_test, labels_test)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
accuracy, precision, recall, f1 = evaluate_model(model, test_loader)
print(f'Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')

Accuracy: 0.7270463741051977, Precision: 0.9840962258118077, Recall: 0.9629705100511545, F1 Score: 0.9579605931359665


## Inference

In [15]:
def create_reverse_vocab(annotations_vocab):
    return {idx: label for label, idx in annotations_vocab.items()}

def convert_to_labels(binary_vector, reverse_vocab):
    labels = [reverse_vocab[i] for i, value in enumerate(binary_vector) if value == 1]
    return labels

def intersection_and_difference(true_labels, predicted_labels):
    # Calculate intersection and differences of two label lists
    true_set = set(true_labels)
    predicted_set = set(predicted_labels)
    intersection = true_set & predicted_set
    incorrect = predicted_set - true_set
    return len(intersection), len(true_set), len(predicted_set), len(incorrect)

import torch

def predict_and_evaluate(model, test_loader, annotations_vocab, num_samples=10):
    model.eval()
    total_intersection = 0
    total_true_labels = 0
    total_predicted_labels = 0
    total_incorrect = 0
    num_zero_prediction = 0  # To track samples with zero predicted labels
    sample_data = []
    reverse_vocab = create_reverse_vocab(annotations_vocab)

    with torch.no_grad():
        for embeddings, labels in test_loader:
            outputs = model(embeddings)
            predicted = torch.round(outputs)  # Using 0.5 as a threshold
            # Convert binary vectors to MONDO names
            predicted_labels = [convert_to_labels(pred, reverse_vocab) for pred in predicted.cpu().numpy()]
            true_labels = [convert_to_labels(true, reverse_vocab) for true in labels.cpu().numpy()]
            
            for true, pred in zip(true_labels, predicted_labels):
                inter, true_count, pred_count, incorrect = intersection_and_difference(true, pred)
                total_intersection += inter
                total_true_labels += true_count
                total_predicted_labels += pred_count
                total_incorrect += incorrect
                if len(pred) == 0:
                    num_zero_prediction += 1  # Increment if no labels were predicted
            
            # Collect samples for display
            if len(sample_data) < num_samples:
                sample_data.extend(zip(embeddings, true_labels, predicted_labels))
            if len(sample_data) >= num_samples:
                break
    
    # Calculate averages
    num_samples = len(test_loader.dataset)
    average_correct = (total_intersection / num_samples) * 100
    average_incorrect = (total_incorrect / num_samples) * 100
    print(f"Average percentage of correct MONDO names per sample: {average_correct:.2f}%")
    print(f"Average percentage of incorrect MONDO names per sample: {average_incorrect:.2f}%")
    print(f"Number of samples with zero predicted labels: {num_zero_prediction}")

    return sample_data, num_zero_prediction

# Example usage assuming model, test_loader, and annotations_vocab are defined
sample_data, num_zero_prediction = predict_and_evaluate(model, test_loader, annotations_vocab, num_samples=10)

# Print the samples and the total number of zero predicted labels
for idx, (embedding, true_label, predicted_label) in enumerate(sample_data):
    print(f"Sample {idx+1}")
    print(f"True MONDO Names: {true_label}")
    print(f"Predicted MONDO Names: {predicted_label}\n")
print(f"Total samples with zero predictions: {num_zero_prediction}")

Average percentage of correct MONDO names per sample: 44.76%
Average percentage of incorrect MONDO names per sample: 1.56%
Number of samples with zero predicted labels: 9
Sample 1
True MONDO Names: []
Predicted MONDO Names: ['large cell medulloblastoma', 'malignant ependymoma', 'atypical teratoid rhabdoid tumor', 'medulloblastoma', 'tuberculosis', 'glioblastoma', 'lung adenocarcinoma']

Sample 2
True MONDO Names: ['osteosarcoma', 'ovarian cancer']
Predicted MONDO Names: ['osteosarcoma', 'ovarian cancer']

Sample 3
True MONDO Names: ['classic maple syrup urine disease', 'inherited organic acidemia', 'intermediate maple syrup urine disease', 'maple syrup urine disease', 'prostate carcinoma', 'colorectal cancer', 'myocardial infarction', 'intermittent maple syrup urine disease', 'osteoarthritis', 'inborn errors of metabolism', 'intellectual disability', 'heart disease']
Predicted MONDO Names: ['classic maple syrup urine disease', 'inherited organic acidemia', 'intermediate maple syrup uri

## Save model as Torchfile

In [16]:
model.eval()
example = torch.rand(1, 1024)
traced_script_module = torch.jit.trace(model, example)
traced_script_module.save("model.pt")

### Ensure model was properly saved and is working

In [17]:
# Load the saved model
loaded_model = torch.jit.load("model.pt")

# Create a sample input tensor
sample_input = torch.rand(1, 1024)

# Pass the sample input to the model
output = loaded_model(sample_input)

# Round each prediction in output[0]
rounded_predictions = output[0].round()

# Convert the rounded predictions to labels
reverse_vocab = create_reverse_vocab(annotations_vocab)
labels = convert_to_labels(rounded_predictions, reverse_vocab)

# Print the labels
print(len(labels))
print(labels)

39
['coronary artery disease', 'psoriasis', 'adenocarcinoma', 'ovarian disease', 'malignant pancreatic neoplasm', 'cribriform carcinoma of breast', 'ischemia reperfusion injury', 'schizophrenia', 'pilocytic astrocytoma', 'skin basal cell carcinoma', 'gastric neoplasm', 'colonic neoplasm', 'cancer', 'ischemic disease', 'interstitial cystitis', 'malignant colon neoplasm', 'transient ischemic attack', 'lung cancer', 'autoimmune disease', 'endometriosis', 'squamous cell carcinoma', 'non-small cell lung carcinoma', 'gastric cancer', 'glioblastoma', 'ovarian neoplasm', 'tubular adenocarcinoma', 'rheumatoid arthritis', 'Crohn disease', 'ulcerative colitis', 'cystic fibrosis', 'hereditary diffuse gastric adenocarcinoma', 'head and neck cancer', 'pancreatic neoplasm', 'primary ovarian failure', 'intellectual disability', 'lung adenocarcinoma', 'Sjogren syndrome', 'benign colon neoplasm', 'prostate cancer']


## Save Testing Dataset as CSV

In [23]:
def get_test_data(embeddings_test):
    """
    For each document in the MongoDB database where the embedding in embeddings_test corresponds to func_embedding, save the 'function' and 'mondo_names' to a pandas DataFrame.
    :param embeddings_test: The embeddings used for testing
    :return: A pandas DataFrame containing the 'function' and 'mondo_names' for each document
    """
    # Load environment variables
    load_dotenv()
    MONGO_URI = os.getenv("MONGODB_URI")
    MONGO_DB = "proteinExplorer"
    MONGO_COLLECTION = "protein_embeddings"

    # Connect to MongoDB
    client = MongoClient(MONGO_URI)
    db = client[MONGO_DB]
    collection = db[MONGO_COLLECTION]

    ids = []
    function = []
    labels = []

    for embedding in embeddings_test:
        document = collection.find_one({"func_embedding": embedding.tolist()})
        ids.append(document["_id"])
        function.append(document["function"])
        labels.append(document["mondo_names"])

    data = {"id": ids, "function": function, "mondo_names": labels}

    return pd.DataFrame(data)

In [24]:
test_df = get_test_data(embeddings_test)
test_df.head()

Unnamed: 0,id,function,mondo_names
0,23201,Microtubule-based anterograde translocator for...,"[amyotrophic lateral sclerosis type 1, atrial ..."
1,967,Required for pre-mRNA splicing as component of...,"[osteosarcoma, ovarian cancer]"
2,4363,Together with BCKDHB forms the heterotetrameri...,"[maple syrup urine disease, classic maple syru..."
3,1333,Component of the general transcription and DNA...,"[attention deficit hyperactivity disorder, ina..."
4,871,Catalyzes the formation of L-carnitine from ga...,"[astrocytic tumor, breast carcinoma, glioblast..."


In [22]:
# convert test_df to a csv file
test_df.to_csv("test_data.csv", index=False)

## Convert model to ONNX format

In [40]:
import onnxruntime as ort
import numpy as np
# Create an inference session
session = ort.InferenceSession("model.onnx", providers=['CPUExecutionProvider'])
print("Model input details:")
for input in session.get_inputs():
    print(input)

# Example of creating a random input matching the expected input shape
input_data = np.random.randn(1, 1024).astype(np.float32)

# Prepare the input dictionary
input_name = session.get_inputs()[0].name
input_dict = {input_name: input_data}

# Run the model
outputs = session.run(None, input_dict)

# Assuming the first output is what you need
output_data = outputs[0][0]
print("Output data:")
print(len(output_data))

Model input details:
NodeArg(name='l_embeddings_', type='tensor(float)', shape=[1, 1024])
Output data:
4582
