In [1]:
import pandas as pd
import numpy as np
import sent2vec
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance

#nltk.download('stopwords')
#nltk.download('punkt')

## Load BioSentVec model

Set `model_path` to the location of the BioSentVec model file. Loading the model may take a while the first time. The pre-trained model can be downloaded from: https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioSentVec_PubMed_MIMICIII-bigram_d700.bin — for instructions on installing sent2vec, refer to the following README: https://github.com/epfml/sent2vec/tree/master


In [2]:
# Location of the pre-trained BioSentVec binary; adjust if the file lives elsewhere.
model_path = 'BioSentVec_PubMed_MIMICIII-bigram_d700.bin'
model = sent2vec.Sent2vecModel()
try:
    model.load_model(model_path)
except Exception as e:
    # Keep the notebook running, but surface why the load failed.
    print(e)
else:
    # Only report success when load_model() did not raise; previously this
    # message was printed even after a failed load.
    print('model successfully loaded')

model successfully loaded


## Text preprocessing

In [3]:
# Load the assay table from CSV into a DataFrame. Later cells read its
# 'Description' column and add an 'embeddings' column to it.
assay_data = pd.read_csv("AML_assay_data.csv")

In [4]:
# English stop words from NLTK, kept as a set; preprocess_sentence() drops any
# token found in it. Needs the NLTK 'stopwords' corpus to be available (see the
# commented-out nltk.download calls in the import cell).
stop_words = set(stopwords.words('english'))
def preprocess_sentence(text):
    """Normalise an assay description into a space-separated token string.

    Steps: strip dataset-specific boilerplate, pad selected punctuation with
    spaces so the tokenizer splits on it, lowercase, abbreviate
    'sulforhodamine b' to 'srb', tokenize with NLTK, and drop punctuation and
    English stop-word tokens.

    Parameters
    ----------
    text : str or missing value
        Raw description. A missing scalar (None, NaN, pd.NA) is treated as an
        empty description.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; '' for missing input.
    """
    # Blank CSV cells arrive as NaN (a float), which has no .replace();
    # treat any missing scalar as an empty description instead of crashing.
    if not isinstance(text, str) and pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    # dataset specific normalisation (case-sensitive markers, so this must
    # happen before lowercasing)
    text = text.replace('PUBCHEM_BIOASSAY:', '')
    text = text.replace('PubChem BioAssay.:', '')
    text = text.split("(Class of assay: confirmatory)", 1)[0] # remove text following substring

    # punctuation, whitespace and capitalisation
    text = text.replace('/', ' / ')
    text = text.replace('.-', ' .- ')
    text = text.replace('.', ' . ')
    text = text.replace('\'', ' \' ')
    text = text.replace(';', ' ; ')
    text = text.lower()

    # The pattern is lower-case, so it has to run after lower(); applied to the
    # raw text it missed capitalised spellings such as 'Sulforhodamine B'.
    text = text.replace('sulforhodamine b', 'srb')
    text = text.strip()

    tokens = [token for token in word_tokenize(text) if token not in punctuation and token not in stop_words]

    return ' '.join(tokens)

## Code to compute embeddings for assay descriptions

In [5]:
def compute_embeddings(row):
    """Embed the 'Description' field of one assay row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a 'Description' entry holding the raw assay text.

    Returns
    -------
    The output of ``model.embed_sentence`` for the preprocessed description,
    with its leading axis removed.
    """
    cleaned_description = preprocess_sentence(row['Description'])
    sentence_matrix = model.embed_sentence(cleaned_description)
    # axis=0 is squeezed explicitly, so the result is expected to carry a
    # leading axis of length 1.
    return np.squeeze(sentence_matrix, axis=0)


# Embed every row's description and store the result in a new 'embeddings' column
assay_data['embeddings'] = assay_data.apply(compute_embeddings, axis=1)

In [6]:
# export assay data with embeddings as .csv file
# NOTE(review): the 'embeddings' column holds the arrays returned by
# compute_embeddings; confirm that whatever reads this CSV back can parse their
# text form. No `index` argument is passed, so pandas' default applies.
assay_data.to_csv('AML_assays_with_embeddings.csv')