<a href="https://colab.research.google.com/github/nlei1/nlp-notebooks/blob/main/BioWordVec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np, pandas as pd 
import os
import zipfile
import torch
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from scipy import stats
import gensim
import gensim.downloader as api
import seaborn as sns
import matplotlib.pyplot as pyplot
%matplotlib inline

In [2]:
# Download the BioWordVec vectors file into /root/input/ (the log below reports about 13 GB).
# -P sets the target directory; -c resumes a partially downloaded file instead of starting over.
!wget -P /root/input/ -c 'https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin'

--2022-08-03 15:34:39--  https://ftp.ncbi.nlm.nih.gov/pub/lu/Suppl/BioSentVec/BioWordVec_PubMed_MIMICIII_d200.vec.bin
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 165.112.9.230, 2607:f220:41e:250::11, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13451441787 (13G) [application/octet-stream]
Saving to: ‘/root/input/BioWordVec_PubMed_MIMICIII_d200.vec.bin’


2022-08-03 15:40:16 (38.1 MB/s) - ‘/root/input/BioWordVec_PubMed_MIMICIII_d200.vec.bin’ saved [13451441787/13451441787]



In [10]:
# Load the downloaded word2vec-format binary as gensim KeyedVectors.
# `limit` caps the number of vectors read from the start of the file (2 million here).
model = gensim.models.KeyedVectors.load_word2vec_format(
    '/root/input/BioWordVec_PubMed_MIMICIII_d200.vec.bin',
    binary=True,
    limit=2_000_000,
)

In [4]:
# Mount Google Drive at /content/drive (Colab-only); a later cell writes a CSV under this path.
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# path = '/content/drive/My Drive/embeddings'
# word_vectors = model.wv

# # save as KeyedVectors
# from gensim.models import KeyedVectors
# word_vectors.save(path)

  


In [11]:
# CSV of word lists: column 0 holds the drug name, the remaining cells of a row hold its related words.
url  = 'https://raw.githubusercontent.com/nlei1/csvs-for-proj/main/drugs-side-effects3.csv'
# on_bad_lines='skip' is the supported replacement for error_bad_lines=False,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
df = pd.read_csv(url, header=None, on_bad_lines='skip')
insomnia_drug_names = df[0].tolist()
# Drug name -> row position in df (if a name repeats, the last occurrence wins).
ind_dct = {name: row_pos for row_pos, name in enumerate(insomnia_drug_names)}

def get_words(drug_name):
  """Return the non-missing cells of the `df` row registered for `drug_name`.

  The row is looked up through `ind_dct`; cells whose string form is 'nan'
  are dropped and the rest are returned in column order.
  """
  row = df.iloc[ind_dct[drug_name]]
  kept = []
  for cell in row:
    if str(cell) == 'nan':
      continue
    kept.append(cell)
  return kept

def get_embeddings(words_lst):
  """Return the `model` vector of every word in `words_lst`, as a list in the same order.

  A word that `model` cannot look up makes the indexing raise; nothing is skipped.
  """
  return [model[word] for word in words_lst]

def get_pca(embeddings, n_components):
  """Project `embeddings` with PCA and return the result as a DataFrame.

  Args:
    embeddings: sequence of equal-length vectors, one per sample.
    n_components: forwarded to sklearn's PCA as its first argument.

  Returns:
    DataFrame with one row per sample. Columns are named 'x', 'y', 'z' for
    the first three output dimensions and 'dim4', 'dim5', ... beyond that,
    so the usual two-component call still yields columns ['x', 'y'].
  """
  pca = PCA(n_components)
  pca_result = pca.fit_transform(embeddings)
  # Name the columns from the actual output width instead of assuming two,
  # so an n_components other than 2 no longer raises in the DataFrame constructor.
  n_dims = pca_result.shape[1]
  axis_names = ['x', 'y', 'z'][:n_dims] + ['dim%d' % k for k in range(4, n_dims + 1)]
  return pd.DataFrame(pca_result, columns=axis_names)

def get_tsne(embeddings, p_perplexity, p_n_iter, n_components):
  """Project `embeddings` with t-SNE and return the result as a DataFrame.

  Args:
    embeddings: sequence of equal-length vectors, one per sample.
    p_perplexity: forwarded to TSNE as `perplexity`.
    p_n_iter: forwarded to TSNE as `n_iter`.
    n_components: forwarded to TSNE as its first argument.

  Returns:
    DataFrame with one row per sample. Columns are named 'x', 'y', 'z' for
    the first three output dimensions and 'dim4', 'dim5', ... beyond that,
    so the usual two-component call still yields columns ['x', 'y'].
  """
  # No random_state is passed, so the layout can differ from run to run.
  # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` - confirm
  # against the installed version before upgrading.
  tsne = TSNE(n_components, perplexity=p_perplexity, n_iter=p_n_iter)
  tsne_result = tsne.fit_transform(embeddings)
  # Name the columns from the actual output width instead of assuming two,
  # so an n_components other than 2 no longer raises in the DataFrame constructor.
  n_dims = tsne_result.shape[1]
  axis_names = ['x', 'y', 'z'][:n_dims] + ['dim%d' % k for k in range(4, n_dims + 1)]
  return pd.DataFrame(tsne_result, columns=axis_names)

def plot_with_labels(drug_name, pca=True, p_perplexity=1, p_n_iter=1000, n_components=2):
  """Scatter-plot the reduced embeddings of a drug's word list and label every point.

  The words come from get_words(drug_name). They are reduced with PCA when
  `pca` is true and with t-SNE otherwise (`p_perplexity` and `p_n_iter` only
  matter for t-SNE). The figure title is the drug name followed by ": PCA"
  or ": TSNE". The plot reads the 'x' and 'y' columns of the reduction.
  """
  words_lst = get_words(drug_name)
  vectors = get_embeddings(words_lst)
  if pca:
    coords = get_pca(vectors, n_components)
    suffix = ": PCA"
  else:
    coords = get_tsne(vectors, p_perplexity, p_n_iter, n_components)
    suffix = ": TSNE"
  # Attach each word to its coordinates by joining on the shared 0..n-1 index.
  labels = pd.DataFrame(words_lst, columns=['label'])
  annotated = coords.join(labels)
  ax = annotated.plot(x='x', y='y', kind='scatter', figsize=(10,10), title=drug_name + suffix)
  for x_pos, y_pos, text in zip(annotated['x'], annotated['y'], annotated['label']):
    ax.text(x_pos, y_pos, text)

# Silence warnings through the supported filter API instead of replacing
# warnings.warn with a no-op. Overwriting the function breaks code that records
# warnings with warnings.catch_warnings(record=True) and does not cover warnings
# issued through other entry points; an "ignore" filter hides them all and leaves
# the warnings module intact.
import warnings
warnings.filterwarnings("ignore")

def plot_insomnia_drugs():
  """Draw a PCA plot followed by a t-SNE plot for every name in `insomnia_drug_names`."""
  for drug in insomnia_drug_names:
    for use_pca in (True, False):
      plot_with_labels(drug, use_pca)

def get_similarity_table(n=20):
  """Build a side-by-side table of the `n` nearest neighbours of every drug.

  For each name in `insomnia_drug_names`, model.most_similar supplies `n`
  (word, similarity) pairs, which become two columns titled
  'name (<drug>)' and 'similarity (<drug>)'. The per-drug frames are
  concatenated column-wise with an inner join on the row index.
  """
  def _neighbour_frame(drug):
    # Two-column frame holding the neighbours of a single drug.
    headers = ['name (' + drug + ")", 'similarity (' + drug + ")"]
    return pd.DataFrame(model.most_similar(positive=[drug], topn=n), columns=headers)

  frames = [_neighbour_frame(drug) for drug in insomnia_drug_names]
  return pd.concat(frames, axis=1, join="inner")

def get_effects_table(n=20):
  """Build a side-by-side table of analogy results for every drug.

  For each name in `insomnia_drug_names`, model.most_similar is queried with
  positive=[drug, 'effects'] and negative=['medication'], keeping the top `n`
  (word, similarity) pairs as columns 'name (<drug>)' and
  'similarity (<drug>)'. The per-drug frames are concatenated column-wise
  with an inner join on the row index.
  """
  def _analogy_frame(drug):
    # Two-column frame holding the analogy hits for a single drug.
    hits = model.most_similar(positive=[drug, 'effects'], negative=['medication'], topn=n)
    headers = ['name (' + drug + ")", 'similarity (' + drug + ")"]
    return pd.DataFrame(hits, columns=headers)

  frames = [_analogy_frame(drug) for drug in insomnia_drug_names]
  return pd.concat(frames, axis=1, join="inner")

def forms_of_words_analysis():
  """Plot t-SNE and PCA projections of the words listed in forms-of-words.csv.

  Every non-empty cell of the CSV is collected row by row, words without a
  vector in `model` are dropped, and the remaining words are drawn as two
  labelled scatter plots titled 'TSNE' and 'PCA'.
  """
  forms_of_words_url = 'https://raw.githubusercontent.com/nlei1/csvs-for-proj/main/forms-of-words.csv'
  # on_bad_lines='skip' is the supported replacement for error_bad_lines=False,
  # which was deprecated in pandas 1.3 and removed in pandas 2.0.
  forms_of_words_df = pd.read_csv(forms_of_words_url, header=None, on_bad_lines='skip')
  word_lst = []
  for _, row in forms_of_words_df.iterrows():
    word_lst += row.dropna().tolist()
  # Test membership on the KeyedVectors object itself, as evaluate() does;
  # `model.wv.vocab` only exists on gensim 3.x.
  new_lst = [word for word in word_lst if word in model]
  embeddings_lst = get_embeddings(new_lst)
  labels = pd.DataFrame(new_lst, columns=['label'])

  def _labelled_scatter(post_reduction, title):
    # Scatter the reduced points and write each word next to its point.
    post_reduction_annotated = post_reduction.join(labels)
    ax = post_reduction_annotated.plot(x='x',y='y',kind='scatter',figsize=(10,10), title=title)
    post_reduction_annotated[['x','y','label']].apply(lambda x: ax.text(*x),axis=1)

  _labelled_scatter(get_tsne(embeddings_lst, p_perplexity=1, p_n_iter=1000, n_components=2), 'TSNE')
  _labelled_scatter(get_pca(embeddings_lst, n_components=2), 'PCA')

In [13]:
import statistics
import scipy
import torch

# Word-pair similarity files hosted in the cambridgeltl/bio-simverb repository;
# each one is passed to evaluate() by run_eval().
biosimlex_url = 'https://raw.githubusercontent.com/cambridgeltl/bio-simverb/master/wvlib/word-similarities/bio-simlex/Bio-SimLex.txt'
biosimverb_url = 'https://raw.githubusercontent.com/cambridgeltl/bio-simverb/master/wvlib/word-similarities/bio-simverb/Bio-SimVerb.txt'
umnsrs_rel_url = 'https://raw.githubusercontent.com/cambridgeltl/bio-simverb/master/wvlib/word-similarities/UMNSRS/UMNSRS-rel.txt'
umnsrs_sim_url = 'https://raw.githubusercontent.com/cambridgeltl/bio-simverb/master/wvlib/word-similarities/UMNSRS/UMNSRS-sim.txt'

# NOTE(review): nothing visible in this notebook reads or writes this dict - confirm it is still needed.
tensors_dict = {}

def sim_matrix(a, b, eps=1e-8):
  """
  Cosine similarity between the rows of `a` and the rows of `b`, returned as a Python number.

  Each row is divided by its L2 norm, with the norm floored at `eps` so an
  all-zero row does not divide by zero. The result is extracted with
  `.item()`, so the product must hold exactly one element (one row in each input).
  """
  def _unit_rows(mat):
    # Scale every row to unit length, using eps as the smallest allowed norm.
    row_norms = mat.norm(dim=1)[:, None]
    floor = eps * torch.ones_like(row_norms)
    return mat / torch.max(row_norms, floor)

  cosine = torch.mm(_unit_rows(a), _unit_rows(b).transpose(0, 1))
  return cosine.item()

def retrieve_embedding(item):
  """Return the `model` vector for `item` as a tensor with a leading axis of size 1.

  Raises whatever `model[item]` raises when `item` has no vector.
  """
  # Convert the looked-up vector directly and then add the leading axis.
  # Wrapping it in a Python list first makes torch build the tensor element by
  # element, which is slow and triggers a UserWarning when the vector is a numpy
  # array (presumably the case for gensim lookups).
  return torch.tensor(model[item]).unsqueeze(0)

def evaluate(filename, num_rows_eval=50): 
  """Spearman correlation between human scores and model cosine similarities.

  `filename` is read with pandas: comma-separated and truncated to the first
  `num_rows_eval` rows when it ends in ".csv", tab-separated and read in full
  otherwise. Columns 0 and 1 hold the word pair and column 2 the human score.
  Pairs with a word missing from `model` are skipped.

  Returns the result of scipy.stats.spearmanr(human scores, model scores).
  """
  # NOTE(review): read_csv is called without header=, so the first line of the file
  # is consumed as column names - confirm the benchmark files start with a header row.
  if filename.endswith(".csv"):
    data = pd.read_csv(filename, sep=",").iloc[:num_rows_eval]
  else:
    data = pd.read_csv(filename, sep="\t")
  human_similarity, model_similarity = [], []
  for i in data.index:
    word1 = data.iloc[i, 0]
    word2 = data.iloc[i, 1]
    if word1 in model and word2 in model:
      model_similarity.append(sim_matrix(retrieve_embedding(word1), retrieve_embedding(word2)))
      human_similarity.append(float(data.iloc[i, 2]))

  return scipy.stats.spearmanr(human_similarity, model_similarity)

def novel_intrinsic_eval():
  """Median gap between each drug's similarity to group A and to group B.

  For every name in `insomnia_drug_names`, cosine similarities are computed
  against the words of `group_a` and of `group_b`, skipping any pair with a
  word missing from `model`. When both score lists are non-empty, the drug
  contributes median(group A scores) - median(group B scores). The function
  returns the median of those contributions.
  """
  group_a = ["zolpidem","eszopiclone","zaleplon","trazodone","amitriptyline","mirtazapine","doxepin","lorazepam","clonazepam","temazepam","triazolam","suvorexant","lemborexant","melatonin"]
  group_b = ["atorvastatin","acetaminophen","ibuprofen","levothyroxine","lisinopril","metformin","metoprolol","amlodipine","albuterol","omeprazole","losartan","gabapentin","hydrochlorothiazide","furosemide"]

  def _scores_against(drug, references):
    # Similarity of `drug` to each reference word that has a vector.
    return [
      sim_matrix(retrieve_embedding(drug), retrieve_embedding(ref))
      for ref in references
      if drug in model and ref in model
    ]

  thetas = []
  for word1 in insomnia_drug_names:
    group1_scores = _scores_against(word1, group_a)
    group2_scores = _scores_against(word1, group_b)
    if not (group1_scores and group2_scores):
      continue
    thetas.append(statistics.median(group1_scores) - statistics.median(group2_scores))
  return statistics.median(thetas)

def run_eval():
  """Print the evaluate() result for each benchmark file, then the novel intrinsic score."""
  benchmarks = (
    ("BioSimLex:", biosimlex_url),
    ("BioSimVerb:", biosimverb_url),
    ("UMNSRS-REL:", umnsrs_rel_url),
    ("UMNSRS-SIM:", umnsrs_sim_url),
  )
  for label, benchmark_url in benchmarks:
    print(label, evaluate(benchmark_url))
  print("Novel Intrinsic Task:", novel_intrinsic_eval())

# Run all four benchmark evaluations plus the novel intrinsic task; the results are printed below.
run_eval()

BioSimLex: SpearmanrResult(correlation=0.7205657589428461, pvalue=6.310315624661459e-153)
BioSimVerb: SpearmanrResult(correlation=0.49286944052215226, pvalue=3.817606785201803e-62)
UMNSRS-REL: SpearmanrResult(correlation=0.5722957648037825, pvalue=1.9411914157533975e-47)
UMNSRS-SIM: SpearmanrResult(correlation=0.6319125266711013, pvalue=5.568675126110056e-59)
Novel Intrinsic Task: 0.16147756576538086


In [None]:
# Evaluate the Bio-SimVerb file on its own; the Spearman result is shown as the cell output.
evaluate(biosimverb_url)

In [None]:
# Draw the PCA and t-SNE figures for every drug in insomnia_drug_names.
plot_insomnia_drugs()

In [None]:
# Show the 20 nearest neighbours (the default n) of every drug, side by side.
get_similarity_table()

In [None]:
# Show the top 20 (the default n) results of the drug + 'effects' - 'medication' query for every drug.
get_effects_table()

In [None]:
# Export the 1000 nearest neighbours of every drug to a CSV on the mounted Google Drive.
path = '/content/drive/My Drive/biowordvec_similarity.csv'
tmp_df = get_similarity_table(1000)

# 'utf-8-sig' writes a byte-order mark at the start of the file.
tmp_df.to_csv(path, encoding='utf-8-sig')




In [None]:
# sensitivity

# Second word-list CSV (drugs-side-effects-4.csv), laid out like the first:
# column 0 holds the drug name, the remaining cells of a row hold its words.
new_url = "https://raw.githubusercontent.com/nlei1/csvs-for-proj/main/drugs-side-effects-4.csv"
# on_bad_lines='skip' is the supported replacement for error_bad_lines=False,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
new_df = pd.read_csv(new_url, header=None, on_bad_lines='skip')
insomnia_drug_names2 = new_df[0].tolist()
# Drug name -> row position in new_df (if a name repeats, the last occurrence wins).
ind_dct2 = {name: row_pos for row_pos, name in enumerate(insomnia_drug_names2)}

def get_words2(drug_name):
  """Return the non-missing cells of the `new_df` row registered for `drug_name`.

  The row is looked up through `ind_dct2`; cells whose string form is 'nan'
  are dropped and the rest are returned in column order.
  """
  row = new_df.iloc[ind_dct2[drug_name]]
  return list(filter(lambda cell: str(cell) != 'nan', row))

def plot_with_labels2(drug_name, pca=True, p_perplexity=1, p_n_iter=1000, n_components=2):
  """Scatter-plot the reduced embeddings of a drug's word list from the second CSV.

  Same procedure as plot_with_labels, except the words come from
  get_words2(drug_name). PCA is used when `pca` is true and t-SNE otherwise;
  the title is the drug name followed by ": PCA" or ": TSNE".
  """
  words_lst = get_words2(drug_name)
  vectors = get_embeddings(words_lst)
  if pca:
    coords = get_pca(vectors, n_components)
  else:
    coords = get_tsne(vectors, p_perplexity, p_n_iter, n_components)
  # Attach each word to its coordinates by joining on the shared 0..n-1 index.
  annotated = coords.join(pd.DataFrame(words_lst, columns=['label']))
  method = ": PCA" if pca else ": TSNE"
  ax = annotated.plot(x='x', y='y', kind='scatter', figsize=(10,10), title=drug_name + method)
  for x_pos, y_pos, text in zip(annotated['x'], annotated['y'], annotated['label']):
    ax.text(x_pos, y_pos, text)

def plot_insomnia_drugs2():
  """Draw a PCA plot followed by a t-SNE plot for every name in `insomnia_drug_names2`."""
  for drug in insomnia_drug_names2:
    for use_pca in (True, False):
      plot_with_labels2(drug, use_pca)

# Draw the PCA and t-SNE figures for every drug in the second word list.
plot_insomnia_drugs2()

In [None]:
# PCA projection of the drug names from df together with four statin names.
words_lst = df[0].tolist() + ['atorvastatin', 'fluvastatin', 'lovastatin', 'pravastatin']
embeddings_lst = get_embeddings(words_lst)
post_reduction = get_pca(embeddings_lst, n_components=2)
post_reduction_annotated = post_reduction.join(pd.DataFrame(words_lst, columns=['label']))
ax = post_reduction_annotated.plot(x='x',y='y',kind='scatter',figsize=(10,10))
# The trailing semicolon keeps the Series of Text objects returned by apply()
# out of the cell output, so only the figure is shown.
post_reduction_annotated[['x','y','label']].apply(lambda x: ax.text(*x),axis=1);

In [None]:
# t-SNE projection of the drug names from df together with four statin names.
words_lst = df[0].tolist() + ['atorvastatin', 'fluvastatin', 'lovastatin', 'pravastatin']
embeddings_lst = get_embeddings(words_lst)
post_reduction = get_tsne(embeddings_lst, p_perplexity=1, p_n_iter=1000, n_components=2)
post_reduction_annotated = post_reduction.join(pd.DataFrame(words_lst, columns=['label']))
ax = post_reduction_annotated.plot(x='x',y='y',kind='scatter',figsize=(10,10))
# The trailing semicolon keeps the Series of Text objects returned by apply()
# out of the cell output, so only the figure is shown.
post_reduction_annotated[['x','y','label']].apply(lambda x: ax.text(*x),axis=1);

In [None]:
# Draw the t-SNE and PCA figures for the forms-of-words list.
forms_of_words_analysis()

In [None]:
# `model` is already a KeyedVectors object, so evaluate_word_pairs is called on it
# directly; the `.wv` indirection is not available on KeyedVectors in gensim 4.
# NOTE(review): /content/Bio-SimLex.txt is not created by any cell in this notebook -
# it has to be uploaded or downloaded before this cell runs.
similarities = model.evaluate_word_pairs('/content/Bio-SimLex.txt')

In [None]:
# Display the result returned by evaluate_word_pairs in the previous cell.
similarities