[This notebook shows an example using toy GO terms]

Rank all GO biological processes by the similarity with the LLM term. 

* Percentile score: the % of other GO names that have a smaller semantic similarity with the GPT-4 name than the assigned GO name does



In [2]:
# Load the GO term table (the first CSV column becomes the index) and show its row count.
import pandas as pd

all_go = pd.read_csv(
    'data/go_terms.csv',
    index_col=0,
)
all_go.shape[0]

## Step 1: get the word embeddings for all the GO terms (only needs to be run once)

In [3]:
## create embeddings for all GO Terms and save the embeddings
import pickle

from semanticSimFunctions import getSentenceEmbedding
from transformers import AutoTokenizer, AutoModel
import pandas as pd

# The tokenizer and the encoder are loaded from the same pretrained SapBERT checkpoint.
SapBERT_tokenizer = AutoTokenizer.from_pretrained('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')
SapBERT_model = AutoModel.from_pretrained('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')

all_go = pd.read_csv('data/go_terms.csv', index_col=0)
all_go_terms = all_go['Term_Description'].tolist()

# Map each GO term description to its embedding. The description string is the
# key, so if a description occurs more than once only the last embedding is kept.
all_go_terms_embeddings_dict = {}
for go_term in all_go_terms:
    tensor = getSentenceEmbedding(go_term, SapBERT_tokenizer, SapBERT_model)
    # NOTE(review): assumes getSentenceEmbedding returns a torch.Tensor -- confirm.
    # detach()/cpu() are no-ops for a CPU tensor without grad, but they keep the
    # conversion from raising if the helper returns a tensor that still tracks
    # gradients or lives on an accelerator.
    all_go_terms_embeddings_dict[go_term] = tensor.detach().cpu().numpy()  # Convert to numpy array

# Persist the whole dict so the embeddings do not need to be recomputed.
with open('data/all_go_terms_embeddings_dict.pkl', 'wb') as handle:
    pickle.dump(all_go_terms_embeddings_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Reload the pickled embedding dict and print how many entries it holds.
import pickle

with open('data/all_go_terms_embeddings_dict.pkl', 'rb') as pkl_file:
    all_go_terms_embeddings_dict = pickle.load(pkl_file)

print(len(all_go_terms_embeddings_dict))
# To spot-check a single entry, uncomment:
# all_go_terms_embeddings_dict['cellular response to DNA damage stimulus']

## Step 2: iterate through each GO term and its corresponding LLM term, rank the similarity score of the LLM term against all GO terms, and find where the true GO-LLM term pair falls in the list


When running for the 1000 gene sets, the Python script `rank_GOterm_LLM_sim_rand.py` was run in the background with the command below:

 ```
 python rank_GOterm_LLM_sim_rand.py --input_file data/GO_term_analysis/LLM_processed_selected_1000_go_terms.tsv --emb_file data/all_go_terms_embeddings_dict.pkl --topn 50 --output_file data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv --background_file data/GO_term_analysis/all_go_sim_scores.txt
 ```

The code cell below runs the same script on a toy input and is just an example.

In [4]:
# Run the ranking script on the toy example inside this kernel.
# Arguments passed: the toy input TSV, the pickled GO-term embeddings, --topn 50,
# the output TSV for the ranking result, and a background file path.
# NOTE(review): what the script reads from or writes to --background_file is not visible here -- confirm in the script.
%run rank_GOterm_LLM_sim_rand.py --input_file data/GO_term_analysis/LLM_processed_toy_example.tsv --emb_file data/all_go_terms_embeddings_dict.pkl --topn 50 --output_file data/GO_term_analysis/simrank_LLM_processed_toy_example.tsv --background_file data/GO_term_analysis/toy_all_go_sim_scores.txt

In [6]:
# Sanity check: read the toy ranking output (tab-separated, first column as
# the index) and preview its first rows.
df = pd.read_csv(
    'data/GO_term_analysis/simrank_LLM_processed_toy_example.tsv',
    sep='\t',
    index_col=0,
)
df.head()

### Check the rank similarity result of the 1000 gene sets 

In [11]:
# Summarise the ranking result of the 1000 gene sets: duplicate checks, the
# midpoint of the percentile distribution, and how many true GO terms land in
# the top 10% / top 10 by similarity to the LLM name.
import pandas as pd

rank_sim_df = pd.read_csv('data/GO_term_analysis/simrank_LLM_processed_selected_1000_go_terms.tsv', sep='\t')

## if duplicate: number of rows repeating an earlier 'GO' / 'LLM Analysis' value
print(int(rank_sim_df.duplicated(subset=['GO']).sum()))
print(int(rank_sim_df.duplicated(subset=['LLM Analysis']).sum()))

## half point of the similarity distribution
rank_sim_sorted = rank_sim_df.sort_values(by='true_GO_term_sim_percentile', ascending=False)
# Take the middle row from the actual table size instead of hard-coding 500
# (for 1000 rows this is still position 499, i.e. the 500th-highest percentile).
half_idx = max(len(rank_sim_sorted) // 2 - 1, 0)
print('half of the sample have the percentile score higher than: ',rank_sim_sorted.iloc[half_idx]['true_GO_term_sim_percentile'])

## number of GO terms in top 10% of similarities
# The percentile is described in this notebook as the share of other GO names
# that are LESS similar to the LLM name than the assigned GO name, so larger is
# better and the top 10% are the rows at or above 0.9 (not at or below 0.1).
# NOTE(review): assumes the column is on a 0-1 scale, as the original 0.1 threshold implies -- confirm.
print('number of GO terms in top 10%: ', int((rank_sim_df['true_GO_term_sim_percentile'] >= 0.9).sum()))

## number of GO terms ranked top 10 of similarities
# NOTE(review): whether sim_rank starts at 0 or 1 is not visible here -- confirm against the ranking script.
print('number of GO terms ranked top 10: ', int((rank_sim_df['sim_rank'] <= 10).sum()))

In [None]:
# Order the table from highest to lowest LLM-name/GO-term similarity (the sort
# is applied to rank_sim_df itself), then keep the 25 rows at each end of that
# ordering for manual evaluation.
rank_sim_df.sort_values('LLM_name_GO_term_sim', ascending=False, inplace=True)
top = rank_sim_df.iloc[:25]
bottom = rank_sim_df.iloc[-25:]
combine_df = pd.concat((top, bottom), ignore_index=True)

# Optional step, currently disabled: randomly assign each of the 50 rows to one
# of 5 teams so that every team gets the same number of GO terms.
# team = [1,2,3,4,5]*10
# import random
# random.seed(2023)
# random.shuffle(team)
# combine_df['team'] = team

# Write the 50 selected rows as a tab-separated file without the index column.
combine_df.to_csv(
    'data/GO_term_analysis/best_25_worst_25_similarity_among1000GO.tsv',
    sep='\t',
    index=False,
)
