In [1]:
## load the SapBERT model and the precomputed GO term embeddings
from semanticSimFunctions import getSentenceEmbedding
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tqdm import tqdm

# The tokenizer and the encoder are both loaded from the same SapBERT checkpoint.
sapbert_checkpoint = 'cambridgeltl/SapBERT-from-PubMedBERT-fulltext'
SapBERT_tokenizer = AutoTokenizer.from_pretrained(sapbert_checkpoint)
SapBERT_model = AutoModel.from_pretrained(sapbert_checkpoint)

import pickle
# Load the GO term embeddings that were produced in TASK1.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('data/all_go_terms_embeddings_dict.pkl', 'rb') as embeddings_file:
    all_go_terms_embeddings_dict = pickle.load(embeddings_file)
# Report how many entries were loaded.
print(len(all_go_terms_embeddings_dict))


In [2]:
# Read the GO term table and print its first rows as a quick check.
go_terms = pd.read_csv(
    'data/go_terms.csv',
    index_col=0,  # the first CSV column is used as the row index
)
print(go_terms.head(5))

In [4]:
# Load the tab-separated LLM gene-count table and preview its first rows.
inputFile = "data/omics_revamped_LLM_genecounts_DF.tsv"
data = pd.read_csv(
    inputFile,
    delimiter='\t',
)
data.head()

In [2]:
# def calc_LLM_JI(supporting_count, query_gene_list, GO_gene_list):
    
#     union = len(set(query_gene_list).union(set(GO_gene_list)))
#     return supporting_count/union if union != 0 else 0
def calc_LLM_JI(supporting_count, query_size, GO_size):
    """Compute a Jaccard-style index from set sizes.

    The index is ``supporting_count / (query_size + GO_size - supporting_count)``,
    i.e. the supporting count is used as the intersection size.

    Args:
        supporting_count: Number of genes counted as supporting (the intersection).
        query_size: Size of the query gene set.
        GO_size: Size of the best matching GO term. If it is smaller than
            ``supporting_count`` it is raised to ``supporting_count`` first.

    Returns:
        The ratio described above, or 0 when the union size is 0.
    """
    # A GO term smaller than the supporting count is treated as having
    # exactly `supporting_count` genes.
    effective_go_size = supporting_count if GO_size < supporting_count else GO_size
    union_size = query_size + effective_go_size - supporting_count
    if union_size == 0:
        return 0
    return supporting_count / union_size


def get_closest_GO(LLM_name, all_go_terms_embeddings_dict):
    """Return the GO term whose embedding is most similar to ``LLM_name``.

    The name is embedded with ``getSentenceEmbedding`` using the module-level
    SapBERT tokenizer and model, then scored by cosine similarity against
    every embedding in ``all_go_terms_embeddings_dict``.

    Args:
        LLM_name: The name to match against the GO terms.
        all_go_terms_embeddings_dict: Mapping from GO term to its embedding.
            Each embedding is passed to ``cosine_similarity`` together with
            the query embedding.

    Returns:
        The key of ``all_go_terms_embeddings_dict`` with the highest score
        (the first such key in iteration order if several tie).
    """
    query_embedding = getSentenceEmbedding(LLM_name, SapBERT_tokenizer, SapBERT_model)
    # Score every GO term against the query embedding.
    similarity_by_term = {
        go_term: cosine_similarity(query_embedding, go_embedding)[0][0]
        for go_term, go_embedding in all_go_terms_embeddings_dict.items()
    }
    return max(similarity_by_term, key=similarity_by_term.get)

In [54]:
# Columns written to the output table.
# NOTE(review): 'GeneList' is exported here while the JI below is computed from
# 'updated GeneList' — confirm that this is the intended column to export.
col_keep = ['Source','GeneSetID','GeneList','n_Genes', 'LLM Name', 'LLM_best_matching_GO','best_matching_GO_ID', 'best_matching_GO_term_genes', 'best_matching_GO_term_size', 'Supporting Count', 'LLM_JI']
output_path = 'data/omics_revamped_LLM_w_best_matching_GO_terms_for_JI.tsv'

# Work on a copy so `data` is left untouched; the result columns start as None.
df = data.copy()
df['LLM_best_matching_GO'] = None
df['best_matching_GO_ID'] = None
df['best_matching_GO_term_genes'] = None
df['best_matching_GO_term_size'] = None
df['LLM_JI'] = None

for ind, row in tqdm(df.iterrows(), total=df.shape[0]):
    LLM_name = row['LLM Name']
    if LLM_name == 'System of unrelated proteins':
        # This gene set was not given a name, so there is nothing to match;
        # its result columns stay None.
        continue

    best_matching_GO = get_closest_GO(LLM_name, all_go_terms_embeddings_dict)

    # Look the matched term up once and reuse the rows for every field
    # (the first matching row is used, as before).
    matched_rows = go_terms.loc[go_terms['Term_Description'] == best_matching_GO]
    gene_list = matched_rows['Genes'].values[0].split(' ')
    term_size = matched_rows['Gene_Count'].values[0]
    assert len(gene_list) == term_size, f'{best_matching_GO}: GO gene list and term size does not match'

    df.loc[ind, 'LLM_best_matching_GO'] = best_matching_GO
    df.loc[ind, 'best_matching_GO_ID'] = matched_rows['GO'].values[0]
    df.loc[ind, 'best_matching_GO_term_genes'] = ' '.join(gene_list)
    df.loc[ind, 'best_matching_GO_term_size'] = term_size

    # calc_LLM_JI works on set sizes, so pass the lengths of the gene lists
    # rather than the lists themselves (its size comparison and arithmetic
    # fail on lists).
    LLM_genes = row['updated GeneList'].split(' ')
    supporting_count = row['Supporting Count']
    JI = calc_LLM_JI(supporting_count, len(LLM_genes), len(gene_list))
    df.loc[ind, 'LLM_JI'] = JI

    # Checkpoint after every processed row so partial results survive an
    # interrupted run.
    df.loc[:, col_keep].to_csv(output_path, sep='\t', index=False)

print('DONE')
# Final write; also covers the case where the last rows were skipped above.
df.loc[:, col_keep].to_csv(output_path, sep='\t', index=False)

In [3]:
# Read the saved results table back from disk and display it.
llm_JI_df = pd.read_csv(
    'data/omics_revamped_LLM_w_best_matching_GO_terms_for_JI.tsv',
    delimiter='\t',
)
llm_JI_df