In [1]:
import pandas as pd
import json 
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score,  prompt_for_name
from utils.server_model_query import server_model_chat
from utils.llm_analysis_utils import process_analysis, save_progress
from utils.genai_query import query_genai_model
from tqdm import tqdm
import constant
import openai
import os
import logging
import re
%load_ext autoreload

%autoreload 2


openai QUERY being run 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


In [2]:
## load variables
initialize = True # if True, then initialize the input table with llm names, analysis and score to None 
# Replace with your actual values
config_file = './jsonFiles/name_only_test_gpt_4.json'  # replace with your actual config file 
input_file = 'data/GO_term_analysis/100_selected_go_contaminated.csv' # replace with your actual input file
input_sep = ','  # replace with the separator
set_index = 'GO'  # replace with your column name that you want to set as index or None
gene_column = 'Genes'  # replace with your actual column name for the gene list
gene_sep = ' '  # replace with your actual separator
gene_features = None  # replace with your path to the gene features or None if you don't want to include in the prompt
direct = False # if True, then the prompt will be a direct sentence asking for a name and analysis from the gene set, otherwise default or customized prompt
out_file = 'data/GO_term_analysis/name_only/LLM_processed_only_name_gpt4'  # replace with your actual output file name


# load the config file
with open(config_file) as json_file:
    config = json.load(json_file)

# Load OpenAI key, context, and model used 
openai.api_key = os.environ["OPENAI_API_KEY"]

context = config['CONTEXT']
model = config['MODEL']
temperature = config['TEMP']
max_tokens = config['MAX_TOKENS']
if model.startswith('gpt'):
    rate_per_token = config['RATE_PER_TOKEN']
    DOLLAR_LIMIT = config['DOLLAR_LIMIT']
LOG_FILE = config['LOG_NAME']+'_log.json'

SEED = constant.SEED
column_prefix = model.split('-')[0]

In [3]:
df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)

genes = df.loc['GO:0061740', 'Genes'].split(' ')
print(prompt_for_name(genes))
prompt_test = prompt_for_name(genes)
openai_chat(context, prompt_test, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)


Propose a brief name for the most prominant biological process performed by the system.
    
Be concise, do not use unneccesary words. Be specific, avoid overly general names such as 'the proteins are involved in various cellular processes'
Be factual, do not editorialize.
    

Here are the interacting proteins:

Proteins: HSPA8, LAMP2, CLU.


117


('Chaperone-Mediated Autophagy (CMA)', None)

In [4]:
# handle the logger so it create a new one for each model run
def get_logger(filename):
    logger = logging.getLogger(filename)
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        file_handler = logging.FileHandler(filename)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger


def main(df):
    analysis_dict  = {}

    logger = get_logger(f'{out_file}.log')

    i = 0 #used for track progress and saving the file
    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        #only process None rows 
        if pd.notna(row[f'{column_prefix} Only Name']):
            continue
        
        gene_data = row[gene_column]
        # if gene_data is not a string, then skip
        if type(gene_data) != str:
            
            logger.warning(f'Gene set {idx} is not a string, skipping')
            continue
        genes = gene_data.split(gene_sep)
        
        if len(genes) >1000:
            logger.warning(f'Gene set {idx} is too big, skipping')
            continue

        try:
            prompt = prompt_for_name(genes)
            # print(prompt)
            finger_print = None
            if model.startswith('gpt'):
                print("Accessing OpenAI API")
                analysis, finger_print = openai_chat(context, prompt, model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)
            elif model.startswith('gemini'):
                print("Using Google Gemini API")
                analysis, error_message = query_genai_model(f"{context}\n{prompt}", model, temperature, max_tokens, LOG_FILE) 
            else:
                print("Using server model")
                analysis, error_message= server_model_chat(context, prompt, model, temperature, max_tokens,LOG_FILE, SEED)

            
            if analysis:
                # print(analysis)
                df.loc[idx, f'{column_prefix} Only Name'] = analysis
                analysis_dict[f'{idx}_{column_prefix}'] = analysis
                # Log success with fingerprint
                logger.info(f'Success for {idx} {column_prefix}.')
                if finger_print:
                    logger.info(f'GPT_Fingerprint for {idx}: {finger_print}')
                    
            else:
                logger.error(f'Error for query gene set {idx}: {error_message}')

        except Exception as e:
            logger.error(f'Error for {idx}: {e}')
            continue
        i += 1
        if i % 10 == 0:
            save_progress(df, analysis_dict, out_file)
            # df.to_csv(f'{out_file}.tsv', sep='\t', index=True)
            print(f"Saved progress for {i} genesets")
    # save the final file
    save_progress(df, analysis_dict, out_file)
    

In [5]:
if __name__ == "__main__":
    
    df = pd.read_csv(input_file, sep=input_sep, index_col=set_index)
    column_prefix = 'gpt4_default'
    print(column_prefix)
    
    if initialize:
        df[f'{column_prefix} Only Name'] = None
    main(df)  ## run with the real set 
    
    df.head()


gpt4_default


FileNotFoundError: [Errno 2] No such file or directory: '/Users/petersapountzis/Desktop/tulaneCBG/EnrichGPT/llm_evaluation_for_gene_set_interpretation/data/GO_term_analysis/name_only/LLM_processed_only_name_gpt4.log'

In [32]:
# check the time 
import json 
with open('logs/name_only_gpt_4_log.json') as json_file:
    data = json.load(json_file)

time_use = data['time_taken_total']
time_per_query = time_use/data['runs']
cost = data['dollars_spent']
dollor_per_query = cost/data['runs']
print(f'Time per query: {time_per_query} seconds')
print(f'Dollor per query: {dollor_per_query} dollars')


In [3]:
## run semantic similarities on GO names vs the llm name
from semanticSimFunctions import getSentenceEmbedding
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tqdm import tqdm
import pickle

SapBERT_tokenizer = AutoTokenizer.from_pretrained('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')
SapBERT_model = AutoModel.from_pretrained('cambridgeltl/SapBERT-from-PubMedBERT-fulltext')

nameOnly_df = pd.read_csv('data/GO_term_analysis/name_only/LLM_processed_only_name_gpt4.tsv', sep='\t', index_col='GO')
with open('./data/all_go_terms_3to100_embeddings_dict.pkl', 'rb') as handle:
    all_go_terms_embeddings_dict = pickle.load(handle)
 

df = nameOnly_df.copy()
df['LLM_name_GO_term_sim'] = None
for ind, row in tqdm(df.iterrows(), total=df.shape[0]):

    GO_term = row['Term_Description'] # the actual GO term
    # get the name column 
    name_col = [col for col in df.columns if 'name' in col.lower()][0]
    # print(name_col)
    LLM_name = row[name_col] # the LLM name
    # print(LLM_name)
    # get llm name embedding
    LLM_name_emb = getSentenceEmbedding(LLM_name, SapBERT_tokenizer, SapBERT_model)
    GO_emb = all_go_terms_embeddings_dict[GO_term]
    # calculate the cosine similarity
    sim = cosine_similarity(LLM_name_emb, GO_emb)[0][0]
    df.loc[ind, 'LLM_name_GO_term_sim'] = sim
df

In [12]:
import matplotlib.pyplot as plt
import seaborn as sns
# compare llm_go sim with the default pipeline 
default_df = pd.read_csv('./data/GO_term_analysis/model_compare/sim_rank_LLM_processed_model_compare_100set_gpt_4.tsv', sep='\t', index_col='GO')

comparison_df = pd.merge(df[['LLM_name_GO_term_sim']], default_df[['LLM_name_GO_term_sim']], left_index=True, right_index=True, how='inner')

# Rename columns for clarity
comparison_df.columns = ['LLM_name_GO_term_sim_nameOnly', 'LLM_name_GO_term_sim_namewithAnalysis']

fig, ax = plt.subplots(figsize=(5,5))
sns.violinplot(data=comparison_df, ax=ax, inner='quartile', cut=0)
ax.set_xticklabels(['Only Name', 'Name with Analysis'], rotation=45)
ax.set_ylabel('Semantic Similarity with GO Term Description')
sns.despine()



In [17]:
combined_df = pd.merge(df, default_df[['gpt_4_default Name', 'LLM_name_GO_term_sim']], left_index=True, right_index=True, how='inner')


In [18]:
for ind, row in tqdm(combined_df.iterrows(), total=combined_df.shape[0]):
    name_only = row['gpt4_default Only Name']
    name_w_ana = row['gpt_4_default Name']
    name_only_emb = getSentenceEmbedding(name_only, SapBERT_tokenizer, SapBERT_model)
    name_w_ana_emb = getSentenceEmbedding(name_w_ana, SapBERT_tokenizer, SapBERT_model)
    sim = cosine_similarity(name_only_emb, name_w_ana_emb)[0][0]
    combined_df.loc[ind, 'Name_only_vs_Name_w_ana_sim'] = sim
    
    
fig, ax = plt.subplots(figsize=(5,5))
sns.violinplot(data=combined_df, y='Name_only_vs_Name_w_ana_sim', ax=ax, inner='quartile', cut=0)