In [1]:
import os
import pandas as pd
import json 
import openai

In [2]:
import math

In [3]:
from utils.openai_query import openai_chat
from utils.prompt_factory import make_user_prompt_with_score
from utils.llm_analysis_utils import process_analysis, save_progress

In [4]:
# Take the OpenAI API key from the process environment rather than hard-coding it.
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"] # Environment variable

In [5]:
# Run mode. "initial" creates empty result columns before querying;
# "additional" resumes and skips rows whose 'LLM Name' is already a string.
runVersion = "additional"  # or "initial"

In [6]:
# --- Input / output configuration -------------------------------------------
# NOTE: inputFilePath and outputFilePath are the same file, so results are
# written back in place (this is what lets an "additional" run resume).
# Earlier input used here: "data/omics_revamped.txt"
inputFilePath  = "data/omics_revamped_LLM_DF.tsv"
outputFilePath = "data/omics_revamped_LLM_DF.tsv"
jsonFilePath   = "jsonFiles/OmicsRunLLM.json"

# Column holding the gene list, the separator used inside it, and the column
# holding the gene-set name.
genesCol = "GeneList"
geneSep  = " "
nameCol  = "GeneSetName"

In [7]:
# Read the run configuration from the JSON file named by jsonFilePath.
with open(jsonFilePath) as json_file:
    config = json.load(json_file)

# Unpack the query settings; lookups happen in the same order as the keys are listed.
context, gpt_model, temperature, max_tokens, rate_per_token = (
    config[key]
    for key in ("CONTEXT", "GPT_MODEL", "TEMP", "MAX_TOKENS", "RATE_PER_TOKEN")
)

# Log file name = configured prefix + fixed date stamp + suffix.
LOG_FILE = config["LOG_NAME"] + "240129" + "log.json"
DOLLAR_LIMIT = config["DOLLAR_LIMIT"]

In [8]:
# Seed value handed to openai_chat as its final argument in the query loop.
# NOTE(review): presumably used to make completions more reproducible — confirm in utils.openai_query.
SEED = 42

In [9]:
# Echo the model name loaded from the JSON config so it is visible in the cell output.
gpt_model

### Run GPT-4 query pipeline for omics gene sets

In [10]:
# Load the tab-separated gene-set table that the query loop reads and updates.
df = pd.read_csv(inputFilePath, sep="\t")

In [11]:
# On a first ("initial") run, create the result columns empty.
# Any other run mode leaves the columns already present in df untouched.
if runVersion == "initial":
    for result_column in ('LLM Name', 'LLM Analysis', 'Score'):
        df[result_column] = None

In [12]:
# Query the LLM once per gene set (one row of df) and write the parsed name,
# analysis text and score back into df.
#
# Progress is checkpointed to outputFilePath every 10 rows, and the try/finally
# guarantees one more save if the loop stops early (API error, exceeded budget,
# kernel interrupt), so results already obtained are not thrown away.
try:
    for i, row in df.iterrows():

        # Resume support: in "additional" mode a row whose 'LLM Name' is already a
        # string was completed by an earlier run. Check this first so finished rows
        # cost nothing and cannot fail on their gene list.
        if runVersion == "additional" and isinstance(row['LLM Name'], str):
            continue

        term_genes = row[genesCol]
        genes = term_genes.split(geneSep)

        prompt = make_user_prompt_with_score(genes)

        analysis, finger_print = openai_chat(context, prompt, gpt_model, temperature, max_tokens, rate_per_token, LOG_FILE, DOLLAR_LIMIT, SEED)

        if analysis:
            llm_name, llm_score, llm_analysis = process_analysis(analysis)
            # Convert the score BEFORE writing anything to df: if the conversion
            # raises, the row stays untouched instead of ending up with a name but
            # no score (which the resume check above would then skip forever).
            score = float(llm_score)
            df.loc[i, 'LLM Name'] = llm_name
            df.loc[i, 'LLM Analysis'] = llm_analysis
            df.loc[i, 'Score'] = score

        else:
            # Falsy response from openai_chat: report it and leave the row empty
            # so a later "additional" run retries it.
            name = row[nameCol]
            print(f'No analysis for {name}')
            df.loc[i, 'LLM Name'] = None
            df.loc[i, 'LLM Analysis'] = None

        # Periodic checkpoint so data is not lost if something happens.
        # `i` is the DataFrame index label of the current row.
        if (i % 10 == 1):
            print(i)
            df.to_csv(outputFilePath, sep="\t", index=False)
finally:
    # Always persist what has been collected so far, whether the loop finished
    # normally or was cut short by an exception or interrupt.
    df.to_csv(outputFilePath, sep="\t", index=False)
    

In [15]:
# Final write of the full table (tab-separated, without the index column).
df.to_csv(outputFilePath, sep="\t", index=False)
