# Benchmarking GPT-4

In [17]:
import pandas as pd
import numpy as np
from collections import Counter
import json
from dotenv import load_dotenv
import os
import openai
import os
import requests
import azure
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI
from rouge import Rouge
import evaluate
import re 

In [None]:

path = r"../analysis_and_tools_only.csv"
df = pd.read_csv(path)

In [14]:
from azure.identity import DefaultAzureCredential

def token_provider():
    # Create a DefaultAzureCredential instance
    credential = DefaultAzureCredential()
    # Get the token for the Azure OpenAI service
    token = credential.get_token("https://cognitiveservices.azure.com/.default")
    return token.token


In [None]:

token_provider = DefaultAzureCredential()                               

client = AzureOpenAI(
    api_version="2024-02-15-preview",
    azure_endpoint="",
    azure_ad_token_provider=token_provider
)

response = client.chat.completions.create(
    model="gpt-4-0125", # model = "deployment_name".
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Does Azure OpenAI support customer managed keys?"},
        {"role": "assistant", "content": "Yes, customer managed keys are supported by Azure OpenAI."},
        {"role": "user", "content": "Do other Azure AI services support this too?"}
    ]
)

print(response.choices[0].message.content)

TypeError: 'DefaultAzureCredential' object is not callable

In [4]:
df.head()

Unnamed: 0,id,date,title,content,upvote_count,author_id,extracted_keywords,answer_content,answer_upvote_count,category_type,agg_type
0,9535219,2022-08-18,Nextflow printing help message even when param...,Hi Everyone. I was trying to add help section ...,1,48122,"nextflow, align, bwa, alignment",there is no reserved word for 'help'. This is ...,5,"analysis, tool",analysis_and_tool
1,395057,2019-08-20,Stop BWA from storing unmapped reads,I am currently using BWA-MEM to map metagenomi...,2,25073,"bwa, alignment","<pre class=""pre""><code class=""language-bash"">b...",5,"analysis, tool",analysis_and_tool
2,364089,2019-02-14,bwa: fail to locate the index files,"Hi all,I'm trying to align a fastq file to a r...",1,52541,"alignment, bwa","I am not sure, but I think the cause of the er...",3,"analysis, tool",analysis_and_tool
3,6118,2011-03-04,Hmmbuild: How To Choose The Best Alignment For...,"Hi,I wonder whether it's better to remove weak...",7,950,"hmm, hmmer, alignment","Unless your protein be something new, the best...",6,"analysis, tool",analysis_and_tool
4,11534,2011-08-29,Can´T Find The Snps With Samtools (Only Get In...,"Hello everybody, Could anyone tell me how to g...",3,2549,"snp, indel, samtools, alignment","First simple thing to try: <a rel=""nofollow"" h...",8,"analysis, tool",analysis_and_tool


In [6]:
len(df)

71

In [28]:
df["extracted_keywords"].value_counts()[:25].sort_values(ascending=False)

extracted_keywords
bwa, alignment                                    5
RNA-Seq, samtools                                 3
alignment, bwa                                    3
samtools, alignment                               2
alignment, bwa, fastq                             1
bwa, NGS, alignment, hg38, bwa.kit                1
sam, output, bwa, alignment, format               1
bam, samtools, RNA-Seq                            1
bwa, bwt, bwa mem, contig, alignment              1
bwa, bwa-mem, alignment, variant calling, gatk    1
alignment, bwa, bowtie, read, genome              1
nextflow, align, bwa, alignment                   1
alignment, DNA-seq, bwa                           1
hmm, hmmer, alignment, consensus                  1
domain, retrieval, hmmer                          1
bwa, alignment, solid                             1
mirna, bwa, alignment                             1
bwa, sam, alignment, fastq                        1
bwa, alignment, vcf, bam                     

#### GPT-4 calls to QA pairs that are both analysis and tools. 

In [29]:
questions = df["content"].to_list()

np.random.seed(2222)

def gpt4_answer(client, questions, batch_size =10):
    gpt4_results = []
    prompt_check = "Return the response in a paragraph format."
    batch_num = (len(questions) + batch_size -1) // batch_size

    for i in range (batch_num):
        batch_questions = questions[i*batch_size: (i+1) *batch_size]  #calculate start and end index
        for question in batch_questions:
            messages = {"role": "user", "content": question + " " + prompt_check}



            completion_check = client.chat.completions.create(
                model="gpt-4-0125",
                temperature=0.0, 
                messages = [messages]
            )

        # Extract the response content for the first (and presumably only) choice
            if completion_check.choices:
                response_content = completion_check.choices[0].message.content
                print(response_content)
                gpt4_results.append(response_content)
            else:
                gpt4_results.append("")  # Append empty string if no response
       
    
    df = pd.DataFrame({'Question': questions, 'Answer': gpt4_results})
    
    return df

      
final_df = gpt4_answer(client, questions)


print(final_df)


The issue you're encountering with your Nextflow script is that the help message is always printed regardless of whether the `--help` flag is used or not. This happens because the help message is logged outside of any conditional statement that checks for the `--help` flag. In your script, you've set `params.help = false` by default, which is correct, but you haven't provided a mechanism for the user to set this flag to `true` via the command line. 

To resolve this issue, you need to add a condition to check if the `--help` flag is present and then set `params.help` to `true`. However, Nextflow doesn't directly support a `--help` flag in the same way it does for other parameters. Instead, you can use a workaround by checking for the presence of the `--help` flag in the script arguments and then manually setting the `params.help` to `true` before the help message is conditionally printed.

You can achieve this by adding a snippet at the beginning of your script that checks for the `--h

In [30]:
final_df.head()

Unnamed: 0,Question,Answer
0,Hi Everyone. I was trying to add help section ...,The issue you're encountering with your Nextfl...
1,I am currently using BWA-MEM to map metagenomi...,"Yes, you can configure BWA-MEM to output only ..."
2,"Hi all,I'm trying to align a fastq file to a r...",It seems like you're encountering an issue whe...
3,"Hi,I wonder whether it's better to remove weak...",When building a Hidden Markov Model (HMM) for ...
4,"Hello everybody, Could anyone tell me how to g...",It sounds like you're on the right track with ...


In [31]:
len(final_df)

71

In [32]:
final_df.head()

Unnamed: 0,Question,Answer
0,Hi Everyone. I was trying to add help section ...,The issue you're encountering with your Nextfl...
1,I am currently using BWA-MEM to map metagenomi...,"Yes, you can configure BWA-MEM to output only ..."
2,"Hi all,I'm trying to align a fastq file to a r...",It seems like you're encountering an issue whe...
3,"Hi,I wonder whether it's better to remove weak...",When building a Hidden Markov Model (HMM) for ...
4,"Hello everybody, Could anyone tell me how to g...",It sounds like you're on the right track with ...


In [33]:
final_df.to_csv("gpt4_results_analysis_tool_both", index=False)

### Rouge Calculations

In [7]:
path = r"...\03_benchmarking_llms\gpt-4\output_data\gpt4_results_analysis_tool_both"
rouge_gpt4 = pd.read_csv(path)

In [35]:

rouge = evaluate.load('rouge') #https://huggingface.co/spaces/evaluate-metric/rouge
predictions = rouge_gpt4["Answer"].to_list()
references = df["answer_content"].to_list()

results = rouge.compute(predictions=predictions,
                         references=references,
                        use_aggregator=True)

print("ROUGE-1:", round(results["rouge1"], 3))
print("ROUGE-2:", round(results["rouge2"], 3))
print("ROUGE-L:", round(results["rougeL"], 3))
print("ROUGE-Lsum:", round(results["rougeLsum"], 3))




ROUGE-1: 0.183
ROUGE-2: 0.029
ROUGE-L: 0.099
ROUGE-Lsum: 0.109


#### Bootstrapping 100 times

In [45]:
predictions = rouge_gpt4["Answer"].to_list()
references = df["answer_content"].to_list()

# Function to compute ROUGE and return scores
def compute_rouge(predictions, references):
    return rouge.compute(predictions=predictions, references=references, use_aggregator=True)

# Bootstrap sampling
n_iterations = 100
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': [], 'rougeLsum': []}

for i in range(n_iterations): 
    indices = np.random.randint(0, len(predictions), len(references)) #get indicies
    sampled_predictions = [predictions[i] for i in indices]
    sampled_references = [references[i] for i in indices]

    scores = compute_rouge(sampled_predictions, sampled_references)
    for key in rouge_scores.keys():
        rouge_scores[key].append(scores[key])


# Calculate confidence intervals
confidence_intervals = {key: (np.percentile(rouge_scores[key], 2.5),
                               np.percentile(rouge_scores[key], 97.5)) for key in rouge_scores}

# Print the results
print("Confidence Intervals for ROUGE Scores:")
for key, (lower, upper) in confidence_intervals.items():
    print(f"{key}: ({round(lower, 3)}, {round(upper, 3)})")


KeyboardInterrupt: 

## Model Self-Evaluation

#### Similarity Rating

In [9]:
predictions = rouge_gpt4["Answer"].to_list()

references = df["answer_content"].to_list()

questions = df["content"].to_list()

np.random.seed(2222)

def gpt4_evaluation_p1(client, predictions, references, questions, batch_size =10):
    p1_results = []
    prompt_template = "Return an integer from 1 to 5 that rates the similarity between answers {} and {}. A rating of 5 means the two answers are the same. The answer should only contain the number."
    batch_num = (len(questions) + batch_size -1) // batch_size

    for i in range(batch_num):
        batch_start = i * batch_size
        batch_end = min((i + 1) * batch_size, len(questions))
        batch_questions = questions[batch_start:batch_end]
        batch_predictions = predictions[batch_start:batch_end]
        batch_references = references[batch_start:batch_end]

        for question, pred, ref in zip(batch_questions, batch_predictions, batch_references):
            prompt = prompt_template.format(pred, ref)
            messages = [{"role": "user", "content": prompt}]

            completion_check = client.chat.completions.create(
                model="gpt-4-0125",
                temperature=0.0, 
                messages = messages
            )

            if completion_check.choices:
                response_content = completion_check.choices[0].message.content
                p1_results.append(response_content)
            else:
                # Handle case where response is empty or not as expected
                p1_results.append(None)

    
    
    df = pd.DataFrame({'Question': questions, 'References': references, 'Predictions': predictions, 'Similarity_Rating': p1_results})
    
    return df

      
final_df = gpt4_evaluation_p1(client, predictions, references, questions, batch_size =10)
final_df.head()


Unnamed: 0,Question,References,Predictions,Similarity_Rating
0,Hi Everyone. I was trying to add help section ...,there is no reserved word for 'help'. This is ...,The issue you're encountering with your Nextfl...,4
1,I am currently using BWA-MEM to map metagenomi...,"<pre class=""pre""><code class=""language-bash"">b...","Yes, you can configure BWA-MEM to output only ...",5
2,"Hi all,I'm trying to align a fastq file to a r...","I am not sure, but I think the cause of the er...",It seems like you're encountering an issue whe...,2
3,"Hi,I wonder whether it's better to remove weak...","Unless your protein be something new, the best...",When building a Hidden Markov Model (HMM) for ...,3
4,"Hello everybody, Could anyone tell me how to g...","First simple thing to try: <a rel=""nofollow"" h...",It sounds like you're on the right track with ...,2


#### Accuracy Rating

In [10]:
np.random.seed(2222)

def gpt4_evaluation_p2(client, final_df, predictions, references, questions, batch_size =10):
    p2_results = []
    df = final_df
    prompt_template= "Return an integer from 1 to 5 that rates how well the answer {} addresses the question {}. A rating of 1 indicates poorly. The answer should only contain the number."
    
    batch_num = (len(questions) + batch_size -1) // batch_size
    
    for i in range(batch_num):
        batch_start = i * batch_size
        batch_end = min((i + 1) * batch_size, len(questions))
        batch_questions = questions[batch_start:batch_end]
        batch_predictions = predictions[batch_start:batch_end]
        batch_references = references[batch_start:batch_end]

        for question, pred, ref in zip(batch_questions, batch_predictions, batch_references):
            prompt = prompt_template.format(pred, question)
            messages = [{"role": "user", "content": prompt}]

            
            completion_check = client.chat.completions.create(
                model="gpt-4-0125",
                temperature=0.0, 
                messages = messages
            )

            if completion_check.choices:
                response_content = completion_check.choices[0].message.content
                p2_results.append(response_content)
            else:
                p2_results.append("")  # Append empty string if no response
           
    df = final_df
    df["Accuracy"] = p2_results

    return df
      
final_df = gpt4_evaluation_p2(client, final_df, predictions, references, questions, batch_size =10)

final_df.head()

Unnamed: 0,Question,References,Predictions,Similarity_Rating,Accuracy
0,Hi Everyone. I was trying to add help section ...,there is no reserved word for 'help'. This is ...,The issue you're encountering with your Nextfl...,4,5
1,I am currently using BWA-MEM to map metagenomi...,"<pre class=""pre""><code class=""language-bash"">b...","Yes, you can configure BWA-MEM to output only ...",5,5
2,"Hi all,I'm trying to align a fastq file to a r...","I am not sure, but I think the cause of the er...",It seems like you're encountering an issue whe...,2,5
3,"Hi,I wonder whether it's better to remove weak...","Unless your protein be something new, the best...",When building a Hidden Markov Model (HMM) for ...,3,5
4,"Hello everybody, Could anyone tell me how to g...","First simple thing to try: <a rel=""nofollow"" h...",It sounds like you're on the right track with ...,2,5


In [11]:
final_df.head()

Unnamed: 0,Question,References,Predictions,Similarity_Rating,Accuracy
0,Hi Everyone. I was trying to add help section ...,there is no reserved word for 'help'. This is ...,The issue you're encountering with your Nextfl...,4,5
1,I am currently using BWA-MEM to map metagenomi...,"<pre class=""pre""><code class=""language-bash"">b...","Yes, you can configure BWA-MEM to output only ...",5,5
2,"Hi all,I'm trying to align a fastq file to a r...","I am not sure, but I think the cause of the er...",It seems like you're encountering an issue whe...,2,5
3,"Hi,I wonder whether it's better to remove weak...","Unless your protein be something new, the best...",When building a Hidden Markov Model (HMM) for ...,3,5
4,"Hello everybody, Could anyone tell me how to g...","First simple thing to try: <a rel=""nofollow"" h...",It sounds like you're on the right track with ...,2,5


#### Similarity Rating with references and predictions primed

In [12]:
np.random.seed(2244)

def gpt4_evaluation_p3(client, final_df, predictions, references, questions, batch_size =10):
    p3_results = []
    df = final_df
   
    prompt_template = "Return an integer from 1 to 5 that rates the similarity between the references{} and predictions {}. A rating of 5 means the two answers are the same. The answer should only contain the number."
    
    batch_num = (len(questions) + batch_size -1) // batch_size
    
    for i in range(batch_num):
        batch_start = i * batch_size
        batch_end = min((i + 1) * batch_size, len(questions))
        batch_questions = questions[batch_start:batch_end]
        batch_predictions = predictions[batch_start:batch_end]
        batch_references = references[batch_start:batch_end]

        for question, pred, ref in zip(batch_questions, batch_predictions, batch_references):
            prompt = prompt_template.format(pred, question)
            messages = [{"role": "user", "content": prompt}]

            
            completion_check = client.chat.completions.create(
                model="gpt-4-0125",
                temperature=0.0, 
                messages = messages
            )

            if completion_check.choices:
                response_content = completion_check.choices[0].message.content
                print(response_content)
                p3_results.append(response_content)
            else:
                p3_results.append("")  # Append empty string if no response
           
    df = final_df
    df["Similarity_Primed"] = p3_results

    return df

      
final_df = gpt4_evaluation_p3(client, final_df, predictions, references, questions, batch_size =10)

final_df.head()

5
5
5
5
5
5
4
5
4
5
4
4
4
5
5
4
5
4
4
4
5
5
4
4
5
5
5
5
4
4
4
5
5
5
4
5
5
4
5
5
5
5
5
5
5
5
5
5
5
5
5
5
4
5
5
5
4
5
5
4
5
5
4
4
5
5
4
5
4
5
1


Unnamed: 0,Question,References,Predictions,Similarity_Rating,Accuracy,Similarity_Primed
0,Hi Everyone. I was trying to add help section ...,there is no reserved word for 'help'. This is ...,The issue you're encountering with your Nextfl...,4,5,5
1,I am currently using BWA-MEM to map metagenomi...,"<pre class=""pre""><code class=""language-bash"">b...","Yes, you can configure BWA-MEM to output only ...",5,5,5
2,"Hi all,I'm trying to align a fastq file to a r...","I am not sure, but I think the cause of the er...",It seems like you're encountering an issue whe...,2,5,5
3,"Hi,I wonder whether it's better to remove weak...","Unless your protein be something new, the best...",When building a Hidden Markov Model (HMM) for ...,3,5,5
4,"Hello everybody, Could anyone tell me how to g...","First simple thing to try: <a rel=""nofollow"" h...",It sounds like you're on the right track with ...,2,5,5


In [142]:
final_df["Question"][49]

'Dear Members,Is there a way I can removes reads associated with a region (chr, start, end) from a .bam file (RNASeq data) prior to the application of HTSeq?I will greatly appreciate your feedbackNoushin'

In [143]:
final_df["Similarity_Rating"][49]

4

In [144]:
final_df["Accuracy"][49]

5

In [145]:
final_df["Similarity_Primed"][49]

5

In [146]:
final_df["Predictions"][49]

"Dear Noushin,\n\nYes, you can remove reads associated with a specific region (chr, start, end) from a .bam file before applying HTSeq for your RNASeq data analysis. This can be achieved by using tools like SAMtools or BEDTools, which are widely used for manipulating alignments in the SAM/BAM format. First, you would use SAMtools to index your BAM file if it's not already indexed. Then, you can use the 'view' command in SAMtools with the '-U' option to specify an output file for reads not matching the region you want to exclude. Alternatively, BEDTools' 'intersect' function allows you to exclude reads overlapping with a given region when you use the '-v' option. This approach requires you to create a BED file containing the regions you wish to exclude. After filtering out the unwanted reads, you can proceed with your analysis using HTSeq. It's important to ensure that the resulting BAM file is properly sorted and indexed, if necessary, before using it with HTSeq. This preprocessing ste

In [147]:
final_df["References"][49]

'<pre class="pre"><code class="language-bash">bedtools intersect -abam file.bam -b filter.bed -v &gt; filtered.bam/code></pre>filter.bed should containpre class="pre"><code class="language-bash">chr    start     end/code></pre>'

In [13]:
def extract_digit(text):
    text = str(text)  # Convert to string if not already
    match = re.search(r'\b\d\b', text)  # Match a single digit surrounded by word boundaries

    if match:
        return int(match.group())  # Return the matched digit as integer
    else:
        return None  # Return None if no digit found

# Apply the function to the DataFrame column and create a new column "Similarity_Ranking"
final_df['Similarity_Rating'] = final_df['Similarity_Rating'].apply(extract_digit)
final_df['Accuracy'] = final_df['Accuracy'].apply(extract_digit)
final_df['Similarity_Primed'] = final_df['Similarity_Primed'].apply(extract_digit)

In [14]:
final_df.to_csv("gpt4_self_evaluation", index= False)

#### Calculate Average and Median Values

In [116]:
# Calculate average (mean) of the 'Similarity' column
average_sim_accuracy = final_df['Similarity_Rating'].mean()

# Calculate median of the 'Similarity' column
median_sim_accuracy = final_df['Similarity_Rating'].median()

# Calculate average (mean) of the 'Accuracy' column
average_accuracy = final_df['Accuracy'].mean()

# Calculate median of the 'Accuracy' column
median_accuracy = final_df['Accuracy'].median()

# Calculate average (mean) of the 'Similarity Primed' column
average_sim_primed_accuracy = final_df['Similarity_Primed'].mean()

# Calculate median of the 'Similarity Primed' column
median__sim_primed_accuracy = final_df['Similarity_Primed'].median()

print(f"Similarity Average: {average_sim_accuracy}")
print(f"Similarity Median: {median_sim_accuracy}")
print(f"Accuracy Average: {average_accuracy}")
print(f"Accuracy Median: {median_accuracy}")
print(f"Similarity Primed Average: {average_sim_primed_accuracy}")
print(f"Similarity Primed Median: {median__sim_primed_accuracy}")

Similarity Average: 3.211267605633803
Similarity Median: 4.0
Accuracy Average: 4.816901408450704
Accuracy Median: 5.0
Similarity Primed Average: 4.619718309859155
Similarity Primed Median: 5.0


In [122]:
question =  "I have 63 DNA-seq files which I put through the GATK variant calling pipeline (https://gencore.bio.nyu.edu/variant-calling-pipeline-gatk4/)."

index = final_df[final_df['Question'] == question].index
print(index)

Index([], dtype='int64')


In [124]:
pip install fuzzywuzzy

Note: you may need to restart the kernel to use updated packages.


In [127]:
from fuzzywuzzy import fuzz, process
# The question you want to search for
question = "I have 63 DNA-seq files through the GATK variant calling pipeline"

# Use fuzzywuzzy to find the best match and its index
matches = process.extract(question, final_df['Question'], scorer=fuzz.token_sort_ratio, limit=1)

# Extract the best match and its index
best_match, score, idx = matches[0]

# Print the best match and its index
print(f"Best match: {best_match}")
print(f"Score: {score}")
print(f"Index of the best match: {idx}")


Best match: Dear Members,Is there a way I can removes reads associated with a region (chr, start, end) from a .bam file (RNASeq data) prior to the application of HTSeq?I will greatly appreciate your feedbackNoushin
Score: 36
Index of the best match: 49


In [None]:
path = r"./gpt4_self_evaluation"
self_gpt4 = pd.read_csv(path)

In [None]:
self_gpt4.head()