In [22]:
import pandas as pd
import os
import openai
import wandb
import tiktoken
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [3]:
with open('/workspaces/Wikistim-Summarization/api_key.txt', 'r') as file:
    API_KEY = file.read().strip()

In [4]:
data = pd.read_csv('/workspaces/Wikistim-Summarization/00_source_data/uncleaned_text.csv')
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
data['tokenized_text'] = data['full_article_text'].apply(lambda x: encoding.encode(x))
data['count_of_tokens'] = data['tokenized_text'].apply(lambda x: len(x))
longest_9 = data.sort_values(by='count_of_tokens', ascending=False).head(9)
shortest_9 = data.sort_values(by='count_of_tokens', ascending=True).head(9)

## Experiment 

In this experiment, we are testing the difference between the GPT 4 Turbo and GPT 4 Regular models. The goal of this analysis is to understand if GPT 4 Turbo has greater accuracy than GPT 4 Regular to warrant the additional cost. Because GPT 4 Regular has a significantly shorter context window, the papers will be chunked For GPT-4 Regular. The approach involves posing specific questions to GPT-4 for each chunk and utilizing the generated answers as contextual input for subsequent chunks. This sequential questioning and context transfer method are employed to simulate a continuous understanding of the entire document. Each chunk's context is preserved by incorporating the model's responses from the previous iterations, allowing for a more cohesive analysis despite the model's limitations.

This process ensures that, as we move through the document in chunks, GPT-4 Regular is provided with the necessary context from its preceding responses, aiming to maintain a coherent understanding of the entire paper. The goal is to assess how well GPT-4 Regular adapts to sequential questioning and context transfer, and whether it can effectively process information in this manner.

Simultaneously, GPT-4 Turbo undergoes the same sequential questioning process, and its responses are compared with those of GPT-4 Regular. By evaluating the accuracy, coherence, and contextual relevance of both models in this sequential context transfer setting, we aim to understand if the enhanced capabilities of GPT-4 Turbo justify its potential additional cost in scenarios where sequential processing is crucial for maintaining context and generating accurate responses.



We are testing the generation of two features: Number of Participants and Study Design. Specifically, we have two questions that we are posing to GPT 4 Turbo and GPT 4 Regular: 
* First read the following paper. Then guess at what the study design is and return that value. 

* First read the following paper. Then guess at how many participants were in the study and return that value. 


To evaluate performance of GPT 4 Turbo, we simply calculate the accuracy. For GPT 4 Regular, we calculate accuray using two different methodologies. One is binary and counts whether the model generates the correct answer from ANY chunk. The other counts the number of correctly generated answers across all of the chunks. If the model generates the correct answer more often then not, then that is considered a true positive. 

Below are the results followed by conclusions, next steps, and code. 

### GPT 4 Turbo

| Data      | Study Design Accuracy | Number of Participants Accuracy |
|-----------|------------------------|---------------------------------|
| 9 Long    |          88.89%              |                 100%                |
| 9 Short   |              88.89%          |                     77.76%            |


### GPT 4 Regular

| Data      | Study Design Accuracy (Correct Answer Anywhere) | Study Design Accuracy (By Vote) | Number of Participants Accuracy (Correct Answer Anywhere) | Number of Participants Accuracy (By Vote) |
|-----------|--------------------------------------------------|----------------------------------|-----------------------------------------------------------|-------------------------------------------|
| 9 Short    |   88.89%                                               |          77.78%                        |                     88.89%                                      |                    66.67%                       |
| 9 Long   |          88.89%                                        |             77.78%                     |                                   100%                        |                             22.22%              |


Example usage: 

In [69]:
from openai import OpenAI
import time
import pandas as pd

def chunk_text(text, max_tokens=4096):
    chunks = []
    current_chunk = ""

    for sentence in text.split(". "):  # Assuming sentences end with a period
        if len(current_chunk) + len(sentence) < max_tokens:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

def generate_prompt(chunk, context):
    prompt = f'First read the following paper. Then guess at how many participants were in the study and return that value."{context}" "{chunk}"'
    return prompt

def evaluate_model_on_dataset(api_key, model_name, prompt, dataset, column):
    client = OpenAI(api_key=api_key)

    evaluation_df = pd.DataFrame(columns=['correct_answer', 'final_gpt_answer', 'cosine_similarity'])

    previous_answer = ""  # Initialize the previous answer

    responses = []  # List to store responses for each chunk in each iteration

    for index, row in dataset.iterrows():
        print('this is index: ', index)
        full_text = row['full_article_text']

        # Chunk the text into smaller pieces
        text_chunks = chunk_text(full_text)

        chunk_responses = []  # List to store responses for each chunk in this iteration

        for i, chunk in enumerate(text_chunks):
            print(f'Processing chunk {i+1}/{len(text_chunks)}')
            context = previous_answer  # Use the previous answer as context

            try:
                # Introduce a lag (e.g., 2 seconds) before making the API call
                time.sleep(2)

                completion = client.chat.completions.create(
                    model=model_name,
                    messages=[
                        {"role": "user", "content": generate_prompt(chunk, context)}
                    ]
                )

                correct_answer_value = dataset.loc[index, column]

                # Extract the content from the response
                gpt_answer_value =  completion.choices[0]["message"]["content"]
                
                cosine_similarity_value = 0

                print(gpt_answer_value)

                # Update the previous answer for the next iteration
                previous_answer = gpt_answer_value

                # Store the response for this chunk
                chunk_responses.append(gpt_answer_value)

            except Exception as e:
                print(f"Error processing chunk {i+1}: {str(e)}")
                # Handle the error as needed, e.g., log it or skip the chunk

        # Store all chunk responses for this iteration
        responses.append(chunk_responses)

    # Update the final_gpt_answer in the DataFrame with the last response in each iteration
    for i, iteration_responses in enumerate(responses):
        evaluation_df.loc[evaluation_df.index.isin(range(i * len(text_chunks), (i + 1) * len(text_chunks))),
                        'final_gpt_answer'] = iteration_responses[-1]

    # Add the correct_answer values to the DataFrame
    evaluation_df['correct_answer'] = dataset[column]

    return evaluation_df, responses


In [70]:
test_shortest_9 = shortest_9.head(2)

In [72]:
chunk_text(test_shortest_9['full_article_text'].iloc[0])

['Clinical/Scientific Notes W. Thevathasan, FRACP* P. Mazzone, MD* A. Jha, MRCP A. Djamshidian, MD M. Dileone, MD V. Di Lazzaro, MD P. Brown, FRCP SPINAL CORD STIMULATION FAILED TO RELIEVE AKINESIA OR RESTORE LOCOMOTION IN PARKINSON DISEASE Dorsal column spinal stimulation in dopamine- depleted rodents was recently reported to disrupt pathologic corticostriatal synchronization, alleviate akinesia, and restore locomotion.1 This claim has prompted consideration that spinal stimulation “might become an efficient and less invasive alterna- tive for treatment of Parkinson disease (PD) in the future.” In this study, we investigated whether dorsal col- umn stimulation was of therapeutic benefit in 2 pa- tients with PD. Level of evidence. This study provides Class II evi- dence that for patients with moderate to severe mo- tor impairment from PD, high-frequency epidural cervical spinal cord stimulation does not significantly improve motor function as measured by the motor subsection of the Uni

In [71]:
# example usage 
model_name = "gpt-3.5-turbo" 
prompt = "First read the following paper. Then guess at how many participants were in the study and return that value."
dataset = test_shortest_9  
column = 'Number in study' 

result_df, resukt_responses = evaluate_model_on_dataset(API_KEY, model_name, prompt, dataset, column)

this is index:  14
Processing chunk 1/3
Error processing chunk 1: 'Choice' object is not subscriptable
Processing chunk 2/3
Error processing chunk 2: 'Choice' object is not subscriptable
Processing chunk 3/3


KeyboardInterrupt: 

In [52]:
shortest_9.index

pandas.core.indexes.base.Index

In [49]:
shortest_9.index

Index([14, 16, 4, 12, 2, 13, 17, 10, 11], dtype='int64')

In [55]:
resukt_responses[1]

[ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
 ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
 ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
 ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
 ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, too

In [58]:
shortest_9.index[1]

16

In [60]:
resukt_responses.index

<function list.index(value, start=0, stop=9223372036854775807, /)>

In [65]:
to_find = 'ChatCompletionMessage(content=\'It is not possible to determine the exact number of participants in the study based on the provided information.\', role=\'assistant\', function_call=None, tool_calls=None)'

In [68]:
resukt_responses[1]

[ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
 ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
 ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
 ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
 ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, too

In [67]:
resukt_responses[1].index(to_find)

ValueError: "ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None)" is not in list

In [63]:
list(resukt_responses).index

<function list.index(value, start=0, stop=9223372036854775807, /)>

In [64]:
resukt_responses

[[ChatCompletionMessage(content='Based on the information provided, there were 2 participants in the study.', role='assistant', function_call=None, tool_calls=None),
  ChatCompletionMessage(content='Based on the information provided, it is not possible to determine the exact number of participants in the study.', role='assistant', function_call=None, tool_calls=None),
  ChatCompletionMessage(content='Based on the information provided, it is not possible to determine the exact number of participants in the study.', role='assistant', function_call=None, tool_calls=None)],
 [ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
  ChatCompletionMessage(content='It is not possible to determine the exact number of participants in the study based on the provided information.', role='assistant', function_call=None, tool_calls=None),
  ChatCompletio

In [None]:
vectorizer = TfidfVectorizer()

for respon in resukt_responses:
    tfidf_matrix = vectorizer.fit_transform([text1, text2])

    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [7]:
result_df

Unnamed: 0,correct_answer,final_gpt_answer,cosine_similarity
14,2,,
