#### Remove the echoed prompt from early LLM responses

In [None]:
# Prompt fragments. `question_head` and `question_tail` are also searched for verbatim in stored
# responses (see check_prompt_in_response) to detect a prompt echoed back by the model, so these
# strings must stay byte-identical to what was actually sent.
question_head = "You are a medical expert. Rate the following transcript of a YouTube video according to this question:"
# The 15 rating questions; their position (1-based) corresponds to the Q1..Q15 / Response_1..Response_15
# columns used below.
# NOTE(review): wording looks like the DISCERN questionnaire — confirm before citing it as such.
questions = [
    "Are the aims of the video clear?",
    "Does the video achieve it's aims?",
    "Is the video relevant?",
    "Is the video clear what sources of information were used to compile the publication (other than the author)?",
    "Is the video clear when the information used or reported in the transcript was produced?",
    "Is the video balanced and unbiased?",
    "Does the video provide details of additional sources of support and information?",
    "Does the video refer to areas of uncertainty?",
    "Does the video describe how each treatment works?",
    "Does the video describe the benefits of each treatment?",
    "Does the video describe the risks of each treatment?",
    "Does the video describe what would happen if no treatment is used?",
    "Does the video describe how the treatment choices affect overall quality of life?",
    "Is the video clear that there may be more than one possible treatment choice?",
    "Does the video provide support for shared decision-making?",
]
# Closing instruction of the prompt: asks for a 1-5 integer score followed by an explanation.
question_tail = "Return an integer score from 1 to 5, where 1 means 'no', 2 to 4 means 'partially', and 5 means 'yes'. Then, explain your choice."

In [None]:
import re

def remove_prompt_from_output(output):
    """Return the part of `output` that follows the first "Score:" marker.

    The marker is treated as the end of an echoed prompt. If `output` does not
    contain it, `output` is returned unchanged.
    """
    _echoed_prompt, marker, answer = output.partition("Score:")
    # partition() yields an empty separator when the marker is absent, in which
    # case there is nothing to strip.
    return answer if marker else output

def check_repetition(text, min_words=5, min_occurrences=2):
    """
    Tell whether 'text' contains a repeated phrase.

    The text is split into sentences, and every run of 'min_words' consecutive words inside a
    sentence is counted (runs never span two sentences). Returns True when at least one such run
    occurs more than 'min_occurrences' times, otherwise False.
    Some models are repeating some phrases in their output.
    """
    # Split on whitespace that follows '.' or '?', except after abbreviation-like patterns.
    sentence_boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'

    occurrences = {}    # phrase -> number of times it was seen
    for sentence in re.split(sentence_boundary, text):
        tokens = sentence.split()
        last_start = len(tokens) - min_words    # negative when the sentence is too short
        for start in range(last_start + 1):
            window = ' '.join(tokens[start:start + min_words])
            occurrences[window] = occurrences.get(window, 0) + 1

    return any(seen > min_occurrences for seen in occurrences.values())

def extract_and_store_score(llm_output, df, video_id, question_num):
    """
    Pull the score out of an LLM response and write it into 'df' (modified in place).

    The first digit between 0 and 5 found anywhere in 'llm_output' is taken as the score and
    stored in column Q<question_num> of the rows whose 'Video ID' equals 'video_id'; a 0 is
    stored as 1. When no such digit exists, 'question_num' is written to the 'Problem' column
    of those rows instead.
    After question 15, the rows are additionally flagged with Problem = True when eight or more
    of Q1..Q15 equal 1.
    NOTE: Prompt should have been removed already from the response beginning.
    """
    found = re.search(r'([0-5])', llm_output)
    video_rows = df['Video ID'] == video_id

    if found is None:
        # No usable digit: record which question failed.
        df.loc[video_rows, 'Problem'] = [question_num]
    else:
        digit = int(found.group(1))
        # The pattern only admits 0-5, so clamping 0 up to 1 covers every case.
        df.loc[video_rows, f'Q{question_num}'] = max(digit, 1)

    if question_num != 15:
        return

    # Last question reached: sanity-check the whole row of scores.
    score_columns = [f'Q{i}' for i in range(1, 16)]
    ones_total = df[video_rows][score_columns].eq(1).sum().sum()
    if ones_total >= 8:   # many ones usually mean the prompt was still in the response
        df.loc[video_rows, 'Problem'] = True

def store_and_check_response(llm_output, question_num, llm_scores_df, video_id, print_response=True):
    """
    Save one LLM response for a video and refresh its derived columns in 'llm_scores_df'.

    For the rows whose 'Video ID' equals 'video_id' this writes the response text to
    Response_<question_num>, sets 'Problem' to the result of check_repetition() for this
    response, and then delegates to extract_and_store_score() to fill Q<question_num>.
    The 'Problem' value is overwritten on every call, so it reflects the most recent response
    only. Optionally echoes the response to stdout first.
    """
    if print_response:
        print(f"Q{question_num} response: {llm_output}")

    video_rows = llm_scores_df['Video ID'] == video_id

    # 1) keep the raw response
    llm_scores_df.loc[video_rows, f'Response_{question_num}'] = llm_output
    # 2) flag repeated phrases
    has_repetition = check_repetition(llm_output)
    llm_scores_df.loc[video_rows, 'Problem'] = has_repetition
    # 3) parse the score out of the response
    extract_and_store_score(llm_output, llm_scores_df, video_id, question_num)

def check_prompt_in_response(response):
    """Return True only when `response` contains every marker of an echoed prompt.

    The markers are the literal 'Instruction', the module-level `question_head` and
    `question_tail` strings, 'Transcript: ' and 'Score:'. Checks stop at the first
    missing marker.
    """
    if 'Instruction' not in response:
        return False
    if question_head not in response:
        return False
    if question_tail not in response:
        return False
    if 'Transcript: ' not in response:
        return False
    return 'Score:' in response

In [None]:
import pandas as pd
import os

# Strip echoed prompts from every response CSV in ./Original_Responses and write the cleaned
# copy, under the same file name, to ./Fixed_Responses.
input_dir = './Original_Responses'      # named input_dir so the builtin dir() is not shadowed
output_dir = './Fixed_Responses'
os.makedirs(output_dir, exist_ok=True)  # DataFrame.to_csv does not create missing directories

files = os.listdir(input_dir)
csv_files = [file for file in files if file.endswith('.csv')]

for file in csv_files:
    path_to_file = os.path.join(input_dir, file)
    response_df = pd.read_csv(path_to_file, encoding='utf-8')
    for video_id in response_df['Video ID'].values:

        for question_num in range(1, 16):
            # str() turns a missing (NaN) response into 'nan', which simply fails the prompt check.
            response = str(response_df.loc[response_df['Video ID'] == video_id, f'Response_{question_num}'].iloc[0])

            # Only responses that still contain the full prompt are rewritten and re-scored.
            if check_prompt_in_response(response):
                print(f"File: {file} | video ID: {video_id} | Question {question_num}")
                response = remove_prompt_from_output(response)
                store_and_check_response(response, question_num, response_df, video_id, print_response=False)

    new_file_path = os.path.join(output_dir, file)
    response_df.to_csv(new_file_path, index=False, encoding='utf-8')

#### Replace the 'True'/'False' values in the `Problem` column with the set of question numbers that have a problem
#### Add a `Transcript` column to `responses_df`
A question is counted as a problem when its response is missing, contains no integer score, or contains repeated phrases.

In [2]:
import pandas as pd
import time
import os
import sys
sys.path.append('../../../Codes/LLM_Evaluations')  # Add the parent directory of LLM_Evaluations to the Python path

from llm_evaluation_utils import \
                        extract_score,   \
                        check_repetition,      \
                        QUESTION_HEAD, QUESTIONS, QUESTION_TAIL

In [21]:
# CSV holding one transcript per video; only the 'Video ID' and 'Transcript' columns are loaded.
transcripts_file_path = "../../../Codes/Getting_Transcripts/filtered_videos_transcripts.csv"
# Response CSVs are looked for in the parent of the current working directory.
responses_dir = ".."

videos_df = pd.read_csv(transcripts_file_path, usecols=['Video ID', 'Transcript'], encoding='utf-8')
# Snapshot of the directory listing; the cells below filter it down to '.csv' files.
files_in_directory = os.listdir(responses_dir)


In [None]:
# For every response CSV: add the video transcript (if missing), rebuild the 'Problem' column as
# the set of question numbers whose response is unusable, and save the file in place.

# Video ID -> Transcript lookup. Transcripts are matched by 'Video ID' rather than by row
# position, so files whose rows are ordered differently from (or are a subset of) videos_df
# still get the right transcript. Duplicate IDs are dropped because map() needs a unique index.
transcript_by_video_id = videos_df.drop_duplicates(subset='Video ID').set_index('Video ID')['Transcript']

for file_name in files_in_directory:
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(responses_dir, file_name)
    responses_df = pd.read_csv(file_path, encoding='utf-8')
    if 'Transcript' not in responses_df.columns:
        # Video IDs without a known transcript get NaN.
        responses_df.insert(1, 'Transcript', responses_df['Video ID'].map(transcript_by_video_id))
    print(file_name)

    problem_sets = pd.Series(dtype='object')

    for index, row in responses_df.iterrows():
        questions_with_problem = set()
        for i in range(1, 16):
            response = row[f'Response_{i}']
            # A question is a problem when its response is missing, has no extractable score,
            # or contains repeated phrases.
            if pd.isna(response) or extract_score(response) is None or check_repetition(response):
                questions_with_problem.add(i)

            cell = row[f'Q{i}']
            if isinstance(cell, float) and pd.notna(cell):
                # Convert the float to an integer
                responses_df.at[index, f'Q{i}'] = int(cell)

        problem_sets.at[index] = questions_with_problem

    # Assign the Series to the DataFrame as a new column 'Problem'
    responses_df['Problem'] = problem_sets

    responses_df.to_csv(file_path, index=False, encoding='utf-8')

In [None]:
# Re-save every response CSV with the Q1..Q15 score columns stored as nullable integers.
for file_name in files_in_directory:
    if not file_name.endswith('.csv'):
        continue

    file_path = os.path.join(responses_dir, file_name)
    responses_df = pd.read_csv(file_path, encoding='utf-8')

    for i in range(1, 16):
        score_column = f'Q{i}'
        # Non-numeric entries become missing values; 'Int64' keeps them as <NA> instead of
        # forcing the whole column to float.
        as_numbers = pd.to_numeric(responses_df[score_column], errors='coerce', downcast='integer')
        responses_df[score_column] = as_numbers.astype('Int64')

    print(responses_df.head())
    responses_df.to_csv(file_path, index=False, encoding='utf-8')
    time.sleep(5)   # pause between files