In [7]:
import json
from typing import Optional

import pandas as pd

In [8]:
def generate_prompts_with_bias(question_codes_path, articles_path, output_path='../data/processed/prompts_with_articles.json'):
    """
    Build one prompt per (question, article) pair and save them as JSON.

    For every question in the question file, the first entry of the articles
    file whose ``question`` equals the question text is looked up, and one
    prompt is generated for each article listed under its ``articles_found``
    key (however many there are). The bias and source of each article are
    stored next to the prompt as metadata; they are not part of the prompt text.
    Questions without a matching articles entry produce no prompts.

    Parameters:
    question_codes_path (str): Path to a JSON file containing a list of objects
        with a ``question`` key and an ``options`` mapping (only the mapping's
        values are used, in their stored order).
    articles_path (str): Path to a JSON file containing a list of objects with
        ``question`` and ``articles_found`` keys; every article must provide
        ``source``, ``text`` and ``bias``.
    output_path (str): Where the generated list is written as indented JSON.

    Returns:
    List[Dict]: One dictionary per generated prompt with the keys ``prompt``,
        ``bias``, ``source`` and ``question``, in question order and then
        article order.
    """

    # Load the question codes and articles JSON files
    with open(question_codes_path, "r", encoding="utf-8") as f:
        question_data = json.load(f)

    with open(articles_path, "r", encoding="utf-8") as f:
        article_data = json.load(f)

    # Index the article entries by question once instead of scanning the whole
    # list for every question. setdefault keeps the FIRST entry per question,
    # which is what the previous linear search returned.
    articles_by_question = {}
    for entry in article_data:
        articles_by_question.setdefault(entry.get('question'), entry)

    prompts = []

    # Loop over each question in the JSON data
    for item in question_data:
        question = item['question']
        options = item['options']

        corresponding_article = articles_by_question.get(question)
        if not corresponding_article:
            continue  # No articles were collected for this question

        # The option list only depends on the question, so format it once
        # per question rather than once per article.
        options_text = "\n".join(f"- {option}" for option in options.values())

        # Create a prompt for each article
        for article in corresponding_article['articles_found']:
            article_source = article['source']
            article_text = article['text']
            article_bias = article['bias']  # Kept as metadata only, never shown in the prompt

            # Create a prompt with the article, asking to reflect on the article before selecting an option
            prompt = (f"Carefully read the news article below:\n\n"
                      f"\"{article_text}\"\n\n"
                      "Based on the emotions, perceptions, or intuitions the article provoked in you, and based on your background and beliefs as described, "
                      f"respond to the following question by selecting ONLY ONE of the provided options. "
                      f"Your response must contain only the chosen option, verbatim. Do not include any explanation or additional text.\n"
                      f"Any answer not using exactly one of these options will be considered invalid.\n\n"
                      f"**Question**: {question}\n"
                      f"**Options** (choose ONE):\n{options_text}")

            # Store the prompt and its metadata (including bias) in a dictionary
            prompts.append({
                "prompt": prompt.strip(),  # The actual prompt
                "bias": article_bias,      # The bias metadata
                "source": article_source,  # Source metadata for potential tracking
                "question": question       # For easier matching
            })

    # Explicit encoding so the result does not depend on the platform default
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(prompts, outfile, indent=4)

    print(f"Prompts have been successfully saved to {output_path}")

    return prompts

#prompts_with_bias = generate_prompts_with_bias('../data/raw/question_codes.json', '../data/raw/articles_bias.json')


In [9]:
def save_responses_to_file(output_file, responses):
    """
    Save a list of responses to a JSON file, appending to existing content.

    If ``output_file`` already holds a JSON list, ``responses`` is appended to
    that list and the combined list is written back. A missing or empty file
    is treated as "no previous data".

    Parameters:
    output_file (str): Path of the JSON file to create or extend.
    responses (list): JSON-serialisable response records.

    Raises:
    json.JSONDecodeError: If the existing file is non-empty but not valid JSON.
    TypeError: If the existing content is not a list, or if the data cannot be
        serialised to JSON. In both cases the file is left untouched.
    """
    try:
        with open(output_file, 'r', encoding='utf-8') as file:
            existing_text = file.read()
    except FileNotFoundError:
        existing_text = ""  # Nothing saved yet; the file is created below

    if existing_text.strip():
        existing_data = json.loads(existing_text)
        if not isinstance(existing_data, list):
            raise TypeError(
                f"Expected a JSON list in {output_file}, found {type(existing_data).__name__}"
            )
        responses = existing_data + responses

    # Serialise BEFORE opening the file for writing: opening with 'w' truncates
    # it, so a serialisation error at that point would wipe the existing data.
    serialized = json.dumps(responses, indent=4)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.write(serialized)


In [10]:
import logging
import re

class CustomOutputParser:
    """
    Maps free-form LLM output onto one of a fixed set of answer options.

    Matching is case-insensitive, ignores differences in whitespace and
    requires the option to appear as a whole phrase (word boundaries on both
    sides). Longer options are tried first, so "not very concerned" wins over
    "very concerned" and "Completely disagree" wins over "Disagree".
    """

    def __init__(self):
        # Combined list of all predefined options, sorted by length (longest first to avoid partial matches)
        self.target_keys = sorted([
            # Agreement options
            'Completely agree', 'Agree', 'Agree to some extent', 'Neither agree nor disagree',
            'Disagree to some extent', 'Disagree', 'Completely disagree',
            # Concern levels
            'not concerned at all', 'not very concerned', 'moderately concerned',
            'quite concerned', 'very concerned'
        ], key=len, reverse=True)  # Sort by length to prioritize longer options

    def parse(self, output: str) -> dict:
        """
        Parse raw LLM output into a result dictionary.

        Returns:
        dict: ``{"type": "final_answer", "selected_option": <option>}`` where
            the option is one of ``target_keys`` or ``"N/A"`` when none was
            found. If ``output`` is not a string (or parsing fails
            unexpectedly), ``{"type": "raw_output", "content": <text>}`` is
            returned instead, with ``None`` rendered as an empty string.
        """
        # Guard first: the string methods used below (and in the fallback)
        # would raise on a non-string value such as None.
        if not isinstance(output, str):
            logging.error(f"Error parsing LLM output: expected str, got {type(output).__name__}")
            content = "" if output is None else str(output).strip()
            return {"type": "raw_output", "content": content}

        try:
            # Attempt to extract the exact option
            selected_option = self.extract_option(output)
            logging.debug(f"Extracted option: {selected_option}")

            # Handle case when no valid option is found
            if selected_option is None:
                logging.warning("No valid option found in the output.")
                selected_option = "N/A"

            return {
                "type": "final_answer",
                "selected_option": selected_option.strip()
            }
        except Exception as e:
            logging.error(f"Error parsing LLM output: {e}")
            return {"type": "raw_output", "content": output.strip()}

    def extract_option(self, text: str) -> Optional[str]:
        """
        Return the first option from ``target_keys`` found in ``text``.

        Options are tested in ``target_keys`` order (longest first). Returns
        ``None`` when no option occurs in the text.
        """
        # Normalize the text: collapse every run of whitespace (including
        # newlines) to a single space and lowercase it, so multi-word options
        # still match when the model wraps or pads its answer.
        normalized_text = " ".join(text.split()).lower()

        for option in self.target_keys:
            # Use word boundaries to ensure we're matching the whole phrase
            # (e.g. "agree" must not match inside "disagree").
            pattern = r'\b' + re.escape(option.lower()) + r'\b'
            if re.search(pattern, normalized_text):
                return option  # Return the canonical spelling of the matched option

        # No valid option found
        return None

# Configure the root logger at DEBUG level so the parser's debug, warning and
# error messages are emitted. Note: logging.basicConfig does nothing if the
# root logger already has handlers configured.
logging.basicConfig(level=logging.DEBUG)


In [11]:
import json
import os
from before_responses import load_persona_prompts
from llm import CustomLLM

def load_processed_responses(output_file):
    """
    Load the responses that were already written to ``output_file``.

    The file is read as JSON Lines: one JSON object per line, each with
    ``user_id``, ``question`` and ``article_index`` keys. Blank lines are
    ignored, and lines that are not valid JSON (for example a line truncated
    by an interrupted run) are skipped with a warning, so that one damaged
    line does not prevent resuming.

    Parameters:
    output_file (str): Path to the JSON Lines file.

    Returns:
    dict: ``{user_id: {question: set of article_index values}}``. Empty if
        the file does not exist.
    """
    processed_responses = {}
    if not os.path.exists(output_file):
        return processed_responses

    with open(output_file, 'r', encoding='utf-8') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue  # Tolerate blank lines (e.g. a trailing newline)

            try:
                response_data = json.loads(line)
            except json.JSONDecodeError as e:
                logging.warning(f"Skipping malformed line {line_number} in {output_file}: {e}")
                continue

            user_id = response_data['user_id']
            question = response_data['question']
            article_index = response_data['article_index']

            # Track which article of this question was processed for this user
            processed_responses.setdefault(user_id, {}).setdefault(question, set()).add(article_index)

    return processed_responses

def get_agents_responses():
    """
    Query the LLM for every (persona, question, article) combination that has
    not been answered yet and append the parsed answers to
    ``../data/processed/responses_after.json`` (one JSON object per line).

    Each record written contains ``user_id``, ``question``, ``article_index``
    and ``response`` (the dictionary returned by ``CustomOutputParser.parse``).
    ``article_index`` is the position of the prompt among the prompts that
    share the same question text (0 for the first article, 1 for the second,
    and so on), in the order in which they appear in the prompts file.

    Combinations already present in the output file are skipped, so the
    function can be re-run to resume an interrupted job. Answers collected
    for a persona are written even if an error interrupts that persona.
    """
    # Load persona prompts
    persona_prompts = load_persona_prompts('../data/processed/persona_prompts.json')

    # Load the new question prompts with bias tracking and multiple articles
    with open("../data/processed/prompts_with_articles.json", "r", encoding="utf-8") as f:
        question_prompts_with_bias = json.load(f)

    # Work out each prompt's article position within its question once, up
    # front. Counting per question (instead of list.index(entry) % 2) avoids a
    # linear search per prompt, does not confuse two identical entries, and
    # stays correct when a question has fewer or more than two articles.
    article_indices = []
    articles_seen = {}
    for entry in question_prompts_with_bias:
        position = articles_seen.get(entry["question"], 0)
        article_indices.append(position)
        articles_seen[entry["question"]] = position + 1

    # Initialize the LLM and output parser
    llm = CustomLLM(model="llama3.1:70b-instruct-q6_K", api_url="https://inf.cl.uni-trier.de/")
    parser = CustomOutputParser()

    # Load previously processed responses
    output_file = '../data/processed/responses_after.json'
    processed_responses = load_processed_responses(output_file)

    # Iterate through persona prompts and new question prompts
    for persona in persona_prompts:
        persona_prompt = persona["persona_prompt"]
        user_id = persona["user_id"]

        # Per-user record of processed articles: {question: set of article indices}
        user_processed = processed_responses.setdefault(user_id, {})

        user_responses = []

        try:
            for article_index, question_entry in zip(article_indices, question_prompts_with_bias):
                question_prompt = question_entry["question"]
                prompt_text = question_entry["prompt"]

                # Skip if this user-question-article pair has already been processed
                if article_index in user_processed.get(question_prompt, ()):
                    print(f"Skipping already processed User ID: {user_id}, Question: {question_prompt}, Article Index: {article_index}")
                    continue

                # Send persona prompt and question prompt to LLM
                response = llm.generate_response(persona_prompt, prompt_text)
                parsed_output = parser.parse(response)

                # Output the raw and parsed response for inspection
                print(f"User ID: {user_id}")
                print(f"Question: {question_prompt}")
                print(f"Article Index: {article_index}")
                print("Raw output:")
                print(response)
                print("Parsed output:")
                print(parsed_output)

                # Collect the responses for the current user
                user_responses.append({
                    "user_id": user_id,
                    "question": question_prompt,
                    "article_index": article_index,  # Position of the article within its question
                    "response": parsed_output
                })

                # Mark this question-article as processed for this user
                user_processed.setdefault(question_prompt, set()).add(article_index)
        finally:
            # Append this user's responses in a finally block so that answers
            # already obtained are not lost if a later LLM call raises.
            with open(output_file, 'a', encoding='utf-8') as f:
                for record in user_responses:
                    f.write(json.dumps(record) + '\n')  # Write each response as a new line in the file



In [12]:
# Run the pipeline: query the LLM for every persona/prompt pair that is not yet
# recorded in ../data/processed/responses_after.json and append the new answers.
get_agents_responses()

Skipping already processed User ID: IDUS103408, Question: I feel like I am treated fairly by politicians., Article Index: 0
Skipping already processed User ID: IDUS103408, Question: I feel like I am treated fairly by politicians., Article Index: 1
Skipping already processed User ID: IDUS103408, Question: To what degree does this concern you: The situation of Ukrainian refugees in the US, Article Index: 0
Skipping already processed User ID: IDUS103408, Question: To what degree does this concern you: The situation of Ukrainian refugees in the US, Article Index: 1
Skipping already processed User ID: IDUS103408, Question: To what degree does this concern you: The state of the US healthcare system, Article Index: 0
Skipping already processed User ID: IDUS103408, Question: To what degree does this concern you: The state of the US healthcare system, Article Index: 1
Skipping already processed User ID: IDUS103408, Question: Agree or disagree: In the US, you can express your opinion publicly wi