In [47]:
import os
import sys

# Get the absolute path to the parent directory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, parent_dir)

import pandas as pd
from dataclasses import dataclass
from collections import defaultdict
import re
from typing import List, Dict, Union, Any # union for [likert_score, text]
from chatbot_api.providers.openai import OpenAIProvider
from chatbot_api.base import Role, Message

@dataclass
class Argument:
    """A single condensed argument plus the evidence collected for it.

    Instances are plain data holders: they bundle the headline statement,
    its supporting subpoints, the per-score Likert counts and the comment
    records that back the argument.
    """

    # Headline statement of the argument.
    main: str

    # Finer-grained points that elaborate on the headline statement.
    subpoints: List[str]

    # Response counts per Likert score, ordered [score 1, 2, 3, 4, 5].
    likert_distribution: List[int]

    # One entry per supporting comment, laid out as [comment, index, likert_score].
    supporting_comments: List[List[Union[str, int]]]



In [48]:
class ArgumentProcessor:
    """Condenses Likert-scored comments into structured arguments using an LLM.

    Workflow: build a prompt from the comments, send it to the LLM provider,
    then parse the tagged response back into ``Argument`` objects. The Likert
    distribution stored on each argument is always recomputed locally from the
    comment indices the LLM cited; the distribution written by the LLM is only
    used to print a warning when it disagrees.

    Each comment record is expected to be ``[text, index, likert_score]``.
    """

    def __init__(self, llm_provider: "OpenAIProvider", batch_size: int):
        """Store the provider and batch size.

        Args:
            llm_provider: Object whose ``generate(messages, temperature=...)``
                coroutine is awaited in ``process_batch``.
            batch_size: Stored on the instance; no method of this class reads it.
        """
        self.llm_provider = llm_provider
        self.batch_size = batch_size
        # Replaced by _create_initial_prompt. Initialised here so the lookup
        # helpers simply find nothing (instead of raising AttributeError) when
        # a response is parsed before any prompt has been created.
        self.comments_data: List[List[Union[str, int]]] = []

    async def _create_initial_prompt(self, comments_data: List[List[Union[str, int]]], topic: str) -> str:
        """Build the first-pass prompt and remember the comments for later lookups.

        Side effect: ``self.comments_data`` is set to ``comments_data`` so that
        parsing can resolve the comment indices cited by the LLM.

        Args:
            comments_data: Records laid out as ``[text, index, likert_score]``.
            topic: Statement the comments respond to; inserted into the prompt.

        Returns:
            The prompt text.
        """
        # Format comments with their Likert scores
        formatted_comments = "\n".join(
            f"Comment {comment[1]}: {comment[0]} [Index: {comment[1]}, Likert Score: {comment[2]}]"
            for comment in comments_data
        )
        self.comments_data = comments_data  # Store comments for later use

        return f"""
            ### Instructions:
            1. Review the following comments and their Likert scores (1-5) on the topic: "{topic}"
            2. **Identify and categorize arguments into main topics**, tracking which comments support each argument.
            3. For each argument, carefully calculate the Likert distribution based on the scores of all supporting comments. **Double-check the calculation for consistency.**
            4. Ensure each argument includes:
               - A clear main point
               - Supporting subpoints
               - Distribution of Likert scores from supporting comments
               - List of indices of supporting comments
            5. Avoid redundancy by combining similar arguments
            6. Provide output in English, even if input is not
            
            ### Output format:
            <ARGUMENTS>
            MAIN: [Main Argument]
            SUB: [Supporting subpoint]
            DISTRIBUTION: [0, 0, 0, 0, 0] where the numbers represent counts of Likert scores 1-5, with index 0 corresponding to score 1, index 1 to score 2, etc.
            COMMENTS: [comment_index1,comment_index2,...] # indices of supporting comments
            </ARGUMENTS>

            ### Comments to analyze:
            {formatted_comments}"""

    async def _create_redundancy_prompt(self, arguments: List["Argument"]) -> str:
        """Build the second-pass prompt asking the LLM to merge overlapping arguments.

        Args:
            arguments: Arguments produced by a previous parsing step.

        Returns:
            The prompt text, with the arguments embedded in the tagged format.
        """
        formatted_args = self._format_arguments_for_prompt(arguments)

        return f"""You are an expert at analyzing and organizing complex arguments while preserving their supporting data.

            ### Task
            Analyze and reorganize these arguments to create a more coherent structure while maintaining accurate tracking of supporting comments and Likert distributions.

            ### Guidelines
            1. When combining arguments:
               - Merge their supporting comment lists
               - Add their Likert distributions
               - Ensure no comment is counted twice
            
            2. Main Arguments should:
               - Be distinct and broad enough for multiple subpoints
               - Maintain clear links to supporting comments
            
            3. Track supporting data:
               - Preserve all comment indices when reorganizing
               - Update Likert distributions when merging arguments
               - Ensure distributions match supporting comments

            ### Input Arguments:
            {formatted_args}

            ### Output Format
            <ARGUMENTS>
            MAIN: [Main Argument]
            SUB: [Supporting subpoint]
            DISTRIBUTION: [n1,n2,n3,n4,n5]
            COMMENTS: [comment_index1,comment_index2,...]
            </ARGUMENTS>"""

    def _format_arguments_for_prompt(self, arguments: List["Argument"]) -> str:
        """Serialise arguments into the tagged MAIN/SUB/DISTRIBUTION/COMMENTS format.

        Only the index of each supporting comment is written, not its text.
        """
        pieces = ["<ARGUMENTS>\n"]
        for arg in arguments:
            pieces.append(f"MAIN: {arg.main}\n")
            pieces.extend(f"SUB: {sub}\n" for sub in arg.subpoints)
            pieces.append(f"DISTRIBUTION: {arg.likert_distribution}\n")
            pieces.append(f"COMMENTS: {[comm[1] for comm in arg.supporting_comments]}\n")
        pieces.append("</ARGUMENTS>")
        return "".join(pieces)

    async def _parse_argument_structure(self, text: str) -> List["Argument"]:
        """Parse an LLM response into a list of arguments.

        Every ``<ARGUMENTS>`` block in the response is read (a missing closing
        tag is tolerated), so a response that wraps each argument in its own
        block is not truncated to its first argument. Within the blocks, a
        ``MAIN:`` line starts a new argument and ``SUB:``, ``DISTRIBUTION:``
        and ``COMMENTS:`` lines attach to the most recent ``MAIN:``.

        Args:
            text: Raw response text from the LLM.

        Returns:
            Parsed arguments in response order; an empty list when the text
            is not a string or contains no ``<ARGUMENTS>`` tag.
        """
        if not isinstance(text, str) or "<ARGUMENTS>" not in text:
            print("Error: Could not find <ARGUMENTS> tags in LLM response")
            print(f"Raw response: {text}")
            return []

        # Text after each opening tag, cut at the closing tag when there is one.
        blocks = [
            chunk.split("</ARGUMENTS>")[0]
            for chunk in text.split("<ARGUMENTS>")[1:]
        ]
        content = "\n".join(blocks).strip()

        arguments = []
        current_main = None
        current_subs = []
        current_distribution = None
        current_comments = ""

        for line_number, raw_line in enumerate(content.split("\n"), 1):
            line = raw_line.strip()
            if not line:
                continue

            if line.startswith("MAIN:"):
                # A new MAIN closes the argument collected so far.
                if current_main is not None:
                    arguments.append(self._build_argument(
                        current_main, current_subs, current_distribution, current_comments
                    ))
                current_main = line[len("MAIN:"):].strip()
                current_subs = []
                current_distribution = None
                current_comments = ""
            elif line.startswith("SUB:"):
                current_subs.append(line[len("SUB:"):].strip())
            elif line.startswith("DISTRIBUTION:"):
                dist_str = line[len("DISTRIBUTION:"):].strip()
                dist_nums = re.findall(r'\d+', dist_str)
                if len(dist_nums) == 5:
                    current_distribution = [int(n) for n in dist_nums]
                else:
                    print(f"Warning: Invalid distribution format on line {line_number}: {dist_str}")
            elif line.startswith("COMMENTS:"):
                # If several COMMENTS lines appear for one argument, the last one wins.
                current_comments = line[len("COMMENTS:"):]

        # The last argument has no following MAIN line to close it.
        if current_main is not None:
            arguments.append(self._build_argument(
                current_main, current_subs, current_distribution, current_comments
            ))

        return arguments

    def _build_argument(self, main, subpoints, provided_distribution, comments_field) -> "Argument":
        """Assemble one Argument from the fields the parser collected for it.

        The stored distribution is computed from the cited comments; the
        LLM-provided one only triggers a warning when absent or different.
        On a lookup failure the argument is kept with an all-zero distribution
        and no supporting comments, so no argument is silently lost.
        """
        # dict.fromkeys keeps the first occurrence of each index in order, so a
        # comment cited twice is not counted twice.
        comment_indices = list(dict.fromkeys(
            int(token) for token in re.findall(r'\d+', str(comments_field))
        ))

        try:
            supporting_comments = self._get_supporting_comments(comment_indices)
            # Count only the comments that were found; unknown indices have
            # already been reported once by _get_supporting_comments.
            actual_distribution = self._calculate_distribution(
                [comment[1] for comment in supporting_comments]
            )
        except Exception as e:  # best effort: malformed comment records must not abort parsing
            print(f"Error processing argument '{main}': {e}")
            return Argument(main, subpoints, [0, 0, 0, 0, 0], [])

        if provided_distribution is None:
            print(f"Warning: No distribution provided for argument '{main}', using calculated distribution")
        elif provided_distribution != actual_distribution:
            print(f"Warning: Provided distribution {provided_distribution} doesn't match calculated distribution {actual_distribution} for argument '{main}'")

        return Argument(main, subpoints, actual_distribution, supporting_comments)

    async def process_batch(self, comments_data: List[List[Union[str, int]]], topic: str) -> List["Argument"]:
        """Process a batch of comments and return a list of arguments.

        Sends a single first-pass prompt to the LLM (temperature 0.3) and
        parses the response. The redundancy prompt is not used here.

        Args:
            comments_data: Records laid out as ``[text, index, likert_score]``.
            topic: Statement the comments respond to.

        Returns:
            The arguments parsed from the LLM response.
        """
        initial_prompt = await self._create_initial_prompt(comments_data, topic)
        messages = [Message(Role("user"), initial_prompt)]
        response = await self.llm_provider.generate(messages, temperature=0.3)
        return await self._parse_argument_structure(response.content)

    async def format_arguments(self, arguments: List["Argument"]) -> str:
        """Render arguments as human-readable text, including supporting comment texts and scores."""
        pieces = []
        for arg in arguments:
            pieces.append(f"MAIN: {arg.main}\n")
            pieces.extend(f"SUB: {sub}\n" for sub in arg.subpoints)
            pieces.append(f"DISTRIBUTION: {arg.likert_distribution}\n")
            pieces.append("SUPPORTING COMMENTS:\n")
            pieces.extend(
                f"- {comment[0]} [Score: {comment[2]}]\n"
                for comment in arg.supporting_comments
            )
            pieces.append("\n")
        return "".join(pieces)

    def _comments_by_index(self) -> Dict[Any, List[Union[str, int]]]:
        """Map comment index -> comment record for the stored comments.

        When two records share an index the first one is kept.
        """
        lookup: Dict[Any, List[Union[str, int]]] = {}
        for comment in self.comments_data:
            lookup.setdefault(comment[1], comment)
        return lookup

    def _calculate_distribution(self, comment_indices: List[int]) -> List[int]:
        """Calculate distribution of Likert scores from a list of comment indices.

        Each index is resolved against ``self.comments_data``. Unknown indices,
        scores that cannot be converted to ``int`` and scores outside 1-5 are
        reported with a printed message and skipped.

        Returns:
            Five counts, position 0 for score 1 through position 4 for score 5.
        """
        distribution = [0, 0, 0, 0, 0]  # Initialize counts for scores 1-5
        comments_by_index = self._comments_by_index()
        for idx in comment_indices:
            matching_comment = comments_by_index.get(idx)
            if not matching_comment:
                print(f"Warning: No comment found with index {idx}")
                continue
            try:
                score = int(matching_comment[2])  # Get Likert score
            except (TypeError, ValueError, OverflowError) as e:
                print(f"Error processing comment index {idx}: {e}")
                continue
            if 1 <= score <= 5:
                distribution[score - 1] += 1
            else:
                print(f"Warning: Invalid Likert score {score} for comment {idx}")
        return distribution

    def _get_supporting_comments(self, comment_indices: List[int]) -> List[List[Union[str, int]]]:
        """Get full comment data for the given comment indices.

        Returns a list of comments where each comment is
        ``[text, index, likert_score]``; indices with no stored comment are
        reported with a printed warning and skipped.
        """
        comments_by_index = self._comments_by_index()
        supporting_comments = []
        for idx in comment_indices:
            matching_comment = comments_by_index.get(idx)
            if matching_comment:
                supporting_comments.append(matching_comment)
            else:
                print(f"Warning: No comment found with index {idx}")
        return supporting_comments

In [51]:
# Example usage
async def main():
    """Condense the explanations given for one survey question and save the result.

    Reads the survey CSV, takes the first ``n_comments`` non-empty explanations
    for question ``question_index`` together with their row index and Likert
    answer, runs them through ``ArgumentProcessor.process_batch`` and writes
    both the formatted and the raw arguments to ``output_path``.
    """
    # Configuration
    api_key = os.getenv("OPENAI_API_KEY")
    model = "gpt-4o-2024-11-20"
    openai_provider = OpenAIProvider(api_key, model)
    data_source_path = os.path.join(parent_dir, 'data', 'sources', 'kuntavaalit2021.csv')
    output_path = os.path.join(parent_dir, 'condensation', 'results', 'results_with_indices', 'vittu_v0.txt')
    n_comments = 50
    batch_size = 50
    # Survey statement (Finnish), roughly: "When balancing municipal spending and
    # revenue, it should be done by cutting spending rather than raising taxes."
    topic = "Kun kunnan menoja ja tuloja tasapainotetaan, se on tehtävä mieluummin menoja karsimalla kuin veroja kiristämällä."
    question_index = 10
    explanation_col = f'q{question_index}.explanation_fi'
    answer_col = f'q{question_index}.answer'

    # Get comments and prepare data
    df = pd.read_csv(data_source_path)
    # First n_comments rows that actually have an explanation; their index
    # labels are what the LLM is asked to cite.
    comment_indices = df[explanation_col].dropna().iloc[:n_comments].index.tolist()
    comments_data = [
        [comment, idx, answer]
        for comment, idx, answer in zip(
            df.loc[comment_indices, explanation_col].tolist(),
            comment_indices,
            df.loc[comment_indices, answer_col].tolist()
        )
    ]

    # Process arguments
    processor = ArgumentProcessor(openai_provider, batch_size)
    arguments = await processor.process_batch(comments_data, topic)
    formatted_args = await processor.format_arguments(arguments)

    # Save the output to a file. The directory is created on demand, and the
    # encoding is fixed so non-ASCII comment text is written identically on
    # every platform instead of depending on the locale default.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(f'{n_comments} comments condensed with model "{model}"\n\n')
        f.write(formatted_args)

        # Write the raw argument data for further processing
        f.write("\nRAW ARGUMENT DATA:\n")
        for arg in arguments:
            f.write("\nArgument:\n")
            f.write(f"Main: {arg.main}\n")
            f.write(f"Subpoints: {arg.subpoints}\n")
            f.write(f"Distribution: {arg.likert_distribution}\n")
            f.write(f"Supporting Comments: {arg.supporting_comments}\n")

# Run the pipeline. Top-level `await` relies on IPython/Jupyter's autoawait
# support; a plain Python script would need `asyncio.run(main())` instead.
await main()

