In [49]:
import os
import sys
import re
# Resolve the parent of the *current working directory* and put it first on
# sys.path, presumably so the project imports below (chatbot_api) resolve.
# NOTE: this uses os.getcwd(), not the location of this file, so it assumes
# the notebook is run with 'condensation' as its working directory.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, parent_dir)
# autoreload modules
%load_ext autoreload
%autoreload 2
from typing import List
import pandas as pd
from chatbot_api.providers.openai import OpenAIProvider
from chatbot_api.base import Role, Message

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
class Argument:
    """Plain container for one argument and its bookkeeping data.

    The constructor stores its four parameters unchanged (no copying and no
    validation) under attributes of the same name:

        main_point: text of the main argument.
        subpoints: texts of the supporting subpoints.
        score_distribution: per-score tally of Likert answers.
        source_indices: indices of the comments backing the argument.
    """

    def __init__(
        self,
        main_point: str,
        subpoints: List[str],
        score_distribution: List[int],
        source_indices: List[int],
    ):
        # Text fields first, then the numeric bookkeeping fields.
        self.main_point, self.subpoints = main_point, subpoints
        self.score_distribution, self.source_indices = (
            score_distribution,
            source_indices,
        )

class ArgumentProcessor:
    """Condense free-text comments into arguments using an LLM.

    The processor builds a prompt from numbered comments, sends it to the
    configured provider, parses the ``<ARGUMENTS>`` block of the reply into
    ``Argument`` objects, and tallies the Likert answers of the comments each
    argument cites.
    """

    # Prompt sent to the model. The text is runtime data and is kept verbatim;
    # it is filled with ``str.format(topic=..., comments_text=...)``.
    PROMPT_TEMPLATE = """
            ### Instructions:
            1. Review the following numbered comments on the topic: "{topic}"
            2. **Identify and categorize arguments into main topics**.
                - Main arguments are broad, high-level points.
                - Subpoints provide supporting details or reasoning for the main argument.
            3. Avoid redundancy by combining similar arguments or subpoints.
            4. For each argument and subpoint, include the indices of the source comments in square brackets, as shown in the output format below.
            5. Ensure each argument includes source indices which keep track of the comments that support the argument.
            6. Provide the output in Finnish.
            7. Ensure that no argument is repeated, and that each is categorized correctly, avoiding redundancy.
            
            ### Output Format: 
            <ARGUMENTS>
            MAIN: [Main Argument] [source_indices: 1,2,3]
            SUB: [Supporting subpoint] [source_indices: 1,2]
            SUB: [Another supporting subpoint] [source_indices: 3]
            </ARGUMENTS>

            ### Comments to analyze:
            {comments_text}
            """

    # Line prefixes used by the output format requested in PROMPT_TEMPLATE.
    _MAIN_PREFIX = 'MAIN:'
    _SUB_PREFIX = 'SUB:'

    # Compiled once; both patterns are applied to every model response.
    _ARGUMENTS_BLOCK_RE = re.compile(r'<ARGUMENTS>(.*?)</ARGUMENTS>', re.DOTALL)
    _SOURCE_INDICES_RE = re.compile(r'\[source_indices: ([\d,\s]+)\]')

    def __init__(self, llm_provider: "OpenAIProvider", batch_size: int):
        """
        Args:
            llm_provider: Provider used for token fitting and text generation.
            batch_size: Stored on the instance; no method of this class reads it.
        """
        self.llm_provider = llm_provider
        self.batch_size = batch_size

    def get_relevant_scores(self, indices: List[int], all_likert_scores: List[int]) -> List[int]:
        """
        Get the Likert scores for the given indices from the full list of scores.

        Indices outside ``0 .. len(all_likert_scores) - 1`` are skipped. This
        includes negative indices, which would otherwise silently wrap around
        to the end of the list.

        Args:
            indices: List of indices for which to get scores
            all_likert_scores: Complete list of Likert scores

        Returns:
            List of Likert scores corresponding to the valid indices, in the
            order the indices were given
        """
        n_scores = len(all_likert_scores)
        return [all_likert_scores[idx] for idx in indices if 0 <= idx < n_scores]

    def calculate_score_distribution(self, scores: List[int]) -> List[int]:
        """
        Count how many answers fall on each point of the 1-5 Likert scale.

        The result holds absolute counts, not percentages; ``format_arguments``
        prints them as "N answers".

        Args:
            scores: List of Likert scores; values outside 1-5 (including NaN)
                are ignored

        Returns:
            List of 5 ints where position ``k`` is the number of answers with
            score ``k + 1``
        """
        distribution = [0] * 5
        for score in scores:
            # NaN fails both comparisons, so missing answers are skipped too.
            if 1 <= score <= 5:
                distribution[int(score - 1)] += 1
        return distribution

    async def _create_initial_prompt(self, comments: List[str], indices: List[int], topic: str) -> tuple:
        """
        Create the prompt for the argument generation task with the maximum
        number of comments that fit in it.

        Args:
            comments: Comment texts, in the order they should be numbered
            indices: Index label shown to the model for each comment
            topic: Topic inserted into the prompt template

        Returns:
            Tuple ``(prompt, fit_count)``: the filled-in prompt and the number
            of leading comments that were included

        Raises:
            ValueError: If the provider reports that no comment fits
        """
        # Measure against the template with the topic filled in and no comments.
        empty_prompt = self.PROMPT_TEMPLATE.format(topic=topic, comments_text="")
        fit_count = await self.llm_provider.fit_comment_args_count(comments, empty_prompt)

        if fit_count == 0:
            raise ValueError("No comments fit in the prompt.")

        # zip() stops at the shorter input, so slicing both keeps them aligned.
        comments_text = "\n".join(
            f"[{idx}] {comment}"
            for idx, comment in zip(indices[:fit_count], comments[:fit_count])
        )

        return self.PROMPT_TEMPLATE.format(topic=topic, comments_text=comments_text), fit_count

    @classmethod
    def _split_source_indices(cls, line: str) -> tuple:
        """
        Split one response line into its text and its cited comment indices.

        Every run of digits inside the ``[source_indices: ...]`` tag becomes one
        index, so trailing commas, doubled commas and space-separated lists are
        accepted instead of raising ``ValueError``.

        Returns:
            Tuple ``(content, indices)``: the line with the tag removed and
            stripped, and the list of ints from the first tag (empty if none)
        """
        tag = cls._SOURCE_INDICES_RE.search(line)
        indices = [int(token) for token in re.findall(r'\d+', tag.group(1))] if tag else []
        content = cls._SOURCE_INDICES_RE.sub('', line).strip()
        return content, indices

    def _build_argument(self, main_point: str, subpoints: List[str], indices, likert_answers: List[int]) -> "Argument":
        """Create an Argument and tally the Likert answers of its source comments."""
        # Sorted so the stored indices do not depend on set iteration order.
        source_indices = sorted(indices)
        relevant_scores = self.get_relevant_scores(source_indices, likert_answers)
        return Argument(
            main_point=main_point,
            subpoints=subpoints,
            score_distribution=self.calculate_score_distribution(relevant_scores),
            source_indices=source_indices,
        )

    async def _parse_arguments(self, response: str, likert_answers: List[int]) -> List["Argument"]:
        """
        Parse the model response into Argument objects.

        Only the text between ``<ARGUMENTS>`` and ``</ARGUMENTS>`` is read. A
        ``MAIN:`` line starts a new argument; following ``SUB:`` lines add
        subpoints to it. The indices cited on the main line and on all of its
        subpoints are merged into the argument's source indices. ``SUB:`` lines
        that appear before the first ``MAIN:`` line are dropped, as are lines
        with neither prefix.

        Args:
            response: Raw text returned by the model
            likert_answers: Likert score per comment, addressed by comment index

        Returns:
            Parsed arguments in response order; empty if the response is empty
            or contains no ``<ARGUMENTS>`` block
        """
        if not response:
            return []
        block = self._ARGUMENTS_BLOCK_RE.search(response)
        if not block:
            return []

        arguments = []
        current_main = None
        current_subpoints = []
        current_indices = set()

        for raw_line in block.group(1).strip().split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            content, indices = self._split_source_indices(line)

            if line.startswith(self._MAIN_PREFIX):
                # A new main point closes the argument collected so far.
                if current_main is not None:
                    arguments.append(self._build_argument(
                        current_main, current_subpoints, current_indices, likert_answers))

                # Drop only the leading label; a later "MAIN:" inside the text stays.
                current_main = content[len(self._MAIN_PREFIX):].strip()
                current_subpoints = []
                current_indices = set(indices)

            elif line.startswith(self._SUB_PREFIX):
                current_subpoints.append(content[len(self._SUB_PREFIX):].strip())
                current_indices.update(indices)

        # Add the last argument
        if current_main is not None:
            arguments.append(self._build_argument(
                current_main, current_subpoints, current_indices, likert_answers))

        return arguments

    async def process_batch(self, comments: List[str], topic: str, likert_answers: List[int] = None) -> tuple:
        '''
        Process a batch of comments and generate arguments.

        Comments are numbered from 0 in the prompt, so the indices the model
        cites are positions in ``comments`` (and in ``likert_answers``).

        Args:
            comments: Comment texts to condense
            topic: Topic the comments respond to
            likert_answers: Optional Likert score per comment, aligned with
                ``comments``; without it all score distributions are zero

        Returns:
            Tuple ``(arguments, n_comments_used)``: the list of Argument objects
            and the number of leading comments that fit in the prompt

        Raises:
            ValueError: If no comment fits in the prompt
        '''
        indices = list(range(len(comments)))
        # gets the max number of comments that fit in the prompt
        prompt, n_comments_used = await self._create_initial_prompt(comments, indices, topic)
        response = await self.llm_provider.generate([Message(Role.USER, prompt)])

        # "is not None" rather than truthiness: array-likes have no unambiguous truth value.
        # Only comments that were actually sent may contribute scores.
        usable_scores = likert_answers[:n_comments_used] if likert_answers is not None else []

        # Parse arguments and calculate distributions in one step
        arguments = await self._parse_arguments(response.content, usable_scores)
        return arguments, n_comments_used

    async def format_arguments(self, arguments: List["Argument"]) -> str:
        """
        Render arguments as a human-readable text report.

        Args:
            arguments: Arguments to render, numbered from 1 in the given order

        Returns:
            One string with, per argument, its main point, source indices,
            numbered supporting points (if any), the answer count per Likert
            score, and a separator line
        """
        formatted_output = []
        for i, arg in enumerate(arguments, 1):
            formatted_output.append(f"\nArgument {i}:")
            formatted_output.append(f"Main point: {arg.main_point}")
            formatted_output.append(f"Source indices: {arg.source_indices}")

            if arg.subpoints:
                formatted_output.append("Supporting points:")
                formatted_output.extend(
                    f"  {j}. {subpoint}" for j, subpoint in enumerate(arg.subpoints, 1)
                )

            if arg.score_distribution:
                formatted_output.append("\nLikert Score Distribution:")
                formatted_output.extend(
                    f"  Score {score}: {count} answers"
                    for score, count in enumerate(arg.score_distribution, 1)
                )

            formatted_output.append("-" * 50)

        return "\n".join(formatted_output)

In [51]:
# TODO: get the topic from the data source; for now it is set manually below.
async def main():
    """Condense the explanations of one survey question and save the report.

    Reads ``kuntavaalit2021.csv`` from ``<parent_dir>/data/sources``, takes the
    first ``n_comments`` non-empty explanations of question ``question_index``
    together with their Likert answers, and asks the model to condense them into
    arguments. It then prints each argument with its source comments and writes
    the formatted report to
    ``<parent_dir>/condensation/results/results_with_indices/version2.txt``.

    Relies on the module-level ``parent_dir`` and on the ``OPENAI_API_KEY``
    environment variable.
    """
    # Params
    n_comments = 400
    batch_size = 400
    topic = "Kun kunnan menoja ja tuloja tasapainotetaan, se on tehtävä mieluummin menoja karsimalla kuin veroja kiristämällä."  # Can be modified based on your needs

    # Config
    api_key = os.getenv("OPENAI_API_KEY")
    model = "gpt-4o-2024-11-20"
    openai_provider = OpenAIProvider(api_key, model)
    processor = ArgumentProcessor(openai_provider, batch_size)

    # Setup paths
    data_source_path = os.path.join(parent_dir, 'data', 'sources', 'kuntavaalit2021.csv')
    output_path = os.path.join(parent_dir, 'condensation', 'results', 'results_with_indices', 'version2.txt')

    # Read and prepare data
    df = pd.read_csv(data_source_path)

    # Choose a subset of comments to process
    question_index = 10
    explanation_column_name = f'q{question_index}.explanation_fi'
    likert_column_name = f'q{question_index}.answer'

    # Get comments and their original (DataFrame) indices
    comment_mask = df[explanation_column_name].notna()
    comment_indices = df[comment_mask][explanation_column_name].index[:n_comments].tolist()
    comments = df.loc[comment_indices, explanation_column_name].tolist()

    # Get corresponding Likert scores (same rows, so aligned with `comments`)
    likert_answers = df.loc[comment_indices, likert_column_name].tolist()

    # Process arguments
    arguments, n_comments_used = await processor.process_batch(
        comments=comments,
        topic=topic,
        likert_answers=likert_answers
    )

    print(f"Number of comments used: {n_comments_used} out of the attempted {n_comments}")

    # Format results
    formatted_args = await processor.format_arguments(arguments)

    # Print the comments for each argument. Source indices come from model
    # output, so they are range-checked instead of trusted: an index outside
    # the comments that were sent would otherwise raise IndexError (or, if
    # negative, print an unrelated comment).
    n_valid = min(n_comments_used, len(comments))
    for i, arg in enumerate(arguments, 1):
        print(f"\nArgument {i}: {arg.main_point}")
        for idx in arg.source_indices:
            if 0 <= idx < n_valid:
                print(f"{idx}: {comments[idx]}")
            else:
                print(f"{idx}: <no such comment among the {n_valid} sent to the model>")

    # Save to file; create the results directory first so a fresh checkout
    # does not fail with FileNotFoundError.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(formatted_args)

await main()

Number of comments used: 390 out of the attempted 400

Argument 1: Menojen karsiminen on parempi vaihtoehto kuin verojen korottaminen, sillä se voi vähentää kuntalaisten taloudellista kuormitusta ja lisätä kunnan houkuttelevuutta.
3: Mahdollisimman turhia menoja pitäisi karsia, ei tietenkään tärkeistä palveluista.
5: Menot tulojen mukaan.
6: Verojen korotus on mielestäni toimivampi ratkaisu. Kunnalla on harvoin sellaisia menoja, joista voisi karsia. Leikattavat menoerät liittyvät hyvin usein lapsiin, nuoriin, vanhuksiin sekä yhdistyksiin. Kun leikataan päivähoidosta tai opetuksesta, leikkaamme myös tulevaisuudesta. Seniorikuntalaisilla on oikeus hyvään ja turvalliseen elämään, johon kuuluvat tietyt verovaroin tuotetut palvelut. Yhdistykset tuottavat meidän vapaa-aikaamme toimintaa ja tapahtumia, joten niiden toiminnan lamauttaminen olisi kaikissa tilanteissa epäviisasta. 
263: Toimintaa tehostamalla saadaan säästöjä aikaiseksi.
8: Kunnan tulee tarjota riittävät jo lainsäädännöllisetkin