# Imports

In [None]:
from openai import OpenAI
import pandas as pd
import glob
import os
from dotenv import load_dotenv
import json
import re
import time

load_dotenv()


# Load embedded data with similarity scores

In [2]:
# Read the merged article data and move the primary similarity column to
# `similarity_score_0`, the first name listed in `similarity_columns`.
df = pd.read_csv('data/06_LLM_data/full_merged_data.csv')
df['similarity_score_0'] = df.pop('similarity_score')

# Define lists of medium codes grouped by political alignment

In [None]:
# Medium codes per political alignment (German labels: rechts = right,
# rechtsliberal = right-liberal, mitte = centre, linksliberal = left-liberal,
# links = left). Each code is listed once; the lists are only used for
# membership filtering, so repeated entries would carry no meaning.
rechts = ['WEWO', 'WEW', 'WDAY', 'NZZO']
rechtsliberal = ['NZZS', 'NZZ', 'SAW']
mitte = [
    'LUZ', 'ZWAS', 'ZWAO', 'ZWA', 'ZWSO', 'ZWAF', 'SRFA', 'SRF', 'SRFV',
    'AZO', 'AZM', 'RTS', 'RTSV',
]
linksliberal = [
    'BAZ', 'NNBS', 'TLM', 'TLMD', 'HEU', 'NNHEU', 'TDG', 'NNTDG', 'BZ',
    'NNBE', 'BLI', 'BLIO', 'NNTA', 'TA', 'TAZT', 'TPS', 'TPSO', 'TAS',
]
links = ['WOZ', 'SBLI']

# Group name -> list of medium codes belonging to that group
medium_groups = {
    'rechts': rechts,
    'rechtsliberal': rechtsliberal,
    'mitte': mitte,
    'linksliberal': linksliberal,
    'links': links,
}

# Names of the eight similarity-score columns: similarity_score_0 .. _7
similarity_columns = [f'similarity_score_{i}' for i in range(8)]


# Get Top scoring 'text' element per similarity_score

In [None]:
# For every political-alignment group, collect the texts of the 500
# highest-scoring rows per similarity column and write them to one CSV
# (one column per similarity score).
# NOTE(review): the files are written to `.../inputs/` using the German group
# names, while the loading code further down reads from `.../input_data/`
# using English leaning names — confirm the files are moved/renamed by hand.
for group_name, group_list in medium_groups.items():
    # Keep only the articles published by the outlets of this group
    df_selected = df[df['medium_code'].isin(group_list)]

    # Column name -> 'text' of the 500 best rows for that column, re-indexed
    # from 0 so the columns align row by row in the resulting frame
    top_500_dict = {
        score_col: (
            df_selected.sort_values(by=score_col, ascending=False)
            .head(500)['text']
            .reset_index(drop=True)
        )
        for score_col in similarity_columns
    }

    top_500_all_scores = pd.DataFrame(top_500_dict)
    top_500_all_scores.to_csv(
        f'data/06_LLM_data/inputs/{group_name}_top_500_similarity_scores.csv',
        index=False,
    )


# Extract Consideration Statements with GPT for each similarity score

In [None]:
# Activate OpenAI Client
# OpenAI() is constructed without arguments; the credentials are presumably
# taken from the environment prepared by load_dotenv() — TODO confirm.

client = OpenAI()

## Setting up the Assistant
### (We only need to do this once)

In [None]:
# Create a vector store holding the DRI example statements
from contextlib import ExitStack

vector_store = client.beta.vector_stores.create(name="DRI_Examples")

# Ready the files for upload to OpenAI
file_paths = ["data/06_LLM_data/similarity_scores/dri_example_statements.txt"]

# Open the files inside an ExitStack so every handle is closed again once the
# upload has finished (or failed) instead of staying open for the session.
with ExitStack() as stack:
    file_streams = [stack.enter_context(open(path, "rb")) for path in file_paths]

    # Use the upload and poll SDK helper to upload the files, add them to the
    # vector store, and poll the status of the file batch for completion.
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )

# Kept for the assistant set-up, which links the store to the assistant
vector_store_id = vector_store.id

print(file_batch.status)
print(file_batch.file_counts)

In [None]:
# Create the assistant that extracts DRI consideration statements and link it
# to the vector store containing the example statements.
name = "DRI Considerations Assistant"

# System instructions for the assistant. The file name mentioned in the text
# matches the uploaded example file (dri_example_statements.txt).
instructions = """
You are an expert political Scientist. Your task is to find
Consideration Statements for a DRI Survey on health costs in Switzerland from
different media outlets grouped by political leaning on several subtopics.
The following is a short introduction to the concept of DRI:

"
The Deliberative Reason Index (DRI) is a tool used to measure the quality and
improvement of group reasoning during deliberative processes. It focuses on
evaluating reasoning at the group level, rather than individual level, emphasizing
how group members collectively construct an understanding of an issue and agree
on the terms of reasonableness.

Consideration statements are specific statements in a survey designed to capture
participants' opinions or evaluations of relevant factors, arguments, or aspects
of an issue under discussion. These statements reflect the various interests,
values, beliefs, or facts that participants consider important when reasoning
about the issue.

In the context of the Deliberative Reason Index (DRI), consideration statements
help identify the range of viewpoints or concerns that individuals bring to a
deliberative process. They serve as a way to assess how participants weigh
different aspects of an issue before arriving at preferences for actions or policies.
"

Consideration statements are typically concisely formulated and express a single
premise for an argument or viewpoint. They are designed to capture the essence of
a particular consideration that participants may have when discussing a given topic.
Good consideration statements are clear, specific, and relevant to the issue at hand.
They should not include demands for policy actions or express preferences for
specific outcomes.

Good examples include:
- "Private solutions are more efficient than government intervention in managing healthcare costs."
- "A growing older population is leading to higher utilization of healthcare services, increasing overall costs."
- "There is a significant lack of financial incentives for health prevention in Switzerland, leading to spiraling healthcare costs."

Bad examples include:
- "Introducing a unified health insurance system could reduce costs by decreasing expenditures on marketing, administrative fees, and high management salaries currently spread across multiple insurance companies."
- "We need to introduce cost targets in the health system to manage the rapid growth of healthcare costs."
- "Closing underutilized hospitals in Switzerland would result in significant cost savings and improve overall efficiency in health service delivery."

Additionally read through the dri_example_statements.txt and familiarize yourself with the
semantic structure of consideration statements.
Be prepared to produce a list of the top 5 Consideration statements in JSON Format, based
on the media outlets that will be provided to you.
When prompted, make sure to produce all statements in English and in close semantic
similarity to the examples. Do not include anything but a list of statements in
JSON Format in your responses
"""


assistant = client.beta.assistants.create(
    name=name,
    instructions=instructions,
    model="gpt-4o",
    tools=[{"type": "file_search"}],
)

# The spelling `asssistant_id` is kept because other cells assign the same name
asssistant_id = assistant.id

# Attach the vector store so file_search can retrieve the example statements
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}},
)


## Prompt the Assistant
### (Do this every run)

In [None]:
def customize_prompt(political_leaning, similarity_score):
    """Build the user prompt asking for five consideration statements.

    Args:
        political_leaning: Label inserted in front of "-leaning media outlets"
            (e.g. "right", "left-liberal").
        similarity_score: Index (0-7) selecting the sub-topic the assistant
            should focus on. Any value that is not a known index falls back
            to the topic of index 0.

    Returns:
        The complete prompt text as a string.
    """
    # Sub-topic per similarity-score index
    topics = {
        0: "health costs in Switzerland",
        1: "the health system of Switzerland and non-incremental policy reforms",
        2: "Increasing utilisation of healthcare services per person",
        3: "hospital planning and financing",
        4: "the design/configuration/organisation of compulsory health insurance",
        5: "a lack of incentives for health prevention",
        6: "the coordination of health services",
        7: "the financial burden of health costs on households (costs distribution)",
    }

    # Unknown indices deliberately fall back to the general topic (index 0)
    topic = topics.get(similarity_score, topics[0])
    prompt_similarity = f"""
        Focus specifically on the topic of {topic}.
        """

    # Final prompt; the referenced file name matches the uploaded example file
    prompt = f"""
        Create a list of 5 consideration statements for a DRI Survey on health costs in
        Switzerland from the media outlets provided below. All media outlets provided come from
        {political_leaning}-leaning media outlets. Make sure to do justice to the
        perspectives and opinions described in the media outlets. The statements should be
        in close semantic similarity to the examples provided in the dri_example_statements.txt
        file but the content should be representative of the political opinion of the media outlets.
        Make sure to capture the whole range of viewpoints and concerns that is represented in
        the media outlets. Then boil it down to the top 5 most important consideration statements.
        Consideration statements are typically concisely formulated and express a single
        premise for an argument or viewpoint. They are designed to capture the essence of
        a particular consideration that participants may have when discussing a given topic.

        {prompt_similarity}

        Format your response as a list of consideration statements in JSON format.
        Make sure not to include anything else but the JSON object in your response.
    """

    return prompt


In [None]:
def get_consideration_statements(political_leaning, similarity_score_list):
    """Ask the assistant for consideration statements, one request per score.

    For every similarity score the matching column of the top-500 CSV is
    written to a temporary text file, uploaded, attached to a new thread
    together with the customized prompt, and the assistant's JSON answer is
    stored as one column of the result.

    Relies on the module-level `client`, `assistant` and `customize_prompt`.

    Args:
        political_leaning: Leaning label used in the input file name and in
            the prompt.
        similarity_score_list: Iterable of similarity-score indices to process.

    Returns:
        A DataFrame with 10 rows and one column per processed score, named
        "Statements, similarity_score_<n>"; unused rows are filled with None.

    Raises:
        RuntimeError: If a run finishes without producing any message.
        json.JSONDecodeError: If the assistant's answer is not valid JSON.
    """
    statements_df = pd.DataFrame(index=range(10))

    # The input CSV does not depend on the similarity score, so read it once
    csv_path = f'data/06_LLM_data/input_data/{political_leaning}_top_500_similarity_scores.csv'
    df = pd.read_csv(csv_path)
    print("df loaded with shape:")
    print(df.shape)

    for similarity_score in similarity_score_list:
        score_column = f'similarity_score_{similarity_score}'
        txt_file_path = f'data/06_LLM_data/input_data/{political_leaning}_top_500_similarity_score_{similarity_score}.txt'

        # Save the column as a text file, each value on a new line
        df[score_column].to_csv(txt_file_path, index=False, header=False)
        print(f"File {txt_file_path} has been created with content:")
        print(df[score_column].head())

        # Upload through a context manager so the handle is closed before the
        # temporary file is deleted below
        with open(txt_file_path, "rb") as upload_handle:
            message_file = client.files.create(
                file=upload_handle,
                purpose="assistants"
            )
        print("message_file:")
        print(message_file)

        # Create a thread and attach the file to the message
        thread = client.beta.threads.create(
            messages=[
                {
                    "role": "user",
                    "content": customize_prompt(political_leaning, similarity_score),
                    # Attach the new file to the message.
                    "attachments": [
                        {"file_id": message_file.id, "tools": [{"type": "file_search"}]}
                    ],
                }
            ]
        )

        # The local copy is no longer needed once the upload has succeeded
        if os.path.exists(txt_file_path):
            os.remove(txt_file_path)
            print(f"File {txt_file_path} has been removed.")
        else:
            print(f"File {txt_file_path} does not exist.")

        print("Waiting for the assistant to generate the consideration statements...")
        run = client.beta.threads.runs.create_and_poll(
            thread_id=thread.id, assistant_id=assistant.id
        )

        messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
        print("messages:")
        print(messages)

        # Fail with a descriptive error instead of an IndexError on messages[0]
        if not messages:
            raise RuntimeError(
                f"Assistant run for similarity_score_{similarity_score} "
                f"returned no messages (run status: {run.status})"
            )

        message_content = messages[0].content[0].text.value

        # Remove the markdown ```json``` block and newlines using regex
        cleaned_content = re.sub(r'```json|```', '', message_content).strip()

        # Load the cleaned content as a JSON array
        statements = json.loads(cleaned_content)

        # Create the column name dynamically
        column_name = f"Statements, similarity_score_{similarity_score}"

        # Pad the list of statements with None to match the length of the DataFrame
        padded_statements = statements + [None] * (len(statements_df) - len(statements))

        # Add the new column to the DataFrame
        statements_df[column_name] = padded_statements

        # Wait for 20 seconds before continuing to the next iteration to avoid rate limiting
        print("Waiting for 20 seconds before the next iteration...")
        time.sleep(20)

    return statements_df

In [None]:
# Run the generation of DRI consideration statements

# Available political leanings; the first entry ('right') is used for this run
political_leaning_list = ['right', 'right-liberal', 'centrist', 'left-liberal', 'left']
political_leaning = political_leaning_list[0]

# Indices 0-7 of the similarity-score columns to process
similarity_score_list = list(range(8))

# Define vector store and assistant IDs if not already defined
# NOTE(review): both names are unconditionally set to empty strings here —
# confirm whether existing IDs are meant to be pasted in before running.
vector_store_id = ''
asssistant_id = ''

df = get_consideration_statements(political_leaning, similarity_score_list)


In [None]:
# Save
# Writes the generated statements to CSV without the index column.
# NOTE(review): the file name is hard-coded to "centrist" while the run cell
# selects political_leaning_list[0] ('right') — confirm the name is adjusted
# per run so results of different leanings do not overwrite each other.

df.to_csv('data/06_LLM_data/outputs/consideration_statements/centrist_consideration_statements.csv', index=False)

# Extract Policy Options with GPT for each political leaning (all similarity scores combined)

In [None]:
# Activate OpenAI Client
# Re-creates the module-level `client` used by the policy-option cells below.

client = OpenAI()


## Setting up the Assistant
### (We only need to do this once)

In [None]:
# Create a vector store holding the DRI example policy options
from contextlib import ExitStack

vector_store = client.beta.vector_stores.create(name="DRI_policy_options")

# Ready the files for upload to OpenAI
file_paths = ["data/06_LLM_data/similarity_scores/dri_example_policy_options.txt"]

# Open the files inside an ExitStack so every handle is closed again once the
# upload has finished (or failed) instead of staying open for the session.
with ExitStack() as stack:
    file_streams = [stack.enter_context(open(path, "rb")) for path in file_paths]

    # Use the upload and poll SDK helper to upload the files, add them to the
    # vector store, and poll the status of the file batch for completion.
    file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
        vector_store_id=vector_store.id, files=file_streams
    )

# Kept for the assistant set-up, which links the store to the assistant
vector_store_id = vector_store.id

In [None]:
# Create the assistant that extracts DRI policy options and link it to the
# vector store containing the example policy options.
name = "DRI Policy Assistant"

# System instructions for the assistant. The file name mentioned in the text
# matches the uploaded example file (dri_example_policy_options.txt).
instructions = """
You are an expert political Scientist. Your task is to find
Policy Options for a DRI Survey on health costs in Switzerland from
different media outlets grouped by political leaning on several subtopics.
The following is a short introduction to the concept of DRI:

"
The Deliberative Reason Index (DRI) is a tool used to measure the quality and
improvement of group reasoning during deliberative processes. It focuses on
evaluating reasoning at the group level, rather than individual level, emphasizing
how group members collectively construct an understanding of an issue and agree
on the terms of reasonableness.

Policy Options are specific possible actions or strategies that will be ranked
by preference within the DRI survey. These options reflect the various policy changes
that are discussed in the public discourse on the issue at hand.

In the context of the Deliberative Reason Index (DRI), policy options
help identify the range of preferences and opinions concerning policy demands
that individuals bring to a deliberative process.
"

Policy options are typically concisely formulated and express a single
strategy or action. They are designed to capture the main solutions discussed
within the public media discourse on a given topic.
Good policy options are clear, specific, and relevant to the issue at hand.
They should include demands for policy actions and express preferences for
specific outcomes. They can include a statement of the problem if necessary.

Additionally read through the dri_example_policy_options.txt and familiarize yourself with the
semantic structure of policy options.
Be prepared to produce a list of the top policy options in JSON Format, based
on the media outlets that will be provided to you.
When prompted, make sure to produce all statements in English and in close semantic
similarity to the examples. Do not include anything but a list of statements in
JSON Format in your responses
"""


assistant = client.beta.assistants.create(
    name=name,
    instructions=instructions,
    model="gpt-4o",
    tools=[{"type": "file_search"}],
)

# The spelling `asssistant_id` is kept because other cells assign the same name
asssistant_id = assistant.id

# Attach the vector store so file_search can retrieve the example policy options
assistant = client.beta.assistants.update(
    assistant_id=assistant.id,
    tool_resources={"file_search": {"vector_store_ids": [vector_store_id]}},
)

## Prompt the Assistant
### (Do this every run)

In [15]:
def customize_prompt(political_leaning):
    """Build the user prompt asking the assistant for policy options.

    Args:
        political_leaning: Label inserted in front of "-leaning media outlets"
            (e.g. "right", "left-liberal").

    Returns:
        The complete prompt text as a string.
    """
    # The prompt consistently speaks of policy options and refers to the
    # uploaded example file by its actual name (.txt)
    prompt = f"""
        Create a list of policy options for a DRI Survey on health costs in
        Switzerland from the media outlets provided below. All media outlets provided come from
        {political_leaning}-leaning media outlets. Make sure to do justice to the
        perspectives and opinions described in the media outlets. The statements should be
        in close semantic similarity to the examples provided in the dri_example_policy_options.txt
        file but the content should be representative of the political opinion of the media outlets.
        Make sure to capture the whole range of policy proposals that is represented in
        the media outlets. Then boil it down to the top 5 most important policy options.
        Policy options are typically concisely formulated and express a single
        action or strategy to a given issue.

        Format your response as a list of policy options in JSON format.
        Make sure not to include anything else but the JSON object in your response.
    """

    return prompt


In [None]:
def get_consideration_statements(political_leaning):
    """Ask the assistant for policy options of one political leaning.

    Despite its name, this version produces policy options: the complete
    top-500 CSV of the leaning is written to a temporary text file, uploaded,
    attached to a new thread together with the policy prompt, and the
    assistant's JSON answer is stored in a single result column.

    Relies on the module-level `client`, `assistant` and `customize_prompt`.

    Args:
        political_leaning: Leaning label used in the input file name and in
            the prompt.

    Returns:
        A DataFrame with 10 rows and one column named
        "Statements, similarity_score"; unused rows are filled with None.

    Raises:
        RuntimeError: If the run finishes without producing any message.
        json.JSONDecodeError: If the assistant's answer is not valid JSON.
    """
    policies_df = pd.DataFrame(index=range(10))

    # Path to the CSV file holding the top-500 texts of this leaning
    csv_path = f'data/06_LLM_data/input_data/{political_leaning}_top_500_similarity_scores.csv'
    df = pd.read_csv(csv_path)
    print("df loaded with shape:")
    print(df.shape)

    txt_file_path = f'data/06_LLM_data/input_data/{political_leaning}_top_500_similarity_score.txt'

    # Save all columns as a text file without index and header
    df.to_csv(txt_file_path, index=False, header=False)
    print(f"File {txt_file_path} has been created")

    # Upload through a context manager so the handle is closed before the
    # temporary file is deleted below
    with open(txt_file_path, "rb") as upload_handle:
        message_file = client.files.create(
            file=upload_handle,
            purpose="assistants"
        )
    print("message_file:")
    print(message_file)

    # Create a thread and attach the file to the message
    thread = client.beta.threads.create(
        messages=[
            {
                "role": "user",
                "content": customize_prompt(political_leaning),
                # Attach the new file to the message.
                "attachments": [
                    {"file_id": message_file.id, "tools": [{"type": "file_search"}]}
                ],
            }
        ]
    )

    # Actually delete the temporary file so the message below is truthful
    if os.path.exists(txt_file_path):
        os.remove(txt_file_path)
        print(f"File {txt_file_path} has been removed.")
    else:
        print(f"File {txt_file_path} does not exist.")

    print("Waiting for the assistant to generate the policy options...")
    run = client.beta.threads.runs.create_and_poll(
        thread_id=thread.id, assistant_id=assistant.id
    )

    messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))
    print("messages:")
    print(messages)

    # Fail with a descriptive error instead of an IndexError on messages[0]
    if not messages:
        raise RuntimeError(
            f"Assistant run for {political_leaning} returned no messages "
            f"(run status: {run.status})"
        )

    message_content = messages[0].content[0].text.value

    # Remove the markdown ```json``` block and newlines using regex
    cleaned_content = re.sub(r'```json|```', '', message_content).strip()

    # Load the cleaned content as a JSON array
    statements = json.loads(cleaned_content)

    # Create the column
    column_name = "Statements, similarity_score"

    # Pad the list of statements with None to match the length of the DataFrame
    padded_statements = statements + [None] * (len(policies_df) - len(statements))

    # Add the new column to the DataFrame
    policies_df[column_name] = padded_statements

    return policies_df

In [None]:
# Run the generation of DRI policy options for one political leaning

# Available political leanings
political_leaning_list = ['right', 'right-liberal', 'centrist', 'left-liberal', 'left']
political_leaning = political_leaning_list[0]

# Indices 0-7 of the similarity-score columns
similarity_score_list = list(range(8))

# Define vector store and assistant IDs if not already defined
# NOTE(review): both names are unconditionally set to empty strings here —
# confirm whether existing IDs are meant to be pasted in before running.
vector_store_id = ''
asssistant_id = ''

# This run uses index 4 ('left'); the call does not use `political_leaning`
df = get_consideration_statements(political_leaning_list[4])

# Rename accordingly: keep the assignment that matches the leaning just run
df_left = df
# df_leftliberal = df
# df_centrist = df
# df_rightliberal = df
# df_right = df

In [37]:
# Place the per-leaning result frames side by side (right -> left), then
# keep only the rows in which every column holds a value.
leaning_frames = [df_right, df_rightliberal, df_centrist, df_leftliberal, df_left]
merged_df = pd.concat(leaning_frames, axis=1)

# Rows containing at least one missing value are dropped
df_cleaned = merged_df.dropna()


# Saving

In [None]:
# Save DF
# Writes the cleaned, merged policy options to CSV without the index column.
df_cleaned.to_csv('data/06_LLM_data/outputs/policy_options/policy_preferences.csv', index=False)