In [15]:
from openai import OpenAI
import os
import pandas as pd
from pydantic import BaseModel, Field
from typing import List
from tqdm import tqdm
import json
from random import sample
import jsonlines as jsonl
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv('/Users/guida/llm_argument_tasks/.env')

api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI()

In [16]:
class RelationClassification(BaseModel):
    id: str = Field(description="The ID of the comment being analyzed")    
    label: int = Field(description="The label associated with the argument") 

In [17]:
def prep_fewshot_samples_2ways(samples_file, n):
    df = pd.read_csv(samples_file)
    ids = df['id'].to_list()
    sampled = sample(ids, n)
    print(sampled)
    df = df[df['id'].isin(sampled)]
    comment = df.iloc[0]['comment_text']
    output = f"Comment: {comment}\n The comment attacks (1), or supports (5) the following argument(s):\n"
    #print(output)
    for i, row in df.iterrows():
        argument = row['argument_text']
        output = f"{output} Argument {i}: {argument}\n"
        label = row['label']
        print(label)
        if label == 2:
            label = 1
            print(label)
        if label == 4:
            label = 5
            print(label)
        output = f"{output} Label: {label}\n\n"
    return output

In [18]:
def prep_fewshot_samples_5ways(samples_file, n):
    df = pd.read_csv(samples_file)
    ids = df['id'].to_list()
    sampled = sample(ids, n)
    print(sampled)
    df = df[df['id'].isin(sampled)]
    comment = df.iloc[0]['comment_text']
    output = f"Comment: {comment}\n The comment explicitly attacks (1), implicitly attacks (2), implicitly supports (4), or explicitly supports (5) the following argument(s):\n"
    #print(output)
    for i, row in df.iterrows():
        argument = row['argument_text']
        output = f"{output} Argument {i}: {argument}\n"
        label = row['label']
        output = f"{output} Label: {label}\n\n"
    return output

In [19]:
def classify_text_2ways(id: str, comment_text: str, argument: str, topic: str, samples: str) -> dict:
    completion = client.beta.chat.completions.parse(
        model='gpt-4o-mini',
        messages=[
            {"role": "system", "content": f"""
            Analyze the given comment about {topic} in relation to a specific argument. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument.
            - 5 if the comment supports the argument.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.

            Some examples:
            {samples}
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """},
            {"role": "user", "content": comment_text},
        ],
        response_format=RelationClassification,
        temperature=0,
        top_p=1,
    )
    return completion.choices[0].message.content

In [20]:
def classify_text_5ways(id: str, comment_text: str, argument: str, topic: str, samples: str) -> dict:
    completion = client.beta.chat.completions.parse(
        model='gpt-4o-mini',
        messages=[
            {"role": "system", "content": f"""
            Analyze the given comment about {topic} in relation to a specific argument. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument explicitly.
            - 2 if the comment attacks the argument implicitly/vaguely.
            - 4 if the comment supports the argument implicitly/vaguely.
            - 5 if the comment supports the argument explicitly.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.

            Some examples:
            {samples}
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """},
            {"role": "user", "content": comment_text},
        ],
        response_format=RelationClassification,
        temperature=0,
        top_p=1,
    )
    return completion.choices[0].message.content

In [21]:
def process_comments_with_arguments(df: pd.DataFrame, output_file: str, topic: str, samples: str, detailed = False):
    with jsonl.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            if detailed == False:
                model = classify_text_2ways
            else:
                model = classify_text_5ways
            try:
                gpt_response = model(comment_id, comment_text, argument_text, topic, samples)
                classification = json.loads(gpt_response)
                output_entry = {"id": comment_id, "label": classification["label"]}
                writer.write(output_entry)

            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {comment_text[:50]}... - Error: {e}")
                error_entry = {"id": comment_id, "label": 3}
                writer.write(error_entry)
                continue

            except Exception as e:
                print(f"An unexpected error occurred for comment: {comment_text[:50]}... - Error: {e}")
                error_entry = {"id": comment_id, "label": 3}
                writer.write(error_entry)
                continue

In [22]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_2ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_one_shot.csv', 1)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification2ways_gpt_1shot.jsonl', 'gay marriage', samples, detailed=False)

['108arg2']
4
5


Processing comments: 100%|██████████| 431/431 [07:01<00:00,  1.02it/s]


In [23]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_2ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv',5)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification2ways_gpt_5shot.jsonl', 'gay marriage', samples, detailed=False)

['108arg2', '175arg4', '161arg4', '5arg5', '198arg5']
1
2
1
4
5
5
4
5


Processing comments: 100%|██████████| 431/431 [07:03<00:00,  1.02it/s]


In [24]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_one_shot.csv',1)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification5ways_gpt_1shot.jsonl', 'gay marriage', samples, detailed=True)

['108arg2']


Processing comments: 100%|██████████| 431/431 [07:11<00:00,  1.00s/it]


In [25]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv',5)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification5ways_gpt_5shot.jsonl', 'gay marriage', samples, detailed=True)

['5arg5', '175arg4', '198arg5', '161arg4', '108arg2']


Processing comments:   0%|          | 0/431 [00:00<?, ?it/s]

Processing comments: 100%|██████████| 431/431 [07:19<00:00,  1.02s/it]


In [26]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_2ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_one_shot.csv', 1)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification2ways_gpt_1shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=False)

['414721757arg6']
2
1


Processing comments: 100%|██████████| 317/317 [05:45<00:00,  1.09s/it]


In [27]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_2ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv',5)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification2ways_gpt_5shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=False)

['414721922arg3', '414721727arg3', '414721757arg6', '414721831arg6', '414721738arg1']
1
2
1
4
5
5
5


Processing comments: 100%|██████████| 317/317 [05:31<00:00,  1.05s/it]


In [28]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_one_shot.csv',1)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification5ways_gpt_1shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=True)

['414721757arg6']


Processing comments:   0%|          | 0/317 [00:00<?, ?it/s]

Processing comments: 100%|██████████| 317/317 [05:22<00:00,  1.02s/it]


In [29]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv',5)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification5ways_gpt_5shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=True)

['414721727arg3', '414721922arg3', '414721738arg1', '414721757arg6', '414721831arg6']


Processing comments: 100%|██████████| 317/317 [05:39<00:00,  1.07s/it]
