In [1]:
# Data manipulation
import os
import pandas as pd
import csv
import json
import jsonlines
import jsonlines as jl
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from dotenv import load_dotenv
from typing import List
import re

# Machine Learning
import torch
import asyncio
import aiohttp
import torch.nn as nn
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Transformers and Langchain
from pydantic import ValidationError, BaseModel, Field

# API and utility
from huggingface_hub import login
from together import Together
import time
from tqdm import tqdm
import accelerate
from openai import OpenAI

# Load variables from the project's .env file into the process environment before
# the key lookup and client construction below (presumably this is where
# OPENAI_API_KEY comes from -- confirm).
# NOTE(review): the path is absolute and tied to one machine's directory layout.
load_dotenv('/Users/guida/llm_argument_tasks/.env')

# NOTE(review): api_key is read here but is not passed to OpenAI() below;
# presumably the client reads OPENAI_API_KEY from the environment itself -- confirm.
api_key = os.environ.get('OPENAI_API_KEY')
# Module-level client used by the classify_text_* functions in later cells.
client = OpenAI()

In [3]:
# Structured-output schema for one classification result; it is passed as
# `response_format` to the chat-completion calls in the classify_text_* functions.
# NOTE(review): documented with `#` comments rather than a class docstring because
# pydantic is understood to copy a model's docstring into its JSON schema, which
# would change the schema sent with every request -- confirm before adding one.
class RelationClassification(BaseModel):
    # Identifier of the comment the label refers to (declared as a string).
    id: str = Field(description="The ID of the comment being analyzed")
    # Integer label for the comment/argument relation; the permitted values are
    # listed in each prompt and are not enforced by this schema.
    label: int = Field(description="The label associated with the argument")

In [4]:
def classify_text_gpt(id: str, comment_text: str, argument: str,
                      model: str = 'gpt-4o-mini') -> str:
    """Ask the chat model how a gay-marriage comment relates to one argument.

    The system prompt embeds ``argument`` and ``id`` and lists the labels
    1, 2, 4 and 5 as the only permitted answers; the comment itself is sent
    as the user message. The request uses ``RelationClassification`` as the
    response format with ``temperature=0`` and ``top_p=1``.

    Args:
        id: Identifier of the comment; interpolated into the JSON template in
            the prompt. (The name shadows the ``id`` builtin but is kept so
            existing keyword callers keep working.)
        comment_text: Text of the comment to classify.
        argument: Text of the argument the comment is compared against.
        model: Name of the chat model to query. Defaults to ``'gpt-4o-mini'``,
            the value that was previously hard-coded.

    Returns:
        The raw ``content`` of the first choice's message, i.e. the JSON *text*
        of the answer -- not a decoded dict. The processing loop in this
        notebook passes it to ``json.loads``.
        NOTE(review): presumably ``None`` when the model refuses -- confirm.
    """
    # NOTE(review): the first prompt sentence reads "...about gay marriage in
    # relation to gay marriage", which looks like a wording slip. The prompt is
    # left byte-for-byte unchanged here because editing it would change what
    # the model is asked.
    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": f"""
            Analyze the given comment about gay marriage in relation to gay marriage. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument explicitly.
            - 2 if the comment attacks the argument implicitly/vaguely.
            - 4 if the comment supports the argument implicitly/vaguely.
            - 5 if the comment supports the argument explicitly.
            
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """},
            {"role": "user", "content": comment_text},
        ],
        response_format=RelationClassification,
        temperature=0,
        top_p=1,
    )
    return completion.choices[0].message.content

In [5]:
# Load the GM comment/argument pairs. The processing loop below reads the columns
# 'id', 'comment_text' and 'argument_text' from this frame.
# NOTE(review): absolute, machine-specific path; the "no_3" suffix presumably means
# rows labelled 3 were removed beforehand -- confirm.
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_no_3.csv')

def process_comments_with_arguments(df: pd.DataFrame, output_file: str,
                                    classify_fn=None, error_label: int = 3):
    """Classify every (comment, argument) row of ``df`` and write one JSON line per row.

    Args:
        df: Frame providing the columns ``id``, ``comment_text`` and
            ``argument_text``.
        output_file: Path of the JSON Lines file, opened with ``mode='w'``.
        classify_fn: Callable ``(id, comment_text, argument)`` returning the
            model's answer as JSON text. Defaults to ``classify_text_gpt``,
            resolved when the function is called.
        error_label: Label written for a row whose classification fails.
            Defaults to 3, the value that was previously hard-coded.

    Exactly one record is written per row. On success it is the decoded model
    answer with its ``id`` replaced by the row's own id, so a mis-echoed id from
    the model cannot detach a label from its comment. On failure (invalid JSON,
    a non-object answer, or any other ``Exception`` from the classifier) a
    message is printed and ``{"id": <row id>, "label": error_label}`` is written
    instead. Ids are written as strings in both cases, matching the ``id: str``
    field of ``RelationClassification``.
    """
    if classify_fn is None:
        classify_fn = classify_text_gpt

    with jsonlines.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            # str() yields the same text the prompt's f-string interpolates for the id.
            comment_id = str(row['id'])
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            # A missing comment arrives as a float NaN; slicing it inside an error
            # handler would raise there and abort the whole run, so go through str().
            preview = str(comment_text)[:50]
            try:
                gpt_response = classify_fn(comment_id, comment_text, argument_text)
                record = json.loads(gpt_response)
                # Raises TypeError for a non-object answer, which is handled below.
                record['id'] = comment_id
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {preview}... - Error: {e}")
                record = {"id": comment_id, "label": error_label}
            except Exception as e:
                print(f"An unexpected error occurred for comment: {preview}... - Error: {e}")
                record = {"id": comment_id, "label": error_label}
            writer.write(record)

# Classify every GM row and write the results as JSON Lines to a file given by a
# relative path (resolved against the current working directory). The recorded
# output below shows 436 rows processed in roughly five minutes.
process_comments_with_arguments(gm, 'comarg_gm_relation_identification_polarity_without3_gpt.jsonl')

Processing comments: 100%|██████████| 436/436 [05:05<00:00,  1.43it/s]


In [6]:
def classify_text_ugip(id: str, comment_text: str, argument: str,
                       model: str = 'gpt-4o-mini') -> str:
    """Ask the chat model how an "Under God" pledge comment relates to one argument.

    The system prompt embeds ``argument`` and ``id`` and lists the labels
    1 (attacks), 3 (makes no use of) and 5 (supports) as the only permitted
    answers; the comment itself is sent as the user message. The request uses
    ``RelationClassification`` as the response format with ``temperature=0``
    and ``top_p=1``.

    Args:
        id: Identifier of the comment; interpolated into the JSON template in
            the prompt. (The name shadows the ``id`` builtin but is kept so
            existing keyword callers keep working.)
        comment_text: Text of the comment to classify.
        argument: Text of the argument the comment is compared against.
        model: Name of the chat model to query. Defaults to ``'gpt-4o-mini'``,
            the value that was previously hard-coded.

    Returns:
        The raw ``content`` of the first choice's message, i.e. the JSON *text*
        of the answer -- not a decoded dict. The processing loop in this
        notebook passes it to ``json.loads``.
        NOTE(review): presumably ``None`` when the model refuses -- confirm.
    """
    # NOTE(review): this prompt offers label 3 ("makes no use of the argument"),
    # while the notebook loads a "..._no_3" data file and writes to a
    # "...without3..." output file -- confirm that offering 3 is intended. The
    # prompt is left byte-for-byte unchanged here.
    completion = client.beta.chat.completions.parse(
        model=model,
        messages=[
            {"role": "system", "content": f"""
            Analyze the given comment in relation to a specific argument about whether "Under God" should appear in the US Pledge of Allegiance.
            You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument.
            - 3 if the comment makes no use of the argument.
            - 5 if the comment supports the argument.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """},
            {"role": "user", "content": comment_text},
        ],
        response_format=RelationClassification,
        temperature=0,
        top_p=1,
    )
    return completion.choices[0].message.content

In [7]:
# Load the UGIP comment/argument pairs. The processing loop below reads the columns
# 'id', 'comment_text' and 'argument_text' from this frame.
# NOTE(review): absolute, machine-specific path; the "no_3" suffix presumably means
# rows labelled 3 were removed beforehand -- confirm.
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_no_3.csv')

def process_comments_with_arguments(df: pd.DataFrame, output_file: str,
                                    classify_fn=None, error_label: int = 3):
    """Classify every (comment, argument) row of ``df`` and write one JSON line per row.

    This definition reuses the name of the function defined in an earlier cell
    and replaces it from this point on; its default classifier is
    ``classify_text_ugip``.

    Args:
        df: Frame providing the columns ``id``, ``comment_text`` and
            ``argument_text``.
        output_file: Path of the JSON Lines file, opened with ``mode='w'``.
        classify_fn: Callable ``(id, comment_text, argument)`` returning the
            model's answer as JSON text. Defaults to ``classify_text_ugip``,
            resolved when the function is called.
        error_label: Label written for a row whose classification fails.
            Defaults to 3, the value that was previously hard-coded. The UGIP
            prompt also offers 3 as a regular label, so with the default a
            failed row cannot be told apart from a genuine 3; pass another
            value to make failures distinguishable.

    Exactly one record is written per row. On success it is the decoded model
    answer with its ``id`` replaced by the row's own id, so a mis-echoed id from
    the model cannot detach a label from its comment. On failure (invalid JSON,
    a non-object answer, or any other ``Exception`` from the classifier) a
    message is printed and ``{"id": <row id>, "label": error_label}`` is written
    instead. Ids are written as strings in both cases, matching the ``id: str``
    field of ``RelationClassification``.
    """
    if classify_fn is None:
        classify_fn = classify_text_ugip

    with jsonlines.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            # str() yields the same text the prompt's f-string interpolates for the id.
            comment_id = str(row['id'])
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            # A missing comment arrives as a float NaN; slicing it inside an error
            # handler would raise there and abort the whole run, so go through str().
            preview = str(comment_text)[:50]
            try:
                gpt_response = classify_fn(comment_id, comment_text, argument_text)
                record = json.loads(gpt_response)
                # Raises TypeError for a non-object answer, which is handled below.
                record['id'] = comment_id
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {preview}... - Error: {e}")
                record = {"id": comment_id, "label": error_label}
            except Exception as e:
                print(f"An unexpected error occurred for comment: {preview}... - Error: {e}")
                record = {"id": comment_id, "label": error_label}
            writer.write(record)

# Classify every UGIP row and write the results as JSON Lines to a file given by a
# relative path (resolved against the current working directory). The recorded
# output below shows 322 rows processed in roughly three and a half minutes.
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification_polarity_without3_gpt.jsonl')

Processing comments: 100%|██████████| 322/322 [03:36<00:00,  1.49it/s]
