In [1]:
# Data manipulation
import os
import pandas as pd
import csv
import json
import jsonlines
import jsonlines as jl
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from dotenv import load_dotenv
from typing import List
import re

# Machine Learning
import torch
import asyncio
import aiohttp
import torch.nn as nn
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Transformers and Langchain
from pydantic import ValidationError, BaseModel, Field

# API and utility
from huggingface_hub import login
from together import Together
import time
from tqdm import tqdm
import accelerate

# Load environment variables from a local .env file so the API key is not hardcoded in the notebook.
# NOTE(review): this path is absolute and machine-specific; adjust it when running elsewhere.
load_dotenv('/Users/guida/llm_argument_tasks/.env')

# os.environ.get returns None when TOGETHER_API_KEY is unset, so a missing key is not detected on this line.
api_key = os.environ.get('TOGETHER_API_KEY')
# Module-level Together client; the classification functions in later cells call it directly.
client = Together(api_key=api_key)

In [2]:
# Shape of the JSON object requested from the model: the classification functions in this
# notebook pass RelationClassification.model_json_schema() as the `schema` of `response_format`.
# NOTE(review): documented with comments rather than a class docstring because pydantic is
# understood to copy a class docstring into the generated schema's description — confirm
# before adding one, since that would change what is sent to the API.
# NOTE(review): the prompts only allow labels 1 and 5 and show the label as a string in their
# JSON example, while this schema accepts any integer; nothing visible here validates the reply.
class RelationClassification(BaseModel):
    # Echo of the comment id that was interpolated into the prompt.
    id: str = Field(description="The ID of the comment being analyzed")
    # Relation label chosen by the model for the comment/argument pair.
    label: int = Field(description="The label associated with the argument")

In [3]:
def classify_text(id: str, comment_text: str, argument: str) -> dict:
    """Ask the chat model how a gay-marriage comment relates to one argument.

    The system message embeds ``argument`` and ``id`` and instructs the model to
    answer with label 1 (attack) or label 5 (support); ``comment_text`` is sent
    as the user message. Decoding parameters are fixed (temperature 0, top_p 1,
    top_k 1) and the reply is requested as a JSON object using the schema of
    ``RelationClassification``.

    Args:
        id: Identifier of the comment, interpolated into the prompt's JSON example.
        comment_text: The comment to classify.
        argument: The argument the comment is compared against.

    Returns:
        The model's reply parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    system_prompt = f"""
            Analyze the given comment about gay marriage in relation to gay marriage. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - Label 1: if the comment attacks the argument.
            - Label 5: if the comment supports the argument.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": comment_text},
    ]
    # JSON mode constrained by the pydantic-generated schema.
    reply_format = {
        "type": "json_object",
        "schema": RelationClassification.model_json_schema(),
    }

    completion = client.chat.completions.create(
        messages=conversation,
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        temperature=0,
        top_p=1,
        top_k=1,
        response_format=reply_format,
    )

    # Only the first choice is consulted.
    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)

In [6]:
# Load the GM comment/argument pairs. The processing loop in this cell reads the
# `id`, `comment_text` and `argument_text` columns from this frame.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_no_3.csv')

def process_comments_with_arguments(df: pd.DataFrame, output_file: str, classify_fn=None):
    """Classify every comment/argument pair in ``df`` and stream the results to a JSONL file.

    Args:
        df: Frame providing the ``id``, ``comment_text`` and ``argument_text``
            columns; one classification call is made per row.
        output_file: Path of the JSON Lines file to write. It is opened in
            ``'w'`` mode, so an existing file is overwritten.
        classify_fn: Callable ``(id, comment_text, argument_text) -> dict`` used
            to classify a row. Defaults to ``classify_text``, resolved at call
            time, so existing two-argument calls behave as before.

    Returns:
        None. Each successful classification is written as one line of
        ``output_file``.

    Rows whose classification raises are reported on stdout and skipped, so the
    output may contain fewer lines than ``df`` has rows.
    """
    if classify_fn is None:
        classify_fn = classify_text

    with jsonlines.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            # Build the log preview via str(): an empty CSV cell is read as float NaN,
            # and slicing it inside an except handler would abort the whole run.
            preview = str(comment_text)[:50]

            try:
                classification = classify_fn(comment_id, comment_text, argument_text)
                writer.write(classification)
            except json.JSONDecodeError as e:
                # The model replied with something that is not valid JSON.
                print(f"JSONDecodeError for comment: {preview}... - Error: {e}")
            except Exception as e:
                # Deliberate best-effort: one failing row must not stop the batch.
                print(f"An unexpected error occurred for comment: {preview}... - Error: {e}")

# Classify every row of `gm` (one API call per row) and write the results to a JSONL file
# in the current working directory; the file is opened in 'w' mode, so re-running overwrites it.
process_comments_with_arguments(gm, 'comarg_gm_relation_identification_nopol_without3_llama.jsonl')

Processing comments: 100%|██████████| 436/436 [09:19<00:00,  1.28s/it]


In [7]:
def classify_text_ugip(id: str, comment_text: str, argument: str) -> dict:
    """Ask the chat model how an "Under God in the Pledge" comment relates to one argument.

    The system message embeds ``argument`` and ``id`` and instructs the model to
    answer with label 1 (attack) or label 5 (support); ``comment_text`` is sent
    as the user message. Decoding parameters are fixed (temperature 0, top_p 1,
    top_k 1) and the reply is requested as a JSON object using the schema of
    ``RelationClassification``.

    Args:
        id: Identifier of the comment, interpolated into the prompt's JSON example.
        comment_text: The comment to classify.
        argument: The argument the comment is compared against.

    Returns:
        The model's reply parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    instructions = f"""
            Analyze the given comment in relation to a specific argument about whether "Under God" should appear in the US Pledge of Allegiance. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - Label 1: if the comment attacks the argument.
            - Label 5: if the comment supports the argument.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """
    chat_messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": comment_text},
    ]
    # JSON mode constrained by the pydantic-generated schema.
    structured_output = {
        "type": "json_object",
        "schema": RelationClassification.model_json_schema(),
    }

    response = client.chat.completions.create(
        messages=chat_messages,
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        temperature=0,
        top_p=1,
        top_k=1,
        response_format=structured_output,
    )

    # Only the first choice is consulted.
    first_choice = response.choices[0]
    return json.loads(first_choice.message.content)

In [8]:
# Load the UGIP comment/argument pairs. The processing loop in this cell reads the
# `id`, `comment_text` and `argument_text` columns from this frame.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_no_3.csv')

def process_comments_with_arguments(df: pd.DataFrame, output_file: str, classify_fn=None):
    """Classify every comment/argument pair in ``df`` and stream the results to a JSONL file.

    Args:
        df: Frame providing the ``id``, ``comment_text`` and ``argument_text``
            columns; one classification call is made per row.
        output_file: Path of the JSON Lines file to write. It is opened in
            ``'w'`` mode, so an existing file is overwritten.
        classify_fn: Callable ``(id, comment_text, argument_text) -> dict`` used
            to classify a row. Defaults to ``classify_text_ugip``, resolved at
            call time, so existing two-argument calls behave as before.

    Returns:
        None. Each successful classification is written as one line of
        ``output_file``.

    Rows whose classification raises are reported on stdout and skipped, so the
    output may contain fewer lines than ``df`` has rows.
    """
    if classify_fn is None:
        classify_fn = classify_text_ugip

    with jsonlines.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            # Build the log preview via str(): an empty CSV cell is read as float NaN,
            # and slicing it inside an except handler would abort the whole run.
            preview = str(comment_text)[:50]

            try:
                classification = classify_fn(comment_id, comment_text, argument_text)
                writer.write(classification)
            except json.JSONDecodeError as e:
                # The model replied with something that is not valid JSON.
                print(f"JSONDecodeError for comment: {preview}... - Error: {e}")
            except Exception as e:
                # Deliberate best-effort: one failing row must not stop the batch.
                print(f"An unexpected error occurred for comment: {preview}... - Error: {e}")

# Classify every row of `ugip` (one API call per row) and write the results to a JSONL file
# in the current working directory; the file is opened in 'w' mode, so re-running overwrites it.
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification_nopol_without3_llama.jsonl')

Processing comments: 100%|██████████| 322/322 [07:10<00:00,  1.34s/it]
