In [24]:
from openai import OpenAI
import os
import pandas as pd
from pydantic import BaseModel, Field
from typing import List
from tqdm import tqdm
import json
from random import sample
import jsonlines as jsonl
from openai import OpenAI


# Data manipulation
import os
import pandas as pd
import csv
import json
import jsonlines
import jsonlines as jl
from pathlib import Path
from dotenv import load_dotenv
from typing import List
import re

# Machine Learning
import torch
import torch.nn as nn
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Transformers and Langchain
from pydantic import ValidationError, BaseModel, Field

# API and utility
from huggingface_hub import login
from together import Together
import time
from tqdm import tqdm
import accelerate


load_dotenv()

api_key = os.environ.get('TOGETHER_API_KEY')
client = Together(api_key=api_key)


In [25]:
class RelationClassification(BaseModel):
    id: str = Field(description="The ID of the comment being analyzed")    
    label: int = Field(description="The label associated with the argument") 

In [26]:
def prep_fewshot_samples_3ways(samples_file, n):
    df = pd.read_csv(samples_file)
    ids = df['id'].to_list()
    sampled = sample(ids, n)
    print(sampled)
    df = df[df['id'].isin(sampled)]
    comment = df.iloc[0]['comment_text']
    output = f"Comment: {comment}\n The comment attacks (1), makes no use (3), or supports (5) the following argument(s):\n"
    #print(output)
    for i, row in df.iterrows():
        argument = row['argument_text']
        output = f"{output} Argument {i}: {argument}\n"
        label = row['label']
        print(label)
        if label == 2:
            label = 1
            print(label)
        if label == 4:
            label = 5
            print(label)
        output = f"{output} Label: {label}\n\n"
    return output

In [27]:
def prep_fewshot_samples_5ways(samples_file, n):
    df = pd.read_csv(samples_file)
    ids = df['id'].to_list()
    sampled = sample(ids, n)
    print(sampled)
    df = df[df['id'].isin(sampled)]
    comment = df.iloc[0]['comment_text']
    output = f"Comment: {comment}\n The comment explicitly attacks (1), implicitly attacks (2), makes no use (3), implicitly supports (4), or explicitly supports (5) the following argument(s):\n"
    #print(output)
    for i, row in df.iterrows():
        argument = row['argument_text']
        output = f"{output} Argument {i}: {argument}\n"
        label = row['label']
        output = f"{output} Label: {label}\n\n"
    return output

In [28]:
def classify_text_3ways(id: str, comment_text: str, argument: str, topic: str, samples: str) -> dict:
    extract = client.chat.completions.create(
        messages=[
            {"role": "system", "content": f"""
            Analyze the given comment about {topic} in relation to a specific argument. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument.
            - 3 if the comment makes no use of the argument.
            - 5 if the comment supports the argument.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.

            Some examples:
            {samples}
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """},
            {"role": "user", "content": comment_text},
        ],
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        temperature=0,
        top_k=1,
        top_p=1,
        response_format={
            "type": "json_object",
            "schema": RelationClassification.model_json_schema(),
        }
    )
    
    return json.loads(extract.choices[0].message.content)

In [29]:
def classify_text_5ways(id: str, comment_text: str, argument: str, topic: str, samples: str) -> dict:
    extract = client.chat.completions.create(
        messages=[
            {"role": "system", "content": f"""
            Analyze the given comment about {topic} in relation to a specific argument. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument explicitly.
            - 2 if the comment attacks the argument implicitly/vaguely.
            - 3 if the comment makes no use of the argument.
            - 4 if the comment supports the argument implicitly/vaguely.
            - 5 if the comment supports the argument explicitly.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.

            Some examples:
            {samples}
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """},
            {"role": "user", "content": comment_text},
        ],
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        temperature=0,
        top_k=1,
        top_p=1,
        response_format={
            "type": "json_object",
            "schema": RelationClassification.model_json_schema(),
        }
    )
    
    return json.loads(extract.choices[0].message.content)

In [30]:
def process_comments_with_arguments(df: pd.DataFrame, output_file: str, topic: str, samples: str, detailed = False):
    with jsonl.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            if detailed == False:
                model = classify_text_3ways
            else:
                model = classify_text_5ways
            try:
                classification = model(comment_id, comment_text, argument_text, topic, samples)
                writer.write(classification)

            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {comment_text[:50]}... - Error: {e}")
                error_entry = {"id": comment_id, "label": 3}
                writer.write(error_entry)
                continue

            except Exception as e:
                print(f"An unexpected error occurred for comment: {comment_text[:50]}... - Error: {e}")
                error_entry = {"id": comment_id, "label": 3}
                writer.write(error_entry)
                continue

In [15]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_3ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv', 1)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification3ways_llama_1shot.jsonl', 'gay marriage', samples, detailed=False)

['11arg1']
3


Processing comments:   0%|          | 0/1279 [00:00<?, ?it/s]

Processing comments: 100%|██████████| 1279/1279 [27:36<00:00,  1.30s/it]


In [23]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv', 1)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification5ways_llama_1shot.jsonl', 'gay marriage', samples, detailed=False)

['11arg2']


Processing comments:   3%|▎         | 35/1279 [00:48<28:53,  1.39s/it]


KeyboardInterrupt: 

In [31]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv', 1)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification5ways_llama_1shot.jsonl', 'gay marriage', samples, detailed=False)

['11arg6']


Processing comments: 100%|██████████| 1279/1279 [27:36<00:00,  1.30s/it]


In [16]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv', 5)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification5ways_llama_5shot.jsonl', 'gay marriage', samples, detailed=True)

['11arg6', '11arg1', '11arg3', '11arg4', '11arg2']


Processing comments:   0%|          | 0/1279 [00:00<?, ?it/s]

Processing comments: 100%|██████████| 1279/1279 [28:36<00:00,  1.34s/it]


In [17]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_3ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv', 5)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification3way_llama_5shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=False)

['414721783arg5', '414721767arg4', '414721767arg5', '414721767arg6', '414721783arg6']
3
3
3
3
3


Processing comments: 100%|██████████| 1007/1007 [22:16<00:00,  1.33s/it]


In [14]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_3ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv', 1)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification3way_llama_1shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=False)

['414721767arg5']
3


Processing comments:   0%|          | 0/1007 [00:00<?, ?it/s]

Processing comments:  73%|███████▎  | 731/1007 [16:39<05:09,  1.12s/it]

An unexpected error occurred for comment: Religion has nothing in common with the basis the ... - Error: Error code: 400 - {"message": "Bad Request", "type_": "invalid_request_error"}


Processing comments: 100%|██████████| 1007/1007 [23:07<00:00,  1.38s/it]


In [None]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_3ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv', 1)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification3way_llama_1shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=False)

In [15]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv', 5)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification5way_llama_5shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=False)

['414721783arg6', '414721783arg4', '414721767arg5', '414721767arg6', '414721767arg4']


Processing comments: 100%|██████████| 1007/1007 [22:22<00:00,  1.33s/it]
