In [40]:
# Data manipulation
import os
import pandas as pd
import csv
import json
import jsonlines
import jsonlines as jl
from pathlib import Path
from dotenv import load_dotenv
from typing import List
import re

# Machine Learning
import torch
import torch.nn as nn
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Transformers and Langchain
from pydantic import ValidationError, BaseModel, Field

# API and utility
from huggingface_hub import login
from together import Together
import time
from tqdm import tqdm
import accelerate


# Load variables from a .env file (python-dotenv) before the key lookup below.
load_dotenv()

# One Together client is shared by the request helpers defined later in the
# notebook. The key is read from the environment and is None when unset.
api_key = os.getenv('TOGETHER_API_KEY')
client = Together(api_key=api_key)

In [36]:
def get_llm(model_type):
    """Send a chat-completion request for the given model family.

    Despite the name, this does not build a reusable model handle: it calls
    ``client.chat.completions.create`` immediately and returns that call's
    result.

    Args:
        model_type: Model family selector. Only ``"llama"`` is supported.

    Returns:
        Whatever ``client.chat.completions.create`` returns for
        ``meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo``.

    Raises:
        ValueError: If ``model_type`` is not a supported selector. Previously
            the function fell through and silently returned ``None``.
    """
    if model_type != "llama":
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'llama'"
        )

    # NOTE(review): the request is sent with an empty message list — confirm
    # that this is intended and accepted by the API.
    return client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        max_tokens=512,
        messages=[],
        # Sampling parameters pinned: temperature 0, top_p 1, top_k 1.
        temperature=0,
        top_p=1,
        top_k=1,
    )
# NOTE(review): get_llm issues a chat-completion request (with an empty
# message list) as soon as this line runs, and `llm` is not referenced again
# in the visible part of the notebook — confirm this call is still needed.
llm = get_llm("llama")

In [37]:
# Pydantic model describing the JSON object requested from the LLM; its JSON
# schema is what classify_text passes as the response_format "schema".
class ArgumentClassification(BaseModel):
    # Identifier of the comment being classified.
    id: str = Field(description="The ID the comment being analyzed")
    # Label for the comment/argument pair.
    # NOTE(review): the "0 or 1" restriction exists only in the description
    # text; the declared type is a plain int.
    label: int = Field(description="The label associated with the argument (0 or 1)")

In [38]:
def classify_text(
    id: str,
    comment_text: str,
    argument: str,
    topic: str = "gay marriage",
    model: str = "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
) -> dict:
    """Ask the LLM whether a comment makes use of a given argument.

    Sends one chat-completion request in JSON mode, constrained by the
    ``ArgumentClassification`` schema, and parses the reply.

    Args:
        id: Identifier of the comment; interpolated into the JSON template
            shown to the model.
        comment_text: The comment to analyze (sent as the user message).
        argument: The argument the comment is checked against.
        topic: Debate topic named in the system prompt. Defaults to
            ``"gay marriage"`` so existing callers get the original prompt;
            pass another topic when classifying a different dataset.
        model: Together model identifier to query. Defaults to the model
            that was previously hard-coded.

    Returns:
        The dict obtained by JSON-decoding the first choice's message
        content. It is not validated against ``ArgumentClassification``.

    Raises:
        json.JSONDecodeError: If the message content is not valid JSON.
    """
    # No temperature is passed here, so the API's default sampling applies.
    extract = client.chat.completions.create(
        messages=[
            {"role": "system", "content": f"""
            Analyze the given comment about {topic} in relation to a specific argument. You need to:
            Identify if the comment makes use of the given argument. If it does, assign the label 1. If it does not, assign the label 0.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            """},
            {"role": "user", "content": comment_text},
        ],
        model=model,
        response_format={
            "type": "json_object",
            "schema": ArgumentClassification.model_json_schema(),
        }
    )

    return json.loads(extract.choices[0].message.content)


## GAY MARRIAGE

In [41]:
# Load the structured gay-marriage (GM) dataset; the path is relative to the
# notebook's working directory. The processing loop below reads the 'id',
# 'comment_text' and 'argument_text' columns from it.
gm = pd.read_csv('../../clean_data/GM_structured.csv')

def _classify_with_retry(comment_id, comment_text, argument_text, max_retries, retry_delay):
    """Call classify_text, retrying non-JSON failures with exponential backoff."""
    for attempt in range(max_retries + 1):
        try:
            return classify_text(comment_id, comment_text, argument_text)
        except json.JSONDecodeError:
            # A malformed model reply is reported by the caller, not retried.
            raise
        except Exception:
            if attempt >= max_retries:
                raise
            # Wait retry_delay, 2*retry_delay, 4*retry_delay, ... between tries.
            time.sleep(retry_delay * (2 ** attempt))


def process_comments_with_arguments(
    df: pd.DataFrame,
    output_file: str,
    max_retries: int = 3,
    retry_delay: float = 2.0,
):
    """Classify every comment/argument pair in ``df`` and write results as JSON Lines.

    Each row is sent to ``classify_text``; every successful result is written
    as one line of ``output_file``. Rows that still fail are reported with
    ``print`` and skipped, so one bad row never aborts the run.

    Request failures (for example HTTP 429 rate-limit rejections) are retried
    with exponential backoff before the row is given up on; previously such
    rows were dropped after a single attempt.

    Args:
        df: Frame with ``id``, ``comment_text`` and ``argument_text`` columns.
        output_file: Path of the JSONL file. It is opened in write mode, so an
            existing file is overwritten.
        max_retries: Extra attempts per row after a non-JSON failure.
            ``0`` restores the original single-attempt behaviour.
        retry_delay: Base delay in seconds; doubled after each failed attempt.
    """
    with jsonlines.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            # str() keeps the error report itself from failing when the cell
            # holds a non-string value such as NaN.
            preview = str(comment_text)[:50]

            try:
                classification = _classify_with_retry(
                    comment_id, comment_text, argument_text, max_retries, retry_delay
                )
                writer.write(classification)

            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {preview}... - Error: {e}")

            except Exception as e:
                print(f"An unexpected error occurred for comment: {preview}... - Error: {e}")

# Classify every GM row and write the results to a JSONL file in the current
# working directory. The file is opened in write mode, so re-running this
# cell starts the output over from scratch.
process_comments_with_arguments(gm, 'comarg_gm_argument_identification.jsonl')

Processing comments:   4%|▎         | 46/1285 [01:55<2:41:42,  7.83s/it]

An unexpected error occurred for comment: All these arguments on my left are and have always... - Error: Error code: 429 - {"message": "Request was rejected due to request rate limiting. Your rate limits are 60 RPM (1 QPS) and 60000 TPM (1000 TPS). See details: https://docs.together.ai/docs/rate-limits", "type_": "credit_limit"}


Processing comments:   4%|▍         | 56/1285 [02:39<58:24,  2.85s/it]  


KeyboardInterrupt: 

In [None]:
# Load the structured UGIP dataset; the path is relative to the notebook's
# working directory. The processing loop below reads the 'id',
# 'comment_text' and 'argument_text' columns from it.
ugip = pd.read_csv('../../clean_data/UGIP_structured.csv')

def _classify_with_retry(comment_id, comment_text, argument_text, max_retries, retry_delay):
    """Call classify_text, retrying non-JSON failures with exponential backoff."""
    for attempt in range(max_retries + 1):
        try:
            return classify_text(comment_id, comment_text, argument_text)
        except json.JSONDecodeError:
            # A malformed model reply is reported by the caller, not retried.
            raise
        except Exception:
            if attempt >= max_retries:
                raise
            # Wait retry_delay, 2*retry_delay, 4*retry_delay, ... between tries.
            time.sleep(retry_delay * (2 ** attempt))


def process_comments_with_arguments(
    df: pd.DataFrame,
    output_file: str,
    max_retries: int = 3,
    retry_delay: float = 2.0,
):
    """Classify every comment/argument pair in ``df`` and write results as JSON Lines.

    NOTE(review): a function with this same name is also defined in the GM
    cell earlier in the notebook; whichever cell ran last wins. Consider
    keeping a single definition.

    Each row is sent to ``classify_text``; every successful result is written
    as one line of ``output_file``. Rows that still fail are reported with
    ``print`` and skipped, so one bad row never aborts the run.

    Request failures (for example HTTP 429 rate-limit rejections) are retried
    with exponential backoff before the row is given up on.

    Args:
        df: Frame with ``id``, ``comment_text`` and ``argument_text`` columns.
        output_file: Path of the JSONL file. It is opened in write mode, so an
            existing file is overwritten.
        max_retries: Extra attempts per row after a non-JSON failure.
            ``0`` restores the original single-attempt behaviour.
        retry_delay: Base delay in seconds; doubled after each failed attempt.
    """
    with jsonlines.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            # str() keeps the error report itself from failing when the cell
            # holds a non-string value such as NaN.
            preview = str(comment_text)[:50]

            try:
                classification = _classify_with_retry(
                    comment_id, comment_text, argument_text, max_retries, retry_delay
                )
                writer.write(classification)

            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {preview}... - Error: {e}")

            except Exception as e:
                print(f"An unexpected error occurred for comment: {preview}... - Error: {e}")

# Classify every UGIP row and write the results to a JSONL file in the
# current working directory (opened in write mode, so a re-run overwrites it).
# NOTE(review): classify_text's system prompt describes the comment as being
# "about gay marriage" unless told otherwise — confirm that wording is
# intended for the UGIP data.
process_comments_with_arguments(ugip, 'comarg_ugip_argument_identification.jsonl')

Processing comments: 100%|██████████| 1013/1013 [36:30<00:00,  2.16s/it] 


## Convert JSON into CSV for evaluation

In [9]:
# NOTE(review): absolute, user-specific paths; this cell only runs on a
# machine with this directory layout.
input_ugip = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_ugip_argument_identification.json'
output_ugip = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_ugip_argument_identification.csv'

# Read the input as UTF-8 explicitly so decoding does not depend on the
# platform's default locale (the CSV below is written as UTF-8 as well).
with open(input_ugip, 'r', encoding='utf-8') as f:
    data = json.load(f)


with open(output_ugip, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['comment_text', 'argument_text', 'label'])

    # The input is expected to be a list of lists of dicts, each dict holding
    # 'comment', 'argument' and 'label'.
    # NOTE(review): the JSONL records written by the classification cells
    # follow a schema with only 'id' and 'label' — confirm this input file is
    # produced by a different step.
    for inner_list in data:  # one inner list per outer entry
        for item in inner_list:  # one CSV row per dict
            writer.writerow([item['comment'], item['argument'], item['label']])

In [7]:
# NOTE(review): absolute, user-specific paths; this cell only runs on a
# machine with this directory layout.
input_gm = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_gm_argument_identification.json'
output_gm = '/Users/guida/llm_argument_tasks/output_files/llama3/comarg_gm_argument_identification.csv'

# Read the input as UTF-8 explicitly so decoding does not depend on the
# platform's default locale (the CSV below is written as UTF-8 as well).
with open(input_gm, 'r', encoding='utf-8') as f:
    data = json.load(f)

with open(output_gm, 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['comment_text', 'argument_text', 'label'])

    # The input is expected to be a list of lists of dicts, each dict holding
    # 'comment', 'argument' and 'label'.
    # NOTE(review): the JSONL records written by the classification cells
    # follow a schema with only 'id' and 'label' — confirm this input file is
    # produced by a different step.
    for inner_list in data:  # one inner list per outer entry
        for item in inner_list:  # one CSV row per dict
            writer.writerow([item['comment'], item['argument'], item['label']])