In [None]:
# Data manipulation
import os
import pandas as pd
import csv
import json
import jsonlines
import jsonlines as jl
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from dotenv import load_dotenv
from typing import List
import re

# Machine Learning
import torch
import asyncio
import aiohttp
import torch.nn as nn
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Transformers and Langchain
from pydantic import ValidationError, BaseModel, Field

# API and utility
from huggingface_hub import login
from together import Together
import time
from tqdm import tqdm
import accelerate

# Load environment variables from the .env file at this (machine-specific) absolute path.
load_dotenv('/Users/guida/llm_argument_tasks/.env')

# The Together API key is read from the environment, never hard-coded here.
# NOTE(review): api_key is None when TOGETHER_API_KEY is not set.
api_key = os.getenv('TOGETHER_API_KEY')
client = Together(api_key=api_key)

In [None]:
# Shape of the JSON object requested from the model for a single comment.
# Documented with comments on purpose: a class docstring would be copied by
# pydantic into the generated JSON schema as its "description".
class ArgumentSpan(BaseModel):
    # Identifier of the comment the span was taken from.
    id: str = Field(
        description="The ID of the comment being analyzed",
    )
    # Text from the comment that makes use of the argument.
    span: str = Field(
        description="The span of text in the comment that makes use of the argument",
    )

In [None]:
def classify_text(id: str, comment_text: str, topic: str, argument_text: str) -> dict:
    """Ask the chat model which span of a comment makes use of a given argument.

    The system message carries the instructions, the topic, the argument and
    the JSON template (pre-filled with ``id``); the comment itself is sent as
    the user message. The request uses temperature 0, top_k 1 and top_p 1 and
    asks for a JSON object constrained by the ``ArgumentSpan`` schema.

    Args:
        id: Identifier of the comment, interpolated into the JSON template.
        comment_text: The comment to analyze.
        topic: Topic named in the instructions.
        argument_text: The argument whose use should be located in the comment.

    Returns:
        The model's reply parsed with ``json.loads``.

    Raises:
        json.JSONDecodeError: If the reply content is not valid JSON.
    """
    # The prompt is sent exactly as written here, indentation included.
    system_prompt = f"""
            Analyze the given comment in relation to a specific argument about {topic}. You need to:
            Identify the relevant span of text where the comment makes use of the given argument. 
            Provide the exact span of the text in the comment that makes use of the argument.
            Do NOT include the comment or the argument in the response, or any additional text.
            
            The argument to analyze is: {argument_text}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "span": "the relevant span of text"
            }}
            
            Analyze the following comment in relation to the given argument:
            """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": comment_text},
    ]
    # Request a JSON object that follows the ArgumentSpan schema.
    json_reply_format = {
        "type": "json_object",
        "schema": ArgumentSpan.model_json_schema(),
    }

    completion = client.chat.completions.create(
        messages=conversation,
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        temperature=0,
        top_k=1,
        top_p=1,
        response_format=json_reply_format,
    )

    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)

In [None]:
def process_dataframe_comments(df: pd.DataFrame, topic: str) -> List[dict]:
    """Run span identification on every row of ``df`` and append results to a JSONL file.

    For each row, the argument mapped to the row's label (for ``topic``) is
    looked up and passed to ``classify_text`` together with the comment. Each
    successful result is appended to
    ``yru_{topic}_identification_with_negatives.jsonl`` as soon as it is
    produced. Rows whose request or JSON parsing fails are reported on stdout
    and skipped.

    Args:
        df: Comments to process; the ``id``, ``text`` and ``label`` columns are read.
        topic: Topic name; selects the label-to-argument mapping, is passed to
            ``classify_text`` and is part of the output file name.

    Returns:
        The results written during this call, in row order; failed rows are omitted.
    """
    label_to_argument = topic_label_to_argument.get(topic, {})
    output_path = f'yru_{topic}_identification_with_negatives.jsonl'

    # Results are collected here because the jsonlines writer is write-only
    # and cannot be read back once the file is closed.
    results: List[dict] = []

    # Append mode: a re-run adds to the existing file instead of truncating it.
    with jsonlines.open(output_path, mode='a') as writer:
        for _, row in tqdm(df.iterrows(), desc="Processing comments", unit="comment", total=len(df)):
            comment_id = row['id']
            comment_text = row['text']
            comment_label = row['label']

            # NOTE(review): this is None when the label has no mapping for the
            # topic, and None is then interpolated into the prompt — confirm
            # whether such rows should be skipped instead.
            argument_text = label_to_argument.get(comment_label)
            try:
                classification = classify_text(
                    id=comment_id,
                    comment_text=comment_text,
                    topic=topic,
                    argument_text=argument_text
                )
                writer.write(classification)
                results.append(classification)

            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {comment_text[:50]}... - Error: {e}")
                continue

            except Exception as e:
                print(f"An unexpected error occurred for comment: {comment_text[:50]}... - Error: {e}")
                continue

    return results

In [None]:
# Load the abortion comments and run span identification over them.
ab = pd.read_csv('../../clean_data/yru_abortion.csv')
topic = 'abortion'

process_dataframe_comments(df=ab, topic=topic)