In [38]:

from random import sample
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
import accelerate
import jsonlines
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os, csv, re
import openai
import inspect
import typing_extensions as typing


from modelsmith import Forge, VertexAIGenerativeModel

from typing import List
from pydantic import ValidationError, BaseModel, Field
import pandas as pd
from pathlib import Path
import jsonlines as jl
import json
import time
from tqdm import tqdm
 

import google.generativeai as genai
from google.generativeai.types import RequestOptions
from google.api_core import retry
from google.auth import default, transport
from modelsmith import Forge, VertexAIGenerativeModel
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part
from dotenv import load_dotenv
import vertexai

load_dotenv('/Users/guida/llm_argument_tasks/.env')

api_key = os.environ.get('OPENAI_API_KEY')
PROJECT_ID = os.environ.get('GEMINI_PROJECT_ID')
LOCATION = "us-central1"

vertexai.init(
        project="leas-team",
    )

In [39]:
class RelationClassification(typing.TypedDict):
    id: str 
    label: int 

In [40]:
def prep_fewshot_samples_2ways(samples_file, n):
    df = pd.read_csv(samples_file)
    ids = df['id'].to_list()
    sampled = sample(ids, n)
    print(sampled)
    df = df[df['id'].isin(sampled)]
    comment = df.iloc[0]['comment_text']
    output = f"Comment: {comment}\n The comment attacks (1), or supports (5) the following argument(s):\n"
    #print(output)
    for i, row in df.iterrows():
        argument = row['argument_text']
        output = f"{output} Argument {i}: {argument}\n"
        label = row['label']
        print(label)
        if label == 2:
            label = 1
            print(label)
        if label == 4:
            label = 5
            print(label)
        output = f"{output} Label: {label}\n\n"
    return output

In [41]:
def prep_fewshot_samples_5ways(samples_file, n):
    df = pd.read_csv(samples_file)
    ids = df['id'].to_list()
    sampled = sample(ids, n)
    print(sampled)
    df = df[df['id'].isin(sampled)]
    comment = df.iloc[0]['comment_text']
    output = f"Comment: {comment}\n The comment explicitly attacks (1), implicitly attacks (2), implicitly supports (4), or explicitly supports (5) the following argument(s):\n"
    #print(output)
    for i, row in df.iterrows():
        argument = row['argument_text']
        output = f"{output} Argument {i}: {argument}\n"
        label = row['label']
        output = f"{output} Label: {label}\n\n"
    return output

In [42]:
def classify_text_2ways(id: str, comment_text: str, argument: str, topic: str, samples: str) -> dict:
        model = genai.GenerativeModel("gemini-1.5-flash")
        
        safety_settings = {
            "HARM_CATEGORY_HARASSMENT": "block_none",
            "HARM_CATEGORY_HATE_SPEECH": "block_none",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
            "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none"
        }
        prompt=f"""
            Analyze the given comment about {topic} in relation to a specific argument. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument.
            - 5 if the comment supports the argument.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.

            Some examples:
            {samples}
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            {comment_text},
            """
        response = model.generate_content(
                    prompt,
                    generation_config=genai.types.GenerationConfig(
                        response_mime_type="application/json",
                        response_schema=RelationClassification,
                        temperature=0,
                        top_p=1,
                    ),
                    safety_settings=safety_settings
        )
                
        return response.text

In [43]:
def classify_text_5ways(id: str, comment_text: str, argument: str, topic: str, samples: str) -> dict:
        model = genai.GenerativeModel("gemini-1.5-flash")
        
        safety_settings = {
            "HARM_CATEGORY_HARASSMENT": "block_none",
            "HARM_CATEGORY_HATE_SPEECH": "block_none",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
            "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none"
        }
        prompt=f"""
            Analyze the given comment about {topic} in relation to a specific argument. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument explicitly.
            - 2 if the comment attacks the argument implicitly/vaguely.
            - 4 if the comment supports the argument implicitly/vaguely.
            - 5 if the comment supports the argument explicitly.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.

            Some examples:
            {samples}
            
            The argument to analyze is: {argument}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            {comment_text},
            """
        response = model.generate_content(
                    prompt,
                    generation_config=genai.types.GenerationConfig(
                        response_mime_type="application/json",
                        response_schema=RelationClassification,
                        temperature=0,
                        top_p=1,
                    ),
                    safety_settings=safety_settings
        )
                
        return response.text

In [44]:
def process_comments_with_arguments(df: pd.DataFrame, output_file: str, topic: str, samples: str, detailed = False):
    with jsonlines.open(output_file, mode='a') as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            if detailed == False:
                model = classify_text_2ways
            else:
                model = classify_text_5ways
            try:
                gemini_response = model(comment_id, comment_text, argument_text, topic, samples)
                classification = json.loads(gemini_response)
                writer.write(classification)

            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {comment_text[:50]}... - Error: {e}")
                error_entry = {"id": comment_id, "label": 3}
                writer.write(error_entry)
                continue

            except Exception as e:
                print(f"An unexpected error occurred for comment: {comment_text[:50]}... - Error: {e}")
                error_entry = {"id": comment_id, "label": 3}
                writer.write(error_entry)
                continue

In [45]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_2ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_one_shot.csv', 1)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification2ways_gemini_1shot.jsonl', 'gay marriage', samples, detailed=False)

['175arg4']
2
1


Processing comments: 100%|██████████| 431/431 [03:16<00:00,  2.20it/s]


In [46]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_2ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv',5)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification2ways_gemini_5shot.jsonl', 'gay marriage', samples, detailed=False)

['198arg5', '161arg4', '175arg4', '108arg2', '5arg5']
1
2
1
4
5
5
4
5


Processing comments: 100%|██████████| 431/431 [03:15<00:00,  2.20it/s]


In [47]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_one_shot.csv',1)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification5ways_gemini_1shot.jsonl', 'gay marriage', samples, detailed=True)

['198arg5']


Processing comments: 100%|██████████| 431/431 [03:15<00:00,  2.20it/s]


In [48]:
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv',5)
process_comments_with_arguments(gm, 'comarg_gm_relation_identification5ways_gemini_5shot.jsonl', 'gay marriage', samples, detailed=True)

['161arg4', '175arg4', '5arg5', '198arg5', '108arg2']


Processing comments: 100%|██████████| 431/431 [03:15<00:00,  2.21it/s]


In [49]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_2ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_one_shot.csv', 1)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification2ways_gemini_1shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=False)

['414721831arg6']
4
5


Processing comments:   0%|          | 0/317 [00:00<?, ?it/s]

Processing comments: 100%|██████████| 317/317 [02:36<00:00,  2.02it/s]


In [50]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_2ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv',5)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification2ways_gemini_5shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=False)

['414721831arg6', '414721757arg6', '414721738arg1', '414721922arg3', '414721727arg3']
1
2
1
4
5
5
5


Processing comments: 100%|██████████| 317/317 [02:37<00:00,  2.01it/s]


In [51]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_one_shot.csv',1)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification5ways_gemini_1shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=True)

['414721727arg3']


Processing comments:   0%|          | 0/317 [00:00<?, ?it/s]

Processing comments: 100%|██████████| 317/317 [02:35<00:00,  2.04it/s]


In [52]:
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_main.csv')
samples = prep_fewshot_samples_5ways('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv',5)
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification5ways_gemini_5shot.jsonl', 'whether "Under God" should appear in the US Pledge of Allegiance', samples, detailed=True)

['414721727arg3', '414721922arg3', '414721738arg1', '414721757arg6', '414721831arg6']


Processing comments: 100%|██████████| 317/317 [02:37<00:00,  2.02it/s]
