In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from huggingface_hub import login
from tqdm import tqdm
import tensorflow as tf
import pandas as pd
import accelerate
import jsonlines
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import os, csv, re
import openai
import inspect
import typing_extensions as typing


from modelsmith import Forge, VertexAIGenerativeModel

from typing import List
from pydantic import ValidationError, BaseModel, Field
import pandas as pd
from pathlib import Path
import jsonlines as jl
import json
import time
from tqdm import tqdm
 

import google.generativeai as genai
from google.generativeai.types import RequestOptions
from google.api_core import retry
from google.auth import default, transport
from modelsmith import Forge, VertexAIGenerativeModel
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part
from dotenv import load_dotenv
import vertexai

# Load secrets/config from a local .env file; this must run before the
# os.environ lookups below so that they can see the loaded values.
# NOTE(review): absolute, user-specific path -- as written the notebook only
# runs on this machine.
load_dotenv('/Users/guida/llm_argument_tasks/.env')

# NOTE(review): api_key, PROJECT_ID and LOCATION are not referenced again in the
# cells visible here, and vertexai.init() below uses a hard-coded project
# instead of PROJECT_ID -- confirm which project is intended.
api_key = os.environ.get('OPENAI_API_KEY')
PROJECT_ID = os.environ.get('GEMINI_PROJECT_ID')
LOCATION = "us-central1"

vertexai.init(
        project="leas-team",
    )

In [2]:
# Structured-output schema handed to Gemini through `response_schema` in the
# classify_* functions of this notebook.
# Note: `typing` here is the `typing_extensions` module (see the import cell).
# Documented with comments rather than a docstring so the class object passed
# to the API is left exactly as it was.
class RelationClassification(typing.TypedDict):
    # Identifier of the classified comment; the prompts ask the model to echo it back.
    id: str 
    # Relation label; the prompts restrict it to 1 (attacks), 3 (no use) or 5 (supports).
    label: int

In [8]:
def classify_text(id: str, comment_text: str, argument_text: str,
                  model_name: str = "gemini-1.5-flash") -> str:
    """Ask Gemini how a comment on gay marriage relates to a given argument.

    Builds a single prompt from the argument and the comment and requests a
    JSON answer constrained by the ``RelationClassification`` schema, with
    ``temperature=0`` and ``top_p=1``.

    Args:
        id: Identifier of the comment. It is interpolated into the JSON
            template of the prompt so the model can echo it back. (The name
            shadows the ``id`` builtin but is kept because callers pass it by
            keyword.)
        comment_text: The comment to classify.
        argument_text: The argument the comment is compared against.
        model_name: Name of the Gemini model to query. Defaults to the model
            that was previously hard-coded.

    Returns:
        The raw response text (``response.text``). It is a JSON *string*, not
        a dict; callers are expected to parse it with ``json.loads``.

    Raises:
        Nothing is caught here: any error raised by the API call or by
        accessing ``response.text`` propagates to the caller.
    """
    model = genai.GenerativeModel(model_name)

    # All four harm categories are set to "block_none".
    safety_settings = {
        "HARM_CATEGORY_HARASSMENT": "block_none",
        "HARM_CATEGORY_HATE_SPEECH": "block_none",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
        "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none"
    }

    # NOTE(review): the opening sentence reads "about gay marriage in relation
    # to gay marriage"; it was probably meant to say "in relation to a specific
    # argument about gay marriage". The JSON template also shows "label" as a
    # quoted string while the response schema declares an int. The prompt text
    # is deliberately left untouched: changing it would change model outputs
    # and break comparability with results that were already generated.
    prompt = f"""
            Analyze the given comment about gay marriage in relation to gay marriage. You need to:
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument.
            - 3 if the comment makes no use of the argument.
            - 5 if the comment supports the argument.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.
            
            The argument to analyze is: {argument_text}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            {comment_text}
            """
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            response_mime_type="application/json",
            response_schema=RelationClassification,
            temperature=0,
            top_p=1,
        ),
        safety_settings=safety_settings
    )

    return response.text

In [9]:
# Load the GM (presumably "gay marriage") dataset; the processing loop below
# reads its 'id', 'comment_text' and 'argument_text' columns.
# NOTE(review): absolute, user-specific path.
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_structured.csv')

def process_comments_with_arguments(df: pd.DataFrame, output_file: str,
                                    classify_fn=None, mode: str = 'a') -> None:
    """Classify every (comment, argument) row of ``df`` and write JSON lines.

    For each row the classifier is called with the row's ``id``,
    ``comment_text`` and ``argument_text``; its JSON reply is parsed and
    written as one record to ``output_file``.

    Args:
        df: Frame with the columns ``id``, ``comment_text`` and
            ``argument_text``.
        output_file: Path of the JSON-lines file to write.
        classify_fn: Callable taking ``id``, ``comment_text`` and
            ``argument_text`` keyword arguments and returning a JSON string.
            When omitted, ``classify_text`` is used (the original behaviour).
        mode: File mode passed to ``jsonlines.open``. Defaults to ``'a'``
            (the original behaviour), so re-running against an existing file
            appends a second copy of every record.

    Notes:
        * Any failure (unparsable reply or any other exception) is reported
          with ``print`` and recorded as ``{"id": ..., "label": 3}``.
          NOTE(review): label 3 is also the genuine "makes no use of the
          argument" class, so failed rows cannot be told apart from real
          predictions in the output file.
        * A later cell of this notebook defines a function with the same
          name; once that cell has run, this definition is shadowed.
    """
    # Resolve the default at call time (not at definition time) so that
    # re-running the cell that defines classify_text is picked up.
    classifier = classify_text if classify_fn is None else classify_fn

    with jsonlines.open(output_file, mode=mode) as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            try:
                gemini_response = classifier(
                    id=comment_id,
                    comment_text=comment_text,
                    argument_text=argument_text
                )
                classification = json.loads(gemini_response)
                writer.write(classification)
            except Exception as e:
                # str() guards against non-string cells (e.g. NaN from an empty
                # CSV field): slicing those would raise TypeError inside this
                # handler and abort the whole run.
                preview = str(comment_text)[:50]
                if isinstance(e, json.JSONDecodeError):
                    print(f"JSONDecodeError for comment: {preview}... - Error: {e}")
                else:
                    print(f"An unexpected error occurred for comment: {preview}... - Error: {e}")
                # Keep one output record per input row even when the call failed.
                writer.write({"id": comment_id, "label": 3})

# Classify every GM row. The output file is opened in append mode, so
# re-running this cell adds a second copy of every record unless the file is
# removed first.
process_comments_with_arguments(gm, 'comarg_gm_relation_identification_3_gemini.jsonl')

Processing comments:   0%|          | 0/1285 [00:00<?, ?it/s]

Processing comments: 100%|██████████| 1285/1285 [10:38<00:00,  2.01it/s]


In [10]:
def classify_text_ugip(id: str, comment_text: str, argument_text: str,
                       model_name: str = "gemini-1.5-flash") -> str:
    """Ask Gemini how a comment relates to an argument about "Under God" in the Pledge.

    Builds a single prompt from the argument and the comment and requests a
    JSON answer constrained by the ``RelationClassification`` schema, with
    ``temperature=0`` and ``top_p=1``.

    Args:
        id: Identifier of the comment. It is interpolated into the JSON
            template of the prompt so the model can echo it back. (The name
            shadows the ``id`` builtin but is kept because callers pass it by
            keyword.)
        comment_text: The comment to classify.
        argument_text: The argument the comment is compared against.
        model_name: Name of the Gemini model to query. Defaults to the model
            that was previously hard-coded.

    Returns:
        The raw response text (``response.text``). It is a JSON *string*, not
        a dict; callers are expected to parse it with ``json.loads``.

    Raises:
        Nothing is caught here: any error raised by the API call or by
        accessing ``response.text`` propagates to the caller.
    """
    model = genai.GenerativeModel(model_name)

    # All four harm categories are set to "block_none".
    safety_settings = {
        "HARM_CATEGORY_HARASSMENT": "block_none",
        "HARM_CATEGORY_HATE_SPEECH": "block_none",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
        "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none"
    }

    # NOTE(review): the JSON template shows "label" as a quoted string while
    # the response schema declares an int. The prompt text is deliberately
    # left untouched: changing it would change model outputs and break
    # comparability with results that were already generated.
    prompt = f"""
            Analyze the given comment in relation to a specific argument about whether "Under God" should appear in the US Pledge of Allegiance.
            Identify if the comment makes use of the given argument. Assign the following labels:
            - 1 if the comment attacks the argument.
            - 3 if the comment makes no use of the argument.
            - 5 if the comment supports the argument.
            Do NOT use any other label.
            Do NOT include the comment or the argument in the response.
            
            The argument to analyze is: {argument_text}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "label": "the label for the use of the argument in the comment"
            }}
            
            Analyze the following comment in relation to the given argument:
            {comment_text}
            """
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            response_mime_type="application/json",
            response_schema=RelationClassification,
            temperature=0,
            top_p=1,
        ),
        safety_settings=safety_settings
    )

    return response.text

In [11]:
# Load the UGIP (presumably "Under God in Pledge") dataset; the processing loop
# below reads its 'id', 'comment_text' and 'argument_text' columns.
# NOTE(review): absolute, user-specific path.
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured.csv')

def process_comments_with_arguments(df: pd.DataFrame, output_file: str,
                                    classify_fn=None, mode: str = 'w') -> None:
    """Classify every (comment, argument) row of ``df`` and write JSON lines.

    For each row the classifier is called with the row's ``id``,
    ``comment_text`` and ``argument_text``; its JSON reply is parsed and
    written as one record to ``output_file``.

    Args:
        df: Frame with the columns ``id``, ``comment_text`` and
            ``argument_text``.
        output_file: Path of the JSON-lines file to write.
        classify_fn: Callable taking ``id``, ``comment_text`` and
            ``argument_text`` keyword arguments and returning a JSON string.
            When omitted, ``classify_text_ugip`` is used (the original
            behaviour).
        mode: File mode passed to ``jsonlines.open``. Defaults to ``'w'``
            (the original behaviour), so an existing file is overwritten.

    Notes:
        * Any failure (unparsable reply or any other exception) is reported
          with ``print`` and recorded as ``{"id": ..., "label": 3}``.
          NOTE(review): label 3 is also the genuine "makes no use of the
          argument" class, so failed rows cannot be told apart from real
          predictions in the output file.
        * An earlier cell of this notebook defines a function with the same
          name (used for the GM data); running this cell replaces it.
    """
    # Resolve the default at call time (not at definition time) so that
    # re-running the cell that defines classify_text_ugip is picked up.
    classifier = classify_text_ugip if classify_fn is None else classify_fn

    with jsonlines.open(output_file, mode=mode) as writer:
        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing comments"):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument_text = row['argument_text']
            try:
                gemini_response = classifier(
                    id=comment_id,
                    comment_text=comment_text,
                    argument_text=argument_text
                )
                classification = json.loads(gemini_response)
                writer.write(classification)
            except Exception as e:
                # str() guards against non-string cells (e.g. NaN from an empty
                # CSV field): slicing those would raise TypeError inside this
                # handler and abort the whole run.
                preview = str(comment_text)[:50]
                if isinstance(e, json.JSONDecodeError):
                    print(f"JSONDecodeError for comment: {preview}... - Error: {e}")
                else:
                    print(f"An unexpected error occurred for comment: {preview}... - Error: {e}")
                # Keep one output record per input row even when the call failed.
                writer.write({"id": comment_id, "label": 3})

# Classify every UGIP row. The output file is opened in write mode, so an
# existing file with this name is overwritten on each run.
process_comments_with_arguments(ugip, 'comarg_ugip_relation_identification_3_gemini.jsonl')

Processing comments:   0%|          | 0/1013 [00:00<?, ?it/s]

Processing comments: 100%|██████████| 1013/1013 [08:51<00:00,  1.91it/s]
