In [20]:
from openai import OpenAI
import os
import pandas as pd
from pydantic import BaseModel, Field
from typing import List
from tqdm import tqdm
import json
from random import sample
import jsonlines as jsonl
from openai import OpenAI

import google.generativeai as genai
from google.generativeai.types import RequestOptions
from google.api_core import retry
from google.auth import default, transport
from modelsmith import Forge, VertexAIGenerativeModel
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part
from dotenv import load_dotenv
import vertexai
import typing_extensions as typing

load_dotenv('/Users/guida/llm_argument_tasks/.env')

api_key = os.environ.get('OPENAI_API_KEY')
PROJECT_ID = os.environ.get('GEMINI_PROJECT_ID')
LOCATION = "us-central1"

vertexai.init(
        project="leas-team",
    )

In [21]:
class ArgumentSpan(typing.TypedDict):
    id: str   
    span: str 

In [22]:
# Dictionary for label-to-argument mappings for each topic
topic_label_to_argument = {
    "abortion": {
        "p-right": "Abortion is a woman’s right.",
        "p-rape": "Rape victims need it to be legal.",
        "p-not_human": "A fetus is not a human yet, so it's okay to abort.",
        "p-mother_danger": "Abortion should be allowed when a mother's life is in danger.",
        "p-baby_ill_treatment": "Unwanted babies are ill-treated by parents and/or not always adopted.",
        "p-birth_ctrl": "Birth control fails at times and abortion is one way to deal with it.",
        "p-not_murder": "Abortion is not murder.",
        "p-sick_mom": "Mother is not healthy/financially solvent.",
        "p-other": "Others",
        "c-adopt": "Put baby up for adoption.",
        "c-kill": "Abortion kills a life.",
        "c-baby_right": "An unborn baby is a human and has the right to live.",
        "c-sex": "Be willing to have the baby if you have sex.",
        "c-bad_4_mom": "Abortion is harmful for women.",
        "c-other": "Others"
    },
    "gayRights": {
        "p-normal": "Gay marriage is like any other marriage.",
        "p-right_denied": "Gay people should have the same rights as straight people.",
        "p-no_threat_for_child": "Gay parents can adopt and ensure a happy life for a baby.",
        "p-born": "People are born gay.",
        "p-religion": "Religion should not be used against gay rights.",
        "p-Other": "Others",
        "c-religion": "Religion does not permit gay marriages.",
        "c-abnormal": "Gay marriages are not normal/against nature.",
        "c-threat_to_child": "Gay parents cannot raise kids properly.",
        "c-gay_problems": "Gay people have problems and create social issues.",
        "c-Other": "Others"
    },
    "obama": {
        "p-economy": "Fixed the economy.",
        "p-War": "Ending the wars.",
        "p-republicans": "Better than the republican candidates.",
        "p-decision_policies": "Makes good decisions/policies.",
        "p-quality": "Has qualities of a good leader.",
        "p-health": "Ensured better healthcare.",
        "p-foreign_policies": "Executed effective foreign policies.",
        "p-job": "Created more jobs.",
        "p-Other": "Others",
        "c-economy": "Destroyed our economy.",
        "c-War": "Wars are still on.",
        "c-job": "Unemployment rate is high.",
        "c-health": "Healthcare bill is a failure.",
        "c-decision_policies": "Poor decision-maker.",
        "c-republicans": "We have better republicans than Obama.",
        "c-quality": "Not eligible as a leader.",
        "c-foreign_policies": "Ineffective foreign policies.",
        "c-Other": "Others"
    },
    "marijuana": {
        "p-not_addictive": "Not addictive.",
        "p-medicine": "Used as a medicine for its positive effects.",
        "p-legal": "Legalized marijuana can be controlled and regulated by the government.",
        "p-right": "Prohibition violates human rights.",
        "p-no_damage": "Does not cause any damage to our bodies.",
        "p-Other": "Others",
        "c-health": "Damages our bodies.",
        "c-mind": "Responsible for brain damage.",
        "c-illegal": "If legalized, people will use marijuana and other drugs more.",
        "c-crime": "Causes crime.",
        "c-addiction": "Highly addictive.",
        "c-Other": "Others"
    }
}

In [23]:
def prep_fewshot_samples(samples_file, topic, n):
    df = pd.read_csv(samples_file)

    if n !=5:
        ids = df['uid'].to_list()
        sampled = sample(ids, n)
        #print(sampled)
        df = df[df['uid'].isin(sampled)]
    
    output = ''
    #print(output)
    for i, row in df.iterrows():
        comment = row['text']
        output = f"{output}\n Comment: {comment}\n"
        argument_type = row['label']
        argument = topic_label_to_argument[topic][argument_type]
        output = f"{output} Argument {i}: {argument}\n"
        span = row['line']
        output = f"{output} Span: {span}\n\n"
    return output

In [24]:
def classify_text(id: str, comment_text: str, topic: str, argument_text: str, samples:str) -> dict:
        model = genai.GenerativeModel("gemini-1.5-flash")
        
        safety_settings = {
            "HARM_CATEGORY_HARASSMENT": "block_none",
            "HARM_CATEGORY_HATE_SPEECH": "block_none",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
            "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none"
        }
        prompt = f"""
            Analyze the given comment in relation to a specific argument about {topic}. You need to:
            Identify the relevant span of text where the comment makes use of the given argument. 
            Provide the exact span of the text in the comment that makes use of the argument.
            Only report the exact original fragment of text. Do NOT paraphrase. Do NOT include any additional text.

            Some examples:
            {samples}
            
            The argument to analyze is: {argument_text}
            
            Provide your response in the following JSON format:
            
            {{
                "id": "{id}",
                "span": "the relevant span of text"
            }}
            
            Analyze the following comment in relation to the given argument:
            {comment_text},
        """
        response = model.generate_content(
                        prompt,
                        generation_config=genai.types.GenerationConfig(
                            response_mime_type="application/json",
                            response_schema=ArgumentSpan,
                            temperature=0,
                            top_p=1,
                        ),
                        safety_settings=safety_settings
        )
                    
        return response.text

In [25]:
def process_dataframe_comments(df: pd.DataFrame, topic: str, samples: str, n: int) -> List[dict]:
    label_to_argument = topic_label_to_argument.get(topic, {}) 
    with jsonl.open(f'yru_{topic}_span_identification_gemini_{n}shot.jsonl', mode='a') as writer:
        for idx, row in tqdm(df.iterrows(), desc="Processing comments", unit="comment", total=len(df)):
            comment_id = row['id'] 
            comment_text = row['text']  
            comment_label = row['label']  

            argument_text = label_to_argument.get(comment_label)

            try:
                gpt_response = classify_text(
                    id=comment_id,
                    comment_text=comment_text,
                    topic=topic,
                    argument_text=argument_text,
                    samples=samples
                )
                
                classification = json.loads(gpt_response)
                output_entry = {"id": comment_id, "span": classification["span"]}
                writer.write(output_entry)
            except json.JSONDecodeError as e:
                print(f"JSONDecodeError for comment: {comment_text[:50]}... - Error: {e}")
                error_entry = {"id": comment_id, "span": ""}
                writer.write(error_entry)
                continue

            except Exception as e:
                print(f"An unexpected error occurred for comment: {comment_text[:50]}... - Error: {e}")
                error_entry = {"id": comment_id, "span": ""}
                writer.write(error_entry)
                continue

In [26]:
#topic = 'abortion'
topic = 'marijuana'

n = 5

main_df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_main.csv')

samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_shots.csv', topic, n)
process_dataframe_comments(main_df, topic, samples, n)

Processing comments:   0%|          | 0/686 [00:00<?, ?comment/s]

Processing comments: 100%|██████████| 686/686 [06:25<00:00,  1.78comment/s]


In [27]:
topic = 'abortion'

n = 5

main_df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_main.csv')

samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_shots.csv', topic, n)
process_dataframe_comments(main_df, topic, samples, n)

Processing comments:   0%|          | 0/734 [00:00<?, ?comment/s]

Processing comments:  29%|██▉       | 213/734 [01:58<06:01,  1.44comment/s]

An unexpected error occurred for comment: 1) And yet, half the people here were talking abou... - Error: Unknown field for Candidate: finish_message


Processing comments:  93%|█████████▎| 684/734 [06:11<00:27,  1.81comment/s]

An unexpected error occurred for comment: "I have simply seen no argument that comes close t... - Error: Unknown field for Candidate: finish_message


Processing comments:  93%|█████████▎| 685/734 [06:12<00:26,  1.83comment/s]

An unexpected error occurred for comment: "I have simply seen no argument that comes close t... - Error: Unknown field for Candidate: finish_message


Processing comments: 100%|██████████| 734/734 [06:39<00:00,  1.84comment/s]


In [28]:
#topic = 'abortion'
topic = 'obama'

n = 5

main_df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_main.csv')

samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_shots.csv', topic, n)
process_dataframe_comments(main_df, topic, samples, n)

Processing comments:   0%|          | 0/641 [00:00<?, ?comment/s]

Processing comments:  37%|███▋      | 240/641 [02:17<03:15,  2.06comment/s]

An unexpected error occurred for comment: obama only won because of all the welfare niggas o... - Error: Unknown field for Candidate: finish_message


Processing comments:  86%|████████▌ | 549/641 [05:19<01:04,  1.42comment/s]

An unexpected error occurred for comment: Iâm a Dr Ron Paul kind of guy. I donât vote ba... - Error: Unknown field for Candidate: finish_message


Processing comments:  86%|████████▌ | 550/641 [05:20<00:58,  1.55comment/s]

An unexpected error occurred for comment: Iâm a Dr Ron Paul kind of guy. I donât vote ba... - Error: Unknown field for Candidate: finish_message


Processing comments: 100%|██████████| 641/641 [06:12<00:00,  1.72comment/s]


In [29]:
#topic = 'abortion'
topic = 'gayRights'

n = 5

main_df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_main.csv')

samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_shots.csv', topic, n)
process_dataframe_comments(main_df, topic, samples, n)

Processing comments:   0%|          | 0/767 [00:00<?, ?comment/s]

Processing comments:   6%|▋         | 48/767 [00:26<06:51,  1.75comment/s]

An unexpected error occurred for comment: Naw fags can't get married in England.. they get c... - Error: Unknown field for Candidate: finish_message


Processing comments: 100%|██████████| 767/767 [07:12<00:00,  1.77comment/s]


In [30]:
#topic = 'abortion'
topic = 'marijuana'

n = 1

main_df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_main.csv')

samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_shots.csv', topic, n)
process_dataframe_comments(main_df, topic, samples, n)

Processing comments: 100%|██████████| 686/686 [05:44<00:00,  1.99comment/s]


In [31]:
topic = 'abortion'

n = 1

main_df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_main.csv')

samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_shots.csv', topic, n)
process_dataframe_comments(main_df, topic, samples, n)

Processing comments:   0%|          | 0/734 [00:00<?, ?comment/s]

Processing comments:  21%|██        | 152/734 [01:16<05:04,  1.91comment/s]

An unexpected error occurred for comment: 1. I've yet to see anyone ask a pro-lifer this que... - Error: Unknown field for Candidate: finish_message


Processing comments:  23%|██▎       | 168/734 [01:24<04:43,  2.00comment/s]

An unexpected error occurred for comment: sigh  I want to say yes for the most part. I think... - Error: Unknown field for Candidate: finish_message


Processing comments:  93%|█████████▎| 685/734 [05:51<00:26,  1.83comment/s]

An unexpected error occurred for comment: "I have simply seen no argument that comes close t... - Error: Unknown field for Candidate: finish_message


Processing comments: 100%|██████████| 734/734 [06:17<00:00,  1.94comment/s]


In [32]:
#topic = 'abortion'
topic = 'obama'

n = 1

main_df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_main.csv')

samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_shots.csv', topic, n)
process_dataframe_comments(main_df, topic, samples, n)

Processing comments:   0%|          | 0/641 [00:00<?, ?comment/s]

Processing comments:  37%|███▋      | 240/641 [02:10<04:25,  1.51comment/s]

An unexpected error occurred for comment: obama only won because of all the welfare niggas o... - Error: Unknown field for Candidate: finish_message


Processing comments:  86%|████████▌ | 549/641 [04:56<00:56,  1.63comment/s]

An unexpected error occurred for comment: Iâm a Dr Ron Paul kind of guy. I donât vote ba... - Error: Unknown field for Candidate: finish_message


Processing comments:  86%|████████▌ | 550/641 [04:57<00:52,  1.74comment/s]

An unexpected error occurred for comment: Iâm a Dr Ron Paul kind of guy. I donât vote ba... - Error: Unknown field for Candidate: finish_message


Processing comments: 100%|██████████| 641/641 [05:44<00:00,  1.86comment/s]


In [33]:
#topic = 'abortion'
topic = 'gayRights'

n = 1

main_df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_main.csv')

samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_shots.csv', topic, n)
process_dataframe_comments(main_df, topic, samples, n)

Processing comments:   0%|          | 0/767 [00:00<?, ?comment/s]

Processing comments:  49%|████▉     | 376/767 [03:15<03:10,  2.05comment/s]

An unexpected error occurred for comment: Actually at this rate earth will be overpopulated ... - Error: Unknown field for Candidate: finish_message


Processing comments: 100%|██████████| 767/767 [06:40<00:00,  1.92comment/s]
