In [18]:

from openai import OpenAI
import os
import pandas as pd
import jsonlines
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from typing import List
from tqdm import tqdm
from together import Together
import json, csv
load_dotenv('/Users/guida/llm_argument_tasks/.env')


import google.generativeai as genai
from google.generativeai.types import RequestOptions
from google.api_core import retry
from google.auth import default, transport
from modelsmith import Forge, VertexAIGenerativeModel
from vertexai.generative_models import GenerationConfig, GenerativeModel, Part
from dotenv import load_dotenv
import vertexai

In [19]:
# Project id comes from the GEMINI_PROJECT_ID environment variable (None when it
# is not set); the location is a fixed string.
PROJECT_ID = os.getenv('GEMINI_PROJECT_ID')
LOCATION = "us-central1"

In [20]:
from openai import OpenAI
import os
import pandas as pd
from pydantic import BaseModel, Field
from typing import List
from tqdm import tqdm
import json
from random import sample
import jsonlines as jsonl
from openai import OpenAI
import typing_extensions as typing


In [21]:
# Shape of the JSON object requested from the model; classify_text passes this
# class as `response_schema`.
class ArgumentClassification(typing.TypedDict):
    id: str  # the prompt asks the model to return the input id here
    label: int  # the prompt asks for 1 (argument present) or 0 (not present)

In [22]:
# Binarises the 1-5 labels read from the few-shot sample files
# (see prep_fewshot_samples_comarg): 3 becomes 0, every other value becomes 1.
label_mapping = {score: int(score != 3) for score in range(1, 6)}

In [23]:
# Dictionary for label-to-argument mappings for each topic.
# Outer keys are topic names; inner keys are argument label codes, mapped to the
# argument sentence that is inserted into the prompt and the few-shot examples.
# The "p-" / "c-" prefixes presumably mean pro / con -- TODO confirm.
# NOTE(review): "abortion" spells its catch-all keys "p-other" / "c-other" while the
# other topics use "p-Other" / "c-Other". Lookups are case-sensitive, so confirm this
# matches the label values that actually occur in each dataset.
topic_label_to_argument = {
    "abortion": {
        "p-right": "Abortion is a woman’s right.",
        "p-rape": "Rape victims need it to be legal.",
        "p-not_human": "A fetus is not a human yet, so it's okay to abort.",
        "p-mother_danger": "Abortion should be allowed when a mother's life is in danger.",
        "p-baby_ill_treatment": "Unwanted babies are ill-treated by parents and/or not always adopted.",
        "p-birth_ctrl": "Birth control fails at times and abortion is one way to deal with it.",
        "p-not_murder": "Abortion is not murder.",
        "p-sick_mom": "Mother is not healthy/financially solvent.",
        "p-other": "Others",
        "c-adopt": "Put baby up for adoption.",
        "c-kill": "Abortion kills a life.",
        "c-baby_right": "An unborn baby is a human and has the right to live.",
        "c-sex": "Be willing to have the baby if you have sex.",
        "c-bad_4_mom": "Abortion is harmful for women.",
        "c-other": "Others"
    },
    "gayRights": {
        "p-normal": "Gay marriage is like any other marriage.",
        "p-right_denied": "Gay people should have the same rights as straight people.",
        "p-no_threat_for_child": "Gay parents can adopt and ensure a happy life for a baby.",
        "p-born": "People are born gay.",
        "p-religion": "Religion should not be used against gay rights.",
        "p-Other": "Others",
        "c-religion": "Religion does not permit gay marriages.",
        "c-abnormal": "Gay marriages are not normal/against nature.",
        "c-threat_to_child": "Gay parents cannot raise kids properly.",
        "c-gay_problems": "Gay people have problems and create social issues.",
        "c-Other": "Others"
    },
    "obama": {
        "p-economy": "Fixed the economy.",
        "p-War": "Ending the wars.",
        "p-republicans": "Better than the republican candidates.",
        "p-decision_policies": "Makes good decisions/policies.",
        "p-quality": "Has qualities of a good leader.",
        "p-health": "Ensured better healthcare.",
        "p-foreign_policies": "Executed effective foreign policies.",
        "p-job": "Created more jobs.",
        "p-Other": "Others",
        "c-economy": "Destroyed our economy.",
        "c-War": "Wars are still on.",
        "c-job": "Unemployment rate is high.",
        "c-health": "Healthcare bill is a failure.",
        "c-decision_policies": "Poor decision-maker.",
        "c-republicans": "We have better republicans than Obama.",
        "c-quality": "Not eligible as a leader.",
        "c-foreign_policies": "Ineffective foreign policies.",
        "c-Other": "Others"
    },
    "marijuana": {
        "p-not_addictive": "Not addictive.",
        "p-medicine": "Used as a medicine for its positive effects.",
        "p-legal": "Legalized marijuana can be controlled and regulated by the government.",
        "p-right": "Prohibition violates human rights.",
        "p-no_damage": "Does not cause any damage to our bodies.",
        "p-Other": "Others",
        "c-health": "Damages our bodies.",
        "c-mind": "Responsible for brain damage.",
        "c-illegal": "If legalized, people will use marijuana and other drugs more.",
        "c-crime": "Causes crime.",
        "c-addiction": "Highly addictive.",
        "c-Other": "Others"
    }
}

In [24]:
def prep_fewshot_samples_comarg(samples_file, topic, n):
    """Build the few-shot example text for the ComArg prompts.

    Reads ``samples_file`` as CSV, randomly draws ``n`` values from its ``id``
    column and renders every row whose id was drawn as one self-contained
    example: the row's own comment, its argument and its binarised label.

    Previously only the comment of the first selected row was printed and all
    selected arguments/labels were listed underneath it, so with several shots
    the arguments of other comments were attributed to the wrong comment.

    Args:
        samples_file: Path of a CSV with ``id``, ``comment_text``,
            ``argument_text`` and ``label`` columns.
        topic: Not used by this function; kept so existing calls keep working.
        n: Number of ids to draw.

    Returns:
        The concatenated examples as one string (empty when ``n`` is 0).

    Raises:
        ValueError: From ``random.sample`` when ``n`` exceeds the number of ids.
        KeyError: When a row's ``label`` is not a key of ``label_mapping``.
    """
    df = pd.read_csv(samples_file)
    sampled = sample(df['id'].to_list(), n)
    print(sampled)
    chosen = df[df['id'].isin(sampled)]

    examples = []
    # Number the examples by position (0-based, like prep_fewshot_samples does for a
    # freshly read CSV) instead of leaking the row's index in the original file.
    for position, (_, row) in enumerate(chosen.iterrows()):
        label = label_mapping[row['label']]
        examples.append(
            f"Comment: {row['comment_text']}\n"
            " The following arguments are present (1) or not present (0) in this comment:\n"
            f" Argument {position}: {row['argument_text']}\n"
            f" Label: {label}\n\n"
        )
    return "".join(examples)

In [25]:
def prep_fewshot_samples(samples_file, topic):
    """Render every row of a few-shot CSV as one prompt example.

    Each row contributes its ``text`` as the comment, the argument sentence looked
    up in ``topic_label_to_argument[topic]`` via the row's ``label`` value, and the
    row's ``present`` value as the label. The argument is numbered with the row's
    DataFrame index.

    Returns:
        All examples joined into a single string ('' when the CSV has no rows).
    """
    chunks = []
    for row_index, shot in pd.read_csv(samples_file).iterrows():
        comment = shot['text']
        argument = topic_label_to_argument[topic][shot['label']]
        present = shot['present']
        chunks.append(
            f"\n Comment: {comment}\n The following arguments are present (1) or not present (0) in this comment:\n"
            f" Argument {row_index}: {argument}\n"
            f" Label: {present}\n\n"
        )
    return ''.join(chunks)

In [26]:
def classify_text(id: str, comment_text: str, topic: str, argument: str, samples: str) -> str:
    """Ask Gemini whether ``argument`` is present in ``comment_text``.

    Args:
        id: Identifier that the prompt asks the model to copy into its answer.
        comment_text: The comment to analyse (it is inserted twice into the prompt).
        topic: Topic description inserted into the first prompt sentence.
        argument: The argument sentence to look for.
        samples: Pre-rendered few-shot examples inserted under "Some examples:".

    Returns:
        ``response.text`` -- the raw text of the model response, not a parsed
        dict. JSON output is requested via ``response_mime_type`` and
        ``response_schema``; callers parse it with ``json.loads``.

    Raises:
        Whatever ``generate_content`` or the ``response.text`` accessor raises;
        nothing is caught here.
    """
    model = genai.GenerativeModel("gemini-1.5-flash")

    # All four harm categories are set to "block_none".
    safety_settings = {
        "HARM_CATEGORY_HARASSMENT": "block_none",
        "HARM_CATEGORY_HATE_SPEECH": "block_none",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
        "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none"
    }

    # The prompt text (including its indentation) is part of the experiment; keep it verbatim.
    prompt = f"""
                Analyze whether the following comment about {topic} contains a specific argument.

                Comment to analyze: {comment_text}
                Argument to check for: {argument}

                Instructions:
                1. Determine if the comment explicitly or implicitly uses the given argument
                2. Assign a binary label:
                - 1 if the argument is present
                - 0 if the argument is not present

                Requirements:
                - Only use 1 or 0 as labels
                - Provide output in valid JSON format
                - Do not repeat or include the input text in the response
                - Focus solely on the presence/absence of the specific argument

                Return your analysis in this exact JSON format:
                {{
                    "id": "{id}",
                    "label": label_value
                }}

                where label_value must be either 1 or 0 (without quotes)
                
                Some examples:

                {samples}

                Analyze the following comment in relation to the given argument:

                {comment_text}
            """
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            response_mime_type="application/json",
            response_schema=ArgumentClassification,
            temperature=0,
            top_p=1,
        ),
        safety_settings=safety_settings
    )

    return response.text

In [27]:
def process_dataframe_comments(df: pd.DataFrame, topic: str, file_name: str, samples: str, n: int,
                               output_file: "str | None" = None):
    """Classify every row of a ComArg frame and write one JSONL record per row.

    Args:
        df: Frame with ``id``, ``comment_text`` and ``argument_text`` columns.
        topic: Topic description passed on to ``classify_text``.
        file_name: Short dataset tag used in the default output file name.
        samples: Pre-rendered few-shot examples passed on to ``classify_text``.
        n: Number of shots; only used in the default output file name.
        output_file: Where to write the JSONL results. When omitted, the
            original hard-coded location is used (note that its directory is
            ``Split1/1-s`` regardless of ``n``).

    Each record is ``{"id": ..., "label": ...}``. When classification or JSON
    parsing fails, the error is printed and the row is recorded with label 0, so
    failures are indistinguishable from negative predictions in the output file.
    """
    if output_file is None:
        output_file = f'/Users/guida/llm_argument_tasks/evaluation-lea/comarg-task1/gemini/Split1/1-s/comarg_{file_name}_identification_gemini_{n}shot_split_1.jsonl'

    with jsonl.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), desc="Processing comments", unit="comment", total=len(df)):
            comment_id = row['id']
            comment_text = row['comment_text']
            argument = row['argument_text']
            try:
                raw = classify_text(
                    id=comment_id,
                    comment_text=comment_text,
                    topic=topic,
                    argument=argument,
                    samples=samples
                )
                label = json.loads(raw)["label"]
            except json.JSONDecodeError as e:
                # str() keeps the handler itself from failing on a non-string comment (e.g. NaN).
                print(f"JSONDecodeError for comment: {str(comment_text)[:50]}... - Error: {e}")
                label = 0
            except Exception as e:
                print(f"An unexpected error occurred for comment: {str(comment_text)[:50]}... - Error: {e}")
                label = 0
            writer.write({"id": comment_id, "label": label})

In [29]:
# GM (gay marriage) ComArg data, 1-shot: draw one example from the one-shot CSV,
# then classify every row of the main file and write the results as JSONL.
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments_main.csv')
topic = 'gay marriage'
file_name = 'gm'
n = 1
samples = prep_fewshot_samples_comarg('/Users/guida/llm_argument_tasks/clean_data/GM_structured_one_shot.csv', topic, n)
process_dataframe_comments(gm, topic, file_name, samples, n)

['108arg2']


Processing comments:   0%|          | 0/1379 [00:00<?, ?comment/s]

Processing comments: 100%|██████████| 1379/1379 [15:43<00:00,  1.46comment/s]


In [None]:
# GM (gay marriage) ComArg data, 5-shot: same pipeline as the 1-shot cell but
# drawing five examples from the multi-shot CSV.
gm = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/GM_all_arguments_main.csv')
topic = 'gay marriage'
file_name = 'gm'
n = 5
samples = prep_fewshot_samples_comarg('/Users/guida/llm_argument_tasks/clean_data/GM_structured_shots.csv', topic, n)

process_dataframe_comments(gm, topic, file_name, samples, n)

['198arg5', '5arg5', '108arg2', '161arg4', '175arg4']


Processing comments: 100%|██████████| 1379/1379 [13:48<00:00,  1.66comment/s]


## UGIP


In [None]:
# UGIP ("Under God" in the Pledge) ComArg data, 5-shot: draw five examples from
# the multi-shot CSV, then classify every row of the main file.
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments_main.csv')
topic = 'whether "Under God" should appear in the US Pledge of Allegiance'
file_name = 'ugip'
n = 5
samples = prep_fewshot_samples_comarg('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_shots.csv', topic, n)

process_dataframe_comments(ugip, topic, file_name, samples, n)

['414721738arg1', '414721922arg3', '414721831arg6', '414721727arg3', '414721757arg6']


Processing comments:   0%|          | 0/2094 [00:00<?, ?comment/s]

Processing comments: 100%|██████████| 2094/2094 [22:06<00:00,  1.58comment/s]


In [None]:
# UGIP ("Under God" in the Pledge) ComArg data, 1-shot: draw one example from the
# one-shot CSV, then classify every row of the main file.
ugip = pd.read_csv('/Users/guida/llm_argument_tasks/clean_data/UGIP_all_arguments_main.csv')
topic = 'whether "Under God" should appear in the US Pledge of Allegiance'
file_name = 'ugip'
n = 1
samples = prep_fewshot_samples_comarg('/Users/guida/llm_argument_tasks/clean_data/UGIP_structured_one_shot.csv', topic, n)

process_dataframe_comments(ugip, topic, file_name, samples, n)

['414721757arg6']


Processing comments:   0%|          | 0/2094 [00:00<?, ?comment/s]

Processing comments: 100%|██████████| 2094/2094 [22:27<00:00,  1.55comment/s]


In [None]:
# NOTE: this definition repeats the classify_text defined earlier in this notebook.
def classify_text(id: str, comment_text: str, topic: str, argument: str, samples: str) -> str:
    """Ask Gemini whether ``argument`` is present in ``comment_text``.

    Args:
        id: Identifier that the prompt asks the model to copy into its answer.
        comment_text: The comment to analyse (it is inserted twice into the prompt).
        topic: Topic description inserted into the first prompt sentence.
        argument: The argument sentence to look for.
        samples: Pre-rendered few-shot examples inserted under "Some examples:".

    Returns:
        ``response.text`` -- the raw text of the model response, not a parsed
        dict. JSON output is requested via ``response_mime_type`` and
        ``response_schema``; callers parse it with ``json.loads``.

    Raises:
        Whatever ``generate_content`` or the ``response.text`` accessor raises;
        nothing is caught here.
    """
    model = genai.GenerativeModel("gemini-1.5-flash")

    # All four harm categories are set to "block_none".
    safety_settings = {
        "HARM_CATEGORY_HARASSMENT": "block_none",
        "HARM_CATEGORY_HATE_SPEECH": "block_none",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT": "block_none",
        "HARM_CATEGORY_DANGEROUS_CONTENT": "block_none"
    }

    # The prompt text (including its indentation) is part of the experiment; keep it verbatim.
    prompt = f"""
                Analyze whether the following comment about {topic} contains a specific argument.

                Comment to analyze: {comment_text}
                Argument to check for: {argument}

                Instructions:
                1. Determine if the comment explicitly or implicitly uses the given argument
                2. Assign a binary label:
                - 1 if the argument is present
                - 0 if the argument is not present

                Requirements:
                - Only use 1 or 0 as labels
                - Provide output in valid JSON format
                - Do not repeat or include the input text in the response
                - Focus solely on the presence/absence of the specific argument

                Return your analysis in this exact JSON format:
                {{
                    "id": "{id}",
                    "label": label_value
                }}

                where label_value must be either 1 or 0 (without quotes)
                
                Some examples:

                {samples}

                Analyze the following comment in relation to the given argument:

                {comment_text}
            """
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            response_mime_type="application/json",
            response_schema=ArgumentClassification,
            temperature=0,
            top_p=1,
        ),
        safety_settings=safety_settings
    )

    return response.text

In [None]:
def process_dataframe_comments(df: pd.DataFrame, topic: str, samples: str, n: int,
                               output_file: "str | None" = None):
    """Classify every row of a YRU frame and write one JSONL record per row.

    Args:
        df: Frame with ``uid``, ``text`` and ``label`` columns; ``label`` is the
            argument code looked up in ``topic_label_to_argument[topic]``.
        topic: Key into ``topic_label_to_argument``; also passed to the prompt.
        samples: Pre-rendered few-shot examples passed on to ``classify_text``.
        n: Number of shots; only used in the default output file name.
        output_file: Where to write the JSONL results. When omitted, the
            original hard-coded location is used (its directory is ``shot_1``
            regardless of ``n``).

    Each record is ``{"id": ..., "label": ...}``. Any failure for a row is
    printed and recorded as label 0. That includes a row whose argument code has
    no entry in the table: previously such rows were sent to the model with the
    literal text "None" as the argument to check for.
    """
    label_to_argument = topic_label_to_argument.get(topic, {})
    if output_file is None:
        output_file = f'/Users/guida/llm_argument_tasks/run_all_k_shots/task1/shot_1/yru_{topic}_identification_gemini_{n}shot.jsonl'

    with jsonl.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), desc="Processing comments", unit="comment", total=len(df)):
            comment_id = row['uid']
            comment_text = row['text']
            comment_label = row['label']
            try:
                argument_text = label_to_argument.get(comment_label)
                if argument_text is None:
                    # Without an argument sentence there is nothing meaningful to ask the model.
                    raise ValueError(f"no argument text for label {comment_label!r} in topic {topic!r}")
                raw = classify_text(
                    id=comment_id,
                    comment_text=comment_text,
                    topic=topic,
                    argument=argument_text,
                    samples=samples
                )
                label = json.loads(raw)["label"]
            except json.JSONDecodeError as e:
                # str() keeps the handler itself from failing on a non-string comment (e.g. NaN).
                print(f"JSONDecodeError for comment: {str(comment_text)[:50]}... - Error: {e}")
                label = 0
            except Exception as e:
                print(f"An unexpected error occurred for comment: {str(comment_text)[:50]}... - Error: {e}")
                label = 0
            writer.write({"id": comment_id, "label": label})

In [27]:
# YRU "abortion": for 1 and 5 shots, load the main CSV, render the examples from
# the matching n-shot CSV and classify every row.
topic = 'abortion'

for n in [1, 5]:

    df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_with_negatives_main.csv')
    
    samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_with_negatives_{n}shot.csv', topic)
    process_dataframe_comments(df, topic, samples, n)

Processing comments:   0%|          | 0/6685 [00:00<?, ?comment/s]

Processing comments: 100%|██████████| 6685/6685 [1:10:03<00:00,  1.59comment/s]
Processing comments: 100%|██████████| 6685/6685 [1:15:29<00:00,  1.48comment/s]


## Marijuana

In [28]:
# YRU "marijuana": for 1 and 5 shots, load the main CSV, render the examples from
# the matching n-shot CSV and classify every row.
topic = 'marijuana'

for n in [1, 5]:

    df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_with_negatives_main.csv')
    
    samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_with_negatives_{n}shot.csv', topic)

    process_dataframe_comments(df, topic, samples, n)

Processing comments:   0%|          | 2/5011 [00:01<56:26,  1.48comment/s]  

Processing comments: 100%|██████████| 5011/5011 [56:25<00:00,  1.48comment/s]  
Processing comments: 100%|██████████| 5011/5011 [1:01:11<00:00,  1.36comment/s]


## Obama

In [29]:
# YRU "obama": for 1 and 5 shots, load the main CSV, render the examples from
# the matching n-shot CSV and classify every row.
topic = 'obama'

for n in [1, 5]:

    df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_with_negatives_main.csv')
    
    samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_with_negatives_{n}shot.csv', topic)

    process_dataframe_comments(df, topic, samples, n)

Processing comments:   0%|          | 0/7915 [00:00<?, ?comment/s]

Processing comments: 100%|██████████| 7915/7915 [1:27:39<00:00,  1.50comment/s]
Processing comments: 100%|██████████| 7915/7915 [1:30:56<00:00,  1.45comment/s]  


## Gay Rights

In [30]:
# YRU "gayRights": for 1 and 5 shots, load the main CSV, render the examples from
# the matching n-shot CSV and classify every row.
topic = 'gayRights'

for n in [1, 5]:

    df = pd.read_csv(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_with_negatives_main.csv')
    
    samples = prep_fewshot_samples(f'/Users/guida/llm_argument_tasks/clean_data/yru_{topic}_with_negatives_{n}shot.csv', topic)

    process_dataframe_comments(df, topic, samples, n)

Processing comments:   0%|          | 0/5847 [00:00<?, ?comment/s]

Processing comments: 100%|██████████| 5847/5847 [1:03:48<00:00,  1.53comment/s]
Processing comments: 100%|██████████| 5847/5847 [1:05:02<00:00,  1.50comment/s]


## Run remaining splits

In [35]:
def check_existing_output(output_file):
    """Return True when ``output_file`` exists and reads as non-empty JSONL.

    The whole file is read so that a decode error anywhere marks it as unusable.
    This cannot tell a finished run from an interrupted one that left a
    well-formed partial file; it only rejects missing, empty or unreadable files.

    Args:
        output_file: Path of the JSONL results file.

    Returns:
        bool: False when the file is missing, unreadable or contains no entries.
    """
    if not os.path.exists(output_file):
        return False
    try:
        with jsonl.open(output_file) as reader:
            # Count instead of building a list: only the number of entries matters.
            entry_count = sum(1 for _ in reader)
    except Exception:
        # Not a bare `except:` so that KeyboardInterrupt / SystemExit still propagate.
        return False
    return entry_count > 0

def create_split_directories(base_output_path, n_splits):
    """Ensure ``Split2`` .. ``Split<n_splits>`` exist under ``base_output_path``.

    Existing directories are left alone. Returns ``base_output_path`` unchanged.
    """
    wanted = (os.path.join(base_output_path, f'Split{number}') for number in range(2, n_splits + 1))
    for directory in wanted:
        os.makedirs(directory, exist_ok=True)
    return base_output_path

def run_splits_for_dataset(dataset_type, topic, base_data_path, splits_path, base_output_path, n_splits, n_shots):
    """Run the classification for splits 2..``n_splits`` and every shot count.

    Output files that ``check_existing_output`` already accepts are skipped.

    Args:
        dataset_type: ``'comarg'`` selects the ComArg pipeline; any other value
            selects the YRU pipeline.
        topic: For ComArg, ``'gm'`` (case-insensitive) selects the GM files and
            anything else the UGIP files. For YRU, the topic name used in the
            file names and in ``topic_label_to_argument``.
        base_data_path: Directory holding the ``*_main.csv`` files.
        splits_path: Directory holding the per-split few-shot CSV files.
        base_output_path: Directory under which ``Split<k>`` folders are created.
        n_splits: Highest split number to run (splits start at 2).
        n_shots: Iterable of shot counts.
    """
    splits = range(2, n_splits + 1)

    # Create the Split<k> folders here instead of calling create_split_directories:
    # that name is defined more than once in this notebook with different behaviour
    # (one variant creates a single folder only), and a missing folder makes the
    # JSONL writer fail when it opens the output file.
    for split in splits:
        os.makedirs(os.path.join(base_output_path, f'Split{split}'), exist_ok=True)

    for n_shot in n_shots:
        for split in splits:
            output_dir = os.path.join(base_output_path, f'Split{split}')
            is_comarg = dataset_type == 'comarg'

            # Work out the input/output file names for this dataset.
            if is_comarg:
                if topic.lower() == 'gm':
                    prefix, file_name, topic_name = 'GM', 'gm', 'gay marriage'
                else:  # every other topic is treated as UGIP
                    prefix, file_name = 'UGIP', 'ugip'
                    topic_name = 'whether "Under God" should appear in the US Pledge of Allegiance'
                base_file = f'{prefix}_all_arguments_main.csv'
                split_file = f'{prefix}_all_arguments_main_{n_shot}shot_split_{split}.csv'
                output_file = os.path.join(output_dir, f'comarg_{file_name}_identification_gemini_{n_shot}shot_split_{split}.jsonl')
            else:  # YRU datasets
                base_file = f'yru_{topic}_with_negatives_main.csv'
                split_file = f'yru_{topic}_with_negatives_main_{n_shot}shot_split_{split}.csv'
                output_file = os.path.join(output_dir, f'yru_{topic}_identification_gemini_{n_shot}shot_split_{split}.jsonl')

            # Skip work that has already been done.
            if check_existing_output(output_file):
                print(f"Skipping existing file: {output_file}")
                continue

            print(f"Processing file: {output_file}")
            df = pd.read_csv(os.path.join(base_data_path, base_file))
            samples_path = os.path.join(splits_path, split_file)

            if is_comarg:
                samples = prep_fewshot_samples_comarg(samples_path, topic_name, n_shot)
                process_dataframe_comments(df, topic_name, file_name, samples, n_shot, output_file)
            else:
                samples = prep_fewshot_samples(samples_path, topic)
                process_yru_dataframe_comments(df, topic, samples, n_shot, output_file)

def process_dataframe_comments(df, topic, file_name, samples, n_shot, output_file):
    """Classify each ComArg row and write ``{"id", "label"}`` records to ``output_file``.

    Rows are read from the ``id``, ``comment_text`` and ``argument_text`` columns.
    Any exception raised while classifying, parsing or writing a row is printed
    and the row is recorded with label 0. ``file_name`` and ``n_shot`` are
    accepted but not used in the body.
    """
    with jsonl.open(output_file, mode='w') as writer:
        rows = tqdm(df.iterrows(), desc="Processing comments", unit="comment", total=len(df))
        for _, record in rows:
            comment_id, comment_text, argument = record['id'], record['comment_text'], record['argument_text']

            try:
                raw_response = classify_text(
                    id=comment_id,
                    comment_text=comment_text,
                    topic=topic,
                    argument=argument,
                    samples=samples
                )
                parsed = json.loads(raw_response)
                writer.write({"id": comment_id, "label": parsed["label"]})
            except Exception as e:  # json.JSONDecodeError is a subclass of Exception
                print(f"Error processing comment {comment_id}: {str(e)}")
                writer.write({"id": comment_id, "label": 0})
                
def process_yru_dataframe_comments(df, topic, samples, n_shot, output_file):
    """Classify each YRU row and write ``{"id", "label"}`` records to ``output_file``.

    Args:
        df: Frame with ``uid``, ``text`` and ``label`` columns; ``label`` is the
            argument code looked up in ``topic_label_to_argument[topic]``.
        topic: Key into ``topic_label_to_argument``; also passed to the prompt.
        samples: Pre-rendered few-shot examples passed on to ``classify_text``.
        n_shot: Accepted but not used in the body.
        output_file: Path of the JSONL file to (over)write.

    Any failure for a row is printed and recorded as label 0. That includes a
    row whose argument code has no entry in the table: previously such rows were
    sent to the model with the literal text "None" as the argument to check for.
    """
    label_to_argument = topic_label_to_argument.get(topic, {})
    with jsonl.open(output_file, mode='w') as writer:
        for _, row in tqdm(df.iterrows(), desc="Processing comments", unit="comment", total=len(df)):
            comment_id = row['uid']
            comment_text = row['text']
            comment_label = row['label']

            try:
                argument_text = label_to_argument.get(comment_label)
                if argument_text is None:
                    # Without an argument sentence there is nothing meaningful to ask the model.
                    raise ValueError(f"no argument text for label {comment_label!r} in topic {topic!r}")
                raw = classify_text(
                    id=comment_id,
                    comment_text=comment_text,
                    topic=topic,
                    argument=argument_text,
                    samples=samples
                )
                label = json.loads(raw)["label"]
            except Exception as e:  # json.JSONDecodeError is a subclass of Exception
                print(f"Error processing comment {comment_id}: {str(e)}")
                label = 0
            writer.write({"id": comment_id, "label": label})
                
def check_existing_output(file_path):
    """Return True when ``file_path`` is an existing, non-empty regular file.

    The results writer creates the file as soon as it is opened, so an empty
    file means a run that produced nothing and must not be treated as done.
    This is still only a cheap check: a non-empty file left behind by an
    interrupted run is reported as existing.
    """
    try:
        return os.path.isfile(file_path) and os.path.getsize(file_path) > 0
    except OSError:
        # The file disappeared or became unreadable between the two checks.
        return False

def create_split_directories(base_output_path, n_splits):
    """Ensure the single folder ``Split<n_splits>`` exists and return its path.

    Despite its name, ``n_splits`` is used here as the number of one split.
    """
    target = os.path.join(base_output_path, 'Split{}'.format(n_splits))
    os.makedirs(target, exist_ok=True)
    return target

def run_specific_split_for_yru(topic, n_shot, base_data_path, splits_path, base_output_path, split=2):
    """Run one YRU topic / shot count for a single split.

    Args:
        topic: YRU topic name used in the file names and for the argument table.
        n_shot: Shot count used in the file names.
        base_data_path: Directory holding ``yru_<topic>_with_negatives_main.csv``.
        splits_path: Directory holding the per-split few-shot CSV files.
        base_output_path: Directory under which ``Split<split>`` is created.
        split: Split number to run; defaults to 2, the value that used to be
            hard-coded.

    Nothing is done when ``check_existing_output`` accepts the output file.
    """
    # Build the folder here instead of relying on create_split_directories' return
    # value: that name has several definitions in this notebook and some of them
    # return the base path rather than the split folder.
    output_dir = os.path.join(base_output_path, f'Split{split}')
    os.makedirs(output_dir, exist_ok=True)

    output_file = os.path.join(output_dir, f'yru_{topic}_identification_gemini_{n_shot}shot_split_{split}.jsonl')

    # Skip work that has already been done.
    if check_existing_output(output_file):
        print(f"Skipping existing file: {output_file}")
        return

    print(f"Processing file: {output_file}")
    base_file = f'yru_{topic}_with_negatives_main.csv'
    df = pd.read_csv(os.path.join(base_data_path, base_file))

    # Few-shot examples for this split.
    split_file = f'yru_{topic}_with_negatives_main_{n_shot}shot_split_{split}.csv'
    samples = prep_fewshot_samples(
        os.path.join(splits_path, split_file),
        topic
    )

    # Classify every row and save the results.
    process_yru_dataframe_comments(df, topic, samples, n_shot, output_file)

# Run the specific split for 'gayRights' and '1shot' on Split2
# Input CSVs, per-split few-shot CSVs and the results root, in that order.
base_data_path = '/Users/guida/llm_argument_tasks/clean_data'
splits_path = '/Users/guida/llm_argument_tasks/run_all_k_shots/k-shots'
base_output_path = '/Users/guida/llm_argument_tasks/results_T1_gemini_splits'

# Call the function to process this specific split
run_specific_split_for_yru('gayRights', 1, base_data_path, splits_path, base_output_path)
                
# The triple-quoted string below is disabled code (the full multi-split run); it is
# never executed. As the last expression of the cell it is echoed as the cell output.
"""base_data_path = '/Users/guida/llm_argument_tasks/clean_data'
splits_path = '/Users/guida/llm_argument_tasks/run_all_k_shots/k-shots'
base_output_path = '/Users/guida/llm_argument_tasks/results_T1_gemini_splits'

# Process ComArg datasets
run_splits_for_dataset('comarg', 'gm', base_data_path, splits_path, base_output_path, 4, [1, 5])
run_splits_for_dataset('comarg', 'ugip', base_data_path, splits_path, base_output_path, 4, [1, 5])

# Process YRU datasets
for topic in ['abortion', 'marijuana', 'obama', 'gayRights']:
    run_splits_for_dataset('yru', topic, base_data_path, splits_path, base_output_path, 4, [1, 5])"""

Processing file: /Users/guida/llm_argument_tasks/results_T1_gemini_splits/Split2/yru_gayRights_identification_gemini_1shot_split_2.jsonl


Processing comments: 100%|██████████| 5847/5847 [1:05:28<00:00,  1.49comment/s]


"base_data_path = '/Users/guida/llm_argument_tasks/clean_data'\nsplits_path = '/Users/guida/llm_argument_tasks/run_all_k_shots/k-shots'\nbase_output_path = '/Users/guida/llm_argument_tasks/results_T1_gemini_splits'\n\n# Process ComArg datasets\nrun_splits_for_dataset('comarg', 'gm', base_data_path, splits_path, base_output_path, 4, [1, 5])\nrun_splits_for_dataset('comarg', 'ugip', base_data_path, splits_path, base_output_path, 4, [1, 5])\n\n# Process YRU datasets\nfor topic in ['abortion', 'marijuana', 'obama', 'gayRights']:\n    run_splits_for_dataset('yru', topic, base_data_path, splits_path, base_output_path, 4, [1, 5])"

In [36]:
def run_specific_comarg_splits(dataset_type, topics, base_data_path, splits_path, output_path, n_shot, split=1):
    """
    Run one split for the given ComArg topics.

    Args:
        dataset_type (str): Not used by this function; kept so existing calls
            keep working.
        topics (list): Topics to process. 'gm' (case-insensitive) selects the GM
            files; any other value is treated as UGIP.
        base_data_path (str): Path to base data directory
        splits_path (str): Path to splits directory
        output_path (str): Directory for the output files; created if missing.
        n_shot (int): Number of shots to use
        split (int): Split number used in the file names; defaults to 1, the
            value that used to be hard-coded.

    Topics whose output file is accepted by ``check_existing_output`` are skipped.
    """
    # The JSONL writer cannot create missing parent folders, so make sure it exists.
    os.makedirs(output_path, exist_ok=True)

    for topic in topics:
        # Per-topic file prefix, short tag and the topic wording used in the prompt.
        if topic.lower() == 'gm':
            prefix, file_name, topic_name = 'GM', 'gm', 'gay marriage'
        else:  # UGIP
            prefix, file_name = 'UGIP', 'ugip'
            topic_name = 'whether "Under God" should appear in the US Pledge of Allegiance'

        base_file = f'{prefix}_all_arguments_main.csv'
        split_file = f'{prefix}_all_arguments_main_{n_shot}shot_split_{split}.csv'
        output_file = os.path.join(output_path, f'comarg_{file_name}_identification_gemini_{n_shot}shot_split_{split}.jsonl')

        # Skip work that has already been done.
        if check_existing_output(output_file):
            print(f"Skipping existing file: {output_file}")
            continue

        print(f"Processing file: {output_file}")

        df = pd.read_csv(os.path.join(base_data_path, base_file))

        # Few-shot examples for this split.
        samples = prep_fewshot_samples_comarg(
            os.path.join(splits_path, split_file),
            topic_name,
            n_shot
        )

        process_dataframe_comments(df, topic_name, file_name, samples, n_shot, output_file)

# 5-shot run of split 1 for both ComArg topics; results go to the Split1/5-s folder.
base_data_path = '/Users/guida/llm_argument_tasks/clean_data'
splits_path = '/Users/guida/llm_argument_tasks/run_all_k_shots/k-shots'
output_path = '/Users/guida/llm_argument_tasks/evaluation-lea/comarg-task1/gemini/Split1/5-s'

topics_to_run = ['gm', 'ugip']
run_specific_comarg_splits('comarg', topics_to_run, base_data_path, splits_path, output_path, n_shot=5)

Processing file: /Users/guida/llm_argument_tasks/evaluation-lea/comarg-task1/gemini/Split1/5-s/comarg_gm_identification_gemini_5shot_split_1.jsonl
['198arg5', '175arg4', '161arg4', '5arg5', '108arg2']


Processing comments: 100%|██████████| 1379/1379 [15:02<00:00,  1.53comment/s]


Processing file: /Users/guida/llm_argument_tasks/evaluation-lea/comarg-task1/gemini/Split1/5-s/comarg_ugip_identification_gemini_5shot_split_1.jsonl
['414721738arg1', '414721831arg6', '414721757arg6', '414721922arg3', '414721727arg3']


Processing comments:  37%|███▋      | 771/2094 [08:56<43:58,  1.99s/comment]  

Error processing comment 414721771arg3: 500 Internal error encountered.


Processing comments:  37%|███▋      | 772/2094 [08:56<35:42,  1.62s/comment]

Error processing comment 414721771arg4: 500 Internal error encountered.


Processing comments: 100%|██████████| 2094/2094 [23:41<00:00,  1.47comment/s]


In [33]:
def create_split_directories(base_output_path, n_splits):
    """Ensure ``<base_output_path>/Split5`` exists and return ``base_output_path``.

    ``n_splits`` is accepted but ignored; the folder name is fixed to ``Split5``.
    """
    split5_path = os.path.join(base_output_path, 'Split5')
    os.makedirs(split5_path, exist_ok=True)
    return base_output_path

def run_splits_for_dataset(dataset_type, topic, base_data_path, splits_path, base_output_path, n_shots, split=5):
    """
    Run the identification task for one dataset/topic on a single split.

    For each shot count in ``n_shots`` an output ``.jsonl`` path is built under
    ``<base_output_path>/Split<split>``. A configuration is skipped when
    ``check_existing_output`` reports its output file as present; otherwise the
    data CSV and the few-shot split CSV are loaded and handed to the
    dataset-specific processing function.

    Args:
        dataset_type: ``'comarg'`` selects the ComArg branch; any other value
            is handled as a YRU dataset.
        topic: For ComArg, ``'gm'`` (case-insensitive) selects the GM files and
            any other value the UGIP files. For YRU, the string is interpolated
            into the file names as-is.
        base_data_path: Directory containing the data CSV files.
        splits_path: Directory containing the few-shot split CSV files.
        base_output_path: Directory under which ``Split<split>`` is created.
        n_shots: Iterable of shot counts to run.
        split: Split number to run. Defaults to 5, the value that was
            previously hard-coded, so existing calls behave as before.
    """
    # The split directory does not depend on n_shot, so create and resolve it
    # once instead of on every loop iteration.
    create_split_directories(base_output_path, split)
    output_dir = os.path.join(base_output_path, f'Split{split}')

    for n_shot in n_shots:
        if dataset_type == 'comarg':
            if topic.lower() == 'gm':
                base_file = 'GM_all_arguments.csv'
                split_file = f'GM_all_arguments_main_{n_shot}shot_split_{split}.csv'
                file_name = 'gm'
                topic_name = 'gay marriage'
            else:  # UGIP
                base_file = 'UGIP_all_arguments_main.csv'
                split_file = f'UGIP_all_arguments_main_{n_shot}shot_split_{split}.csv'
                file_name = 'ugip'
                topic_name = 'whether "Under God" should appear in the US Pledge of Allegiance'

            output_file = os.path.join(output_dir, f'comarg_{file_name}_identification_gemini_{n_shot}shot_split_{split}.jsonl')

            # Skip configurations whose output file is already present.
            if check_existing_output(output_file):
                print(f"Skipping existing file: {output_file}")
                continue

            # Read the data
            df = pd.read_csv(os.path.join(base_data_path, base_file))

            # Get the few-shot samples for this split
            samples = prep_fewshot_samples_comarg(
                os.path.join(splits_path, split_file),
                topic_name,
                n_shot
            )

            process_dataframe_comments(df, topic_name, file_name, samples, n_shot, output_file)

        else:  # YRU datasets
            base_file = f'yru_{topic}_with_negatives_main.csv'
            split_file = f'yru_{topic}_with_negatives_main_{n_shot}shot_split_{split}.csv'
            output_file = os.path.join(output_dir, f'yru_{topic}_identification_gemini_{n_shot}shot_split_{split}.jsonl')

            # Skip configurations whose output file is already present.
            if check_existing_output(output_file):
                print(f"Skipping existing file: {output_file}")
                continue

            print(f"Processing file: {output_file}")
            # Read the data
            df = pd.read_csv(os.path.join(base_data_path, base_file))

            # Get the few-shot samples for this split
            samples = prep_fewshot_samples(
                os.path.join(splits_path, split_file),
                topic
            )

            process_yru_dataframe_comments(df, topic, samples, n_shot, output_file)

def check_existing_output(file_path):
    """Report whether ``file_path`` already holds output from an earlier run.

    The path must exist and be non-empty. A zero-byte file (e.g. one that was
    created by a run that stopped before writing anything) is treated as
    missing so that the configuration is processed again instead of being
    skipped.

    Note that the record count is not verified: a partially written,
    non-empty file is still reported as existing.

    Args:
        file_path: Path of the output file to look for.

    Returns:
        True if the path exists and has a size greater than zero, else False.
    """
    return os.path.exists(file_path) and os.path.getsize(file_path) > 0

def create_split_directories(base_output_path, n_splits):
    """Make sure the ``Split<n_splits>`` folder exists under ``base_output_path``.

    Returns the path of that folder.
    """
    folder_name = 'Split{}'.format(n_splits)
    target_dir = os.path.join(base_output_path, folder_name)
    # exist_ok keeps repeated calls from failing when the folder is already there.
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

def run_specific_split_for_yru(topic, n_shot, base_data_path, splits_path, base_output_path, split=2):
    """Run the YRU identification task for one topic, shot count and split.

    The output file is
    ``<split dir>/yru_<topic>_identification_gemini_<n_shot>shot_split_<split>.jsonl``,
    where the split directory is whatever ``create_split_directories`` returns.
    If ``check_existing_output`` reports that file as present, nothing is run.

    Args:
        topic: YRU topic string, interpolated into the data, split and output
            file names (e.g. ``'gayRights'``).
        n_shot: Shot count, interpolated into the split and output file names.
        base_data_path: Directory containing ``yru_<topic>_with_negatives_main.csv``.
        splits_path: Directory containing the few-shot split CSV files.
        base_output_path: Directory under which the split folder is created.
        split: Split number to run. Defaults to 2, the value that was
            previously hard-coded, so existing calls behave as before.
    """
    output_dir = create_split_directories(base_output_path, split)

    output_file = os.path.join(output_dir, f'yru_{topic}_identification_gemini_{n_shot}shot_split_{split}.jsonl')

    # Skip the run when the output file is already present.
    if check_existing_output(output_file):
        print(f"Skipping existing file: {output_file}")
        return

    print(f"Processing file: {output_file}")
    # Read the data
    base_file = f'yru_{topic}_with_negatives_main.csv'
    df = pd.read_csv(os.path.join(base_data_path, base_file))

    # Get the few-shot samples for this split
    split_file = f'yru_{topic}_with_negatives_main_{n_shot}shot_split_{split}.csv'
    samples = prep_fewshot_samples(
        os.path.join(splits_path, split_file),
        topic
    )

    # Process the dataframe comments and save the results
    process_yru_dataframe_comments(df, topic, samples, n_shot, output_file)

# Run the specific split for 'gayRights' and '1shot' on Split2
# NOTE(review): the output recorded for this cell ends in
# "NameError: name 'process_yru_dataframe_comments' is not defined", so the
# cell that defines that function has to be executed before this one.
base_data_path = '/Users/guida/llm_argument_tasks/clean_data'
splits_path = '/Users/guida/llm_argument_tasks/run_all_k_shots/k-shots'
# NOTE(review): the directory name says "llama" while the output file names
# built by the functions in this cell say "gemini" -- confirm the destination.
base_output_path = '/Users/guida/llm_argument_tasks/results_T1_llama_splits'

# Call the function to process this specific split
run_specific_split_for_yru('gayRights', 1, base_data_path, splits_path, base_output_path)

# Run only on Split5
# The block below is disabled: it is wrapped in a string literal, so none of
# it executes. The literal opens with four quote characters, i.e. its text
# starts with a stray '"'.
""""base_data_path = '/Users/guida/llm_argument_tasks/clean_data'
splits_path = '/Users/guida/llm_argument_tasks/run_all_k_shots/k-shots'
base_output_path = '/Users/guida/llm_argument_tasks/results_T1_llama_splits'

# Process ComArg datasets for Split5
run_splits_for_dataset('comarg', 'gm', base_data_path, splits_path, base_output_path, [1, 5])
run_splits_for_dataset('comarg', 'ugip', base_data_path, splits_path, base_output_path, [1, 5])

# Process YRU datasets for Split5
for topic in ['gayRights']:
    run_splits_for_dataset('yru', topic, base_data_path, splits_path, base_output_path, [1, 5])"""


Processing file: /Users/guida/llm_argument_tasks/results_T1_llama_splits/Split2/yru_gayRights_identification_gemini_1shot_split_2.jsonl


NameError: name 'process_yru_dataframe_comments' is not defined