## 1. Install dependencies

In [1]:
!pip install PyEvALL
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.metrics.metricfactory import MetricFactory
from pyevall.utils.utils import PyEvALLUtils
import pandas as pd
import json
import re
import time
import math
import os
import google.generativeai as genai
import random



## 2. JSON manipulation
This section contains the functions used to process the dataset and store the answers given by the model

In [2]:
def json_df(json_location, dataframe_name=None):
    """
    Load a JSON file and convert it into a pandas DataFrame.

    The JSON document must be an object; each top-level key becomes one row
    label of the resulting DataFrame (``orient='index'``).

    Parameters:
    json_location (str): The file path of the JSON file.
    dataframe_name (str, optional): Ignored. Kept only so that existing calls
        passing a name keep working; a function cannot bind a variable name
        in the caller's scope.

    Returns:
    pandas.DataFrame: The DataFrame created from the JSON data.
    """
    # Decode explicitly as UTF-8 so that non-ASCII text loads the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(json_location, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return pd.DataFrame.from_dict(data, orient='index')

def load_prompt(file_path):
    """
    Read a prompt from a text file.

    Only the first line of the file is read; leading and trailing whitespace
    (including the line terminator) is removed from it.

    Args:
        file_path (str): The path to the file containing the prompt.

    Returns:
        str: The stripped first line, or an empty string for an empty file.

    """
    with open(file_path, 'r') as handle:
        first_line = handle.readline()
    return first_line.strip()

def save_responses(responses_dict, output_dir='.', base_filename='NO_FILE_NAME_RMIT', increment_filename=True):
    """
    Write the values of a responses dictionary to a JSON file as a list.

    The dictionary keys are dropped; only the values are serialised, in
    insertion order. The output directory is created when missing. An I/O
    failure while writing is reported on stdout rather than raised.

    Args:
        responses_dict (dict): A dictionary containing the responses to be saved.
        output_dir (str, optional): The directory where the JSON file will be saved. Defaults to '.'.
        base_filename (str, optional): The base filename for the JSON file. Defaults to 'NO_FILE_NAME_RMIT'.
        increment_filename (bool, optional): When True the name is obtained from
            get_next_filename(); when False the file is "{base_filename}.json". Defaults to True.

    Raises:
        ValueError: If responses_dict is not a dictionary.

    """
    if not isinstance(responses_dict, dict):
        raise ValueError("responses_dict must be a dictionary.")

    os.makedirs(output_dir, exist_ok=True)

    if increment_filename:
        filename = get_next_filename(output_dir, base_filename)
    else:
        filename = f"{base_filename}.json"
    destination = os.path.join(output_dir, filename)

    # Only the values are dumped; the keys are not written to the file
    records = [*responses_dict.values()]

    try:
        with open(destination, 'w') as sink:
            json.dump(records, sink, indent=4)
        print(f"Data saved to {filename}")
    except IOError as e:
        print(f"Failed to save data: {e}")

def get_next_filename(output_dir, base_filename):
    """
    Get the next available filename for a given base filename in the specified output directory.

    Existing files named exactly "{base_filename}_{number}.json" are scanned and
    the returned name uses the highest number found plus one (1 when there is
    no such file).

    Args:
        output_dir (str): The directory where the files are stored. It must exist.
        base_filename (str): The base filename to be used.

    Returns:
        str: The next available filename in the format "{base_filename}_{number}.json".

    """
    # The dot is escaped and the whole name must match, so look-alikes such as
    # "base_9.json.bak" or "base_7xjson" no longer bump the counter.
    pattern = re.compile(rf"{re.escape(base_filename)}_(\d+)\.json")
    matches = map(pattern.fullmatch, os.listdir(output_dir))
    max_number = max((int(m.group(1)) for m in matches if m), default=0)
    return f"{base_filename}_{max_number + 1}.json"


## 3. Query the model
This section contains the functions used to submit the tweets to Gemini and get classification answers based on the task and evaluation method. Before running, configure a valid API key.

In [3]:
def get_answer(tweet, eval, prompt, df, idx, task):
    """
    Returns a response from the Gemini API based on the given parameters.

    The request text is the concatenation of a fixed background on sexism and
    the label definitions, a sentence describing the annotators of the row,
    the answer-format instructions for the (eval, task) pair, and the tweet
    wrapped in "####" delimiters.

    The API key is read from the GOOGLE_API_KEY environment variable.

    Args:
        tweet (str): The tweet for which the response is requested.
        eval (str): The evaluation mode ('hard' or 'soft').
        prompt (str): Currently unused; the prompt text is built inside this
            function. Kept so existing callers keep working.
        df (pandas.DataFrame): The DataFrame containing the
            'study_levels_annotators' and 'gender_annotators' columns.
        idx (int): The position (0-based row number) of the current row in the DataFrame.
        task (int): The task number (1, 2 or 3).

    Returns:
        dict: A dictionary with the keys 'tweet' and 'response' (the text returned by the API).

    Raises:
        RuntimeError: If the GOOGLE_API_KEY environment variable is not set.
        Exception: If an error occurs during the API request.

    """
    delimiter = "####"

    # Background and label definitions sent with every request. The label is
    # spelled JUDGEMENTAL here so that it matches the label the model is asked
    # to answer with below.
    prompt_value = """Sexism is defined as prejudice or discrimination based on sex or gender, especially against women and girls. Although its origin is unclear, the term sexism emerged from the “second-wave” feminism of the 1960s through ’80s and was most likely modeled on the civil rights movement’s term racism (prejudice or discrimination based on race). Sexism can be a belief that one sex is superior to or more valuable than another sex. It imposes limits on what men and boys can and should do and what women and girls can and should do. The concept of sexism was originally formulated to raise consciousness about the oppression of girls and women, although by the early 21st century it had sometimes been expanded to include the oppression of any sex, including men and boys, intersex people, and transgender people. A text is a DIRECT sexist message if the intention is to write a message that is sexist by itself or incites to be sexist. A text is a REPORTED sexist message if the intention is to report and share a sexist situation suffered by a woman or women in first or third person. A text is JUDGEMENTAL if the intention is to condemn sexist situations or behaviours. A sexist text is labeled as IDEOLOGICAL-INEQUALITY if it includes messages that discredit the feminist movement or if it includes messages that reject inequality between men and women or if it presents men as victims of gender-based oppression. A sexist text is labeled as STEREOTYPING-DOMINANCE if it includes messages that express false ideas about women that suggest they are more suitable or inappropriate for certain tasks, and somehow inferior to men. A sexist text is labeled as OBJECTIFICATION if it includes messages where women are presented as objects apart from their dignity and personal aspects or if it includes messages that assume or describe certain physical qualities that women must have in order to fulfill traditional gender roles. 
A sexist text is labeled as SEXUAL-VIOLENCE if it includes messages where sexual suggestions, requests or harassment of a sexual nature (rape or sexual assault) are made. A sexist text is labeled as MISOGYNY-NON-SEXUAL-VIOLENCE if it includes expressions of hatred and violence towards women. You are a robot who detects sexism from text given in the prompt."""

    # idx is a row position, so use .iloc: integer keys passed to [] on a
    # label-indexed Series are a deprecated positional fallback in pandas.
    study_levels = df['study_levels_annotators'].iloc[idx]
    genders = df['gender_annotators'].iloc[idx]
    column_value = f"""For each response, consider the perspective of individuals representing the following study levels: {study_levels} and gender: {genders}."""

    # Answer-format instructions, keyed by (evaluation mode, task number)
    instructions = {
        ('hard', 1): """Give me 1 answer with [NO] or [YES]. If the text is sexist answer [YES]. If the text is not sexist answer [NO]. Example of the syntax of the answer: [YES]""",
        ('hard', 2): """Give me 1 answer with [NO], [DIRECT], [REPORTED] or [JUDGEMENTAL]. If the text is sexist classify it. If the text is not sexist answer [NO]. Example of the syntax of the answer: [DIRECT]""",
        ('hard', 3): """Give me a list of 1 to 5 answers separated by commas. If the text is sexist, classify it. If the text is not sexist answer [NO]. This is a multi-label task, so that more than one of the following labels may be assigned to each sexist tweet: [IDEOLOGICAL-INEQUALITY], [STEREOTYPING-DOMINANCE], [OBJECTIFICATION], [SEXUAL-VIOLENCE] or [MISOGYNY-NON-SEXUAL-VIOLENCE]. Example of the syntax of the answer: [OBJECTIFICATION], [SEXUAL-VIOLENCE]""",
        ('soft', 1): """Give me a list of 6 answers with NO or YES reflecting the perspective of each individual. If an individual judges the text as sexist his or her perspective is [YES]. If an individual judges the text as not sexist his or her perspective is [NO]. Example of the syntax of the answer: [NO], [YES], [NO], [YES], [YES], [YES]""",
        ('soft', 2): """Give me a list of 6 answers with [NO], [DIRECT], [REPORTED] or [JUDGEMENTAL] reflecting the perspective of each individual and using commas for each answer. If an individual judges the text as sexist his or her perspective is only one label among [DIRECT], [REPORTED] or [JUDGEMENTAL]. If an individual judges the text as not sexist his or her perspective is [NO]. Example of the syntax of the answer: [NO], [DIRECT], [REPORTED], [JUDGEMENTAL], [JUDGEMENTAL], [NO]""",
        ('soft', 3): """Give me a list of 6 answers with NO, IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINANCE, OBJECTIFICATION, SEXUAL-VIOLENCE, or MISOGYNY-NON-SEXUAL-VIOLENCE reflecting the perspective of each individual and using commas for each answer. If an individual judges the text as sexist his or her perspective is only one label among [IDEOLOGICAL-INEQUALITY], [STEREOTYPING-DOMINANCE], [OBJECTIFICATION], [SEXUAL-VIOLENCE], [MISOGYNY-NON-SEXUAL-VIOLENCE]. If an individual judges the text as not sexist his or her perspective is [NO]. Example of the syntax of the answer: [NO], [IDEOLOGICAL-INEQUALITY], [STEREOTYPING-DOMINANCE], [OBJECTIFICATION], [SEXUAL-VIOLENCE], [MISOGYNY-NON-SEXUAL-VIOLENCE]""",
    }
    # Unknown (eval, task) pairs send no format instructions
    response_value = instructions.get((eval, task), "")

    # The tweets under study are expected to contain offensive content, so
    # every safety filter is switched off for these requests.
    safe = [
        {"category": category, "threshold": "BLOCK_NONE"}
        for category in (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
    ]

    # Never keep the key in the source: read it from the environment. Any key
    # that was ever committed to this file should be treated as leaked and revoked.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("Set the GOOGLE_API_KEY environment variable to a valid Gemini API key.")
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel('gemini-pro')

    answer = model.generate_content(
        f"""{prompt_value} {column_value} {response_value} {delimiter} {tweet} {delimiter}""",
        safety_settings=safe,
        generation_config=genai.types.GenerationConfig(
            max_output_tokens=None,
            temperature=1.0,
        )
    ).text

    res = {
        'tweet': tweet,
        'response': answer
    }

    # Progress log: the tweet followed by the raw answer
    print("----")
    print(tweet)
    print(answer)
    print("----")

    return res

## 4. Normalize answers
This section contains the functions used to format the answers properly. For the soft evaluation, it computes the probability associated with each label.

In [4]:
def normalize_response(response, eval, task):
    """
    Turn a raw model answer into the structure used for evaluation.

    Square brackets are removed, the text is split on commas and every piece
    is stripped and upper-cased. For the 'soft' evaluation the pieces that are
    valid labels for the task are counted and the counts are divided by the
    number of valid labels found; any other evaluation value returns the list
    of pieces unchanged.

    Args:
        response (str): The response to be normalized.
        eval (str): The evaluation method ('soft' or 'hard').
        task (int): The task number (1, 2, or 3).

    Returns:
        dict or list: If eval is 'soft', a dictionary mapping each label of the task
                      to its share of the recognised answers (all zeros when no
                      answer is recognised, empty for an unknown task).
                      Otherwise, the list of cleaned answers.

    """
    stripped_of_brackets = response.replace('[', '').replace(']', '')
    labels = [piece.strip().upper() for piece in stripped_of_brackets.split(',')]

    if eval != 'soft':
        return labels

    # Labels accepted for each task, in the order they appear in the result
    label_sets = (
        (1, ('NO', 'YES')),
        (2, ('NO', 'DIRECT', 'REPORTED', 'JUDGEMENTAL')),
        (3, ('NO', 'IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE', 'OBJECTIFICATION',
             'SEXUAL-VIOLENCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE')),
    )
    accepted = next((names for number, names in label_sets if task == number), None)
    if accepted is None:
        # Unknown task: show what was parsed and return nothing
        print(labels)
        return {}

    tally = dict.fromkeys(accepted, 0)
    for label in labels:
        if label in tally:
            tally[label] += 1

    # Unrecognised answers are ignored, so shares are relative to the
    # recognised ones only
    recognised = sum(tally.values())
    if recognised > 0:
        tally = {name: hits / recognised for name, hits in tally.items()}
    return tally

## 5. Pipeline definition
This section creates the pipeline for data ingestion, classification and predictions storage

In [5]:
def main(parameters):
    """
    Classify every tweet of a dataframe, one request at a time.

    For each row the tweet is sent to get_answer(), the answer is normalized
    with normalize_response(), and both the normalized and the raw answer are
    stored under the row's 'id_EXIST' value. The loop pauses between requests
    to respect the API rate limit.

    Args:
        parameters (dict): A dictionary containing the following keys:
            - dataframe: The dataframe containing the 'tweet' and 'id_EXIST' columns.
            - prompt: The prompt value forwarded to get_answer().
            - eval: The evaluation mode for the response ('hard' or 'soft').
            - task: The task number for processing the tweets (1, 2 or 3).

    Returns:
        A tuple containing two dictionaries:
        - responses_dict: A dictionary mapping row IDs to normalized response values.
        - raw_responses: A dictionary mapping row IDs to raw response values.
        Both are empty when the task number is not supported.
    """
    df = parameters['dataframe']
    tweets = df['tweet'].to_list()
    prompt = parameters['prompt']
    eval_mode = parameters['eval']
    task = parameters['task']

    responses_dict = {}
    raw_responses = {}  # Dictionary to store non-normalized responses

    if task not in (1, 2, 3):
        # Nothing would be requested for such a task, so do not loop (and
        # sleep) over the whole dataset only to return empty results.
        print(f"Unsupported task {task!r}: expected 1, 2 or 3. No request was sent.")
        return responses_dict, raw_responses

    for i, tweet in enumerate(tweets):
        response = get_answer(tweet, eval_mode, prompt, df, i, task)
        if 'response' in response:
            normalized_values = normalize_response(response['response'], eval_mode, task)
            # Take the id of the current row by position: looking it up by
            # tweet text rescans the frame for every tweet and returns the
            # wrong id when the same text occurs more than once.
            row_id = df['id_EXIST'].iloc[i]
            responses_dict[row_id] = {
                'id': row_id,
                'value': normalized_values,
                'test_case': "EXIST2024"
            }
            # Store raw response (for every task, not only task 1)
            raw_responses[row_id] = {
                'id': row_id,
                'response': response['response'],
                'test_case': "EXIST2024"
            }
        else:
            print(f"Error or unexpected format in response for tweet ID {tweet}: {response}")

        # Sleep to spread requests evenly across the rate limit period (Gemini: 15 RPM)
        time.sleep(5)

    return responses_dict, raw_responses

## 6. Main

To cope with the constraints of the free Gemini API key, we extract a random subset of the development set and make predictions on that subset only.

In [6]:
# Number of development-set entries to sample for this run
N = 200

with open('/content/EXIST2024_dev.json', 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# random.sample raises ValueError when asked for more items than available,
# so never request more than the dataset holds.
sample_size = min(N, len(dataset))
random_subset = dict(random.sample(list(dataset.items()), sample_size))

# Write the subset to the same absolute location the following cells read it
# from, instead of relying on the working directory being /content.
with open("/content/EXIST2024_dev_subset.json", "w") as json_file:
    json.dump(random_subset, json_file, indent=4)

- task: The task to run (1, 2 or 3).
- eval: The evaluation type ('soft' or 'hard').
- prompt: Path to a text file holding the prompt, for when you would like to change the prompt from a file instead of in the code.
- dataframe: The DataFrame created from the subset file, passed through to the pipeline in this section.

In [None]:
# Subset of the development set produced by the sampling cell
json_location = '/content/EXIST2024_dev_subset.json'
test_case = "EXIST2024"
df = json_df(json_location, 'df')

# Run configuration: task number, evaluation mode, prompt file and input data
test_params = dict(
    task=3,
    eval='hard',
    prompt='Prompts/prompt.txt',
    dataframe=df,
)
base_filename = "task{}_{}".format(test_params['task'], test_params['eval'])

responses, raw_responses = main(test_params)

# Save the normalized responses first, then the raw ones; an empty result is
# not written at all.
for payload, target_dir in (
    (responses, 'test_formats/gemini-pro/test'),
    (raw_responses, 'raw_formats/gemini-pro/test'),
):
    if payload:
        save_responses(payload, output_dir=target_dir, base_filename=base_filename)

  column_value = f"""For each response, consider the perspective of individuals representing the following study levels: {df.study_levels_annotators[idx]} and gender: {df.gender_annotators[idx]}."""


----
@betty47866399 No te debes avergonzar, es un trabajo digno. Y si te dicen lo contrario es por que no saben valorar lo que hace el resto.
[NO]
----
----
@Fichinescu La comunidad gamer es un antro de misóginos y supremacistas blancos hace años pero hoy día pocos se acuerdan de gamergate y todo lo que causó, y en cierto sentido sigue causando.
[DEOGICAL-INEQUALITY], [MISOGYNY-NON-SEXUAL-VIOLENCE]
----
----
Mientras no se detengan las decisiones que toman los grupos de poder a nivel mundial,la sociedad pagará caro por la cobardía de no levantarse y detenerlos a como de lugar,debemos demostrarles que no les vamos a tolerar mas,a pesar de los borregos vamos a luchar hasta su final... https://t.co/4yrFr18IOr
[NO]
----
----
@Kiwiwifer @xeritdelvalle @peemea68 @Bizkaia El cristianismo no dice "hay q abusar de los niños", el islam si permite azotar mujeres y colgar homosexuales ..ves la diferencia?
[MISOGYNY-NON-SEXUAL-VIOLENCE], [STEREOTYPING-DOMINANCE]
----
----
En nuestra provinci

# Evaluation

In [None]:
# Gets the ground truths for the tweets in the test subset
golds_path = f"/content/EXIST2024_dev_task{test_params['task']}_gold_{test_params['eval']}.json"

# The keys of the subset file are the ids of the sampled tweets; a set gives
# constant-time membership tests while filtering the gold file.
with open('/content/EXIST2024_dev_subset.json', 'r', encoding='utf-8') as j:
    choosen_ids = set(json.load(j))

with open(golds_path, 'r', encoding='utf-8') as j:
    data = json.load(j)

result = [el for el in data if el['id'] in choosen_ids]

with open(f"/content/EXIST2024_dev_task{test_params['task']}_gold_{test_params['eval']}_subset.json", 'w') as file:
    json.dump(result, file, indent=4)

# Path to the prediction file
# NOTE(review): the "_1" suffix is fixed, while save_responses() numbers each
# new run incrementally - confirm this is the run you want to evaluate.
file_path = f"/content/test_formats/gemini-pro/test/task{test_params['task']}_{test_params['eval']}_1.json"
# Sort the predictions by increasing value of id in order to align preds and golds
with open(file_path, 'r', encoding='utf-8') as file:
    data = sorted(json.load(file), key=lambda x: int(x['id']))

# For the single-label hard tasks the value is stored as a one-element list;
# replace the list with its single element.
unwrap_single_label = test_params['eval'] == 'hard' and test_params['task'] != 3

reordered_data = []
for entry in data:
    if unwrap_single_label and isinstance(entry['value'], list) and entry['value']:
        entry['value'] = entry['value'][0]
    # Emit the keys in the order test_case, id, value
    reordered_data.append({"test_case": entry["test_case"], "id": entry["id"], "value": entry["value"]})

# Overwrite the original file once with the sorted, transformed and reordered data
with open(file_path, 'w') as file:
    json.dump(reordered_data, file, indent=4)

In [None]:
test = PyEvALLEvaluation()

task_id = test_params['task']
eval_mode = test_params['eval']

# Metric selection: the hard evaluation uses the same three metrics for every
# task; the soft evaluation adds cross entropy for tasks 1 and 2 only.
hard_metrics = [MetricFactory.ICM.value, MetricFactory.ICMNorm.value, MetricFactory.FMeasure.value]
soft_metrics = [MetricFactory.ICMSoft.value, MetricFactory.ICMSoftNorm.value]

metrics = []
if task_id == 1 or task_id == 2 or task_id == 3:
    if eval_mode == 'soft':
        metrics = list(soft_metrics)
        if task_id != 3:
            metrics.append(MetricFactory.CrossEntropy.value)
    elif eval_mode == 'hard':
        metrics = list(hard_metrics)

predictions = "/content/test_formats/gemini-pro/test/task{}_{}_1.json".format(task_id, eval_mode)
gold = "/content/EXIST2024_dev_task{}_gold_{}_subset.json".format(task_id, eval_mode)

# Tasks 2 and 3 pass a label hierarchy to the evaluation (fine-grained labels
# listed under "YES"); task 1 passes no extra parameter.
params = {}
if task_id == 2:
    params[PyEvALLUtils.PARAM_HIERARCHY] = {
        "YES": ["DIRECT", "REPORTED", "JUDGEMENTAL"],
        "NO": [],
    }
elif task_id == 3:
    params[PyEvALLUtils.PARAM_HIERARCHY] = {
        "YES": [
            "IDEOLOGICAL-INEQUALITY",
            "STEREOTYPING-DOMINANCE",
            "OBJECTIFICATION",
            "SEXUAL-VIOLENCE",
            "MISOGYNY-NON-SEXUAL-VIOLENCE",
        ],
        "NO": [],
    }

report = test.evaluate(predictions, gold, metrics, **params)
report.print_report()