<a href="https://colab.research.google.com/github/nicolaCirillo/ate-it/blob/main/baseline/subtask_b_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import codecs
import json

def load_data(file_path):
  """
  Loads terms from a CSV or JSON file and returns the set of unique terms.

  CSV files are expected to have a ``term`` column. JSON files are expected
  to have a top-level ``"data"`` list whose items each hold a ``"term_list"``
  list. Every term is stripped of surrounding whitespace; missing and blank
  terms are dropped.

  Args:
    file_path: The path to the input file (CSV or JSON).

  Returns:
    A set containing the unique, whitespace-stripped terms.

  Raises:
    ValueError: If the file format is not supported.
  """
  if file_path.endswith('.csv'):
    # Empty cells are read as NaN; drop them before converting to text
    raw_terms = pd.read_csv(file_path)['term'].dropna()
  elif file_path.endswith('.json'):
    with codecs.open(file_path, 'r', 'utf-8') as f:
      json_data = json.load(f)
    # Flatten the per-row term lists into a single stream of terms
    raw_terms = (term for row in json_data["data"] for term in row["term_list"])
  else:
    # Raise error for unsupported file formats
    raise ValueError("Unsupported file format. Only CSV and JSON files are supported.")

  # Strip first, then filter, so whitespace-only entries do not survive as ''.
  # str() keeps non-string cells (e.g. numeric terms) from crashing on .strip().
  stripped_terms = (str(term).strip() for term in raw_terms)
  return {term for term in stripped_terms if term}

In [None]:
import google.generativeai as genai
from getpass import getpass

# Ask for the API key without echoing it, so the secret is not left in the
# notebook's saved output. Strip whitespace that pasting may add.
api_key = getpass("Please enter your Gemini API key: ").strip()
genai.configure(api_key=api_key)

# Define the model
model = genai.GenerativeModel('gemini-2.5-flash')

In [None]:
# Smoke test: send one short prompt to confirm the key and model work
greeting = "Hello, how are you?"
response = model.generate_content(greeting)

# Show the text of the reply
print(response.text)

In [None]:
# Number of unclustered terms sent to the model in each request

batch_size = 100

# System prompt sent with every request. It is an f-string, so the current
# value of batch_size is interpolated once, when this cell runs.
# NOTE(review): the instruction "Cluster all {batch_size} terms" therefore
# always states the full batch size, even if a request carries fewer terms
# (e.g. the last, partial batch) — confirm this is acceptable.
system_prompt = f"""You are a term clustering agent.
You will receive a list of clusters and a list of unclustered terms related to municipal waste management.
Your task is to cluster together exact synonyms.
Each cluster must denote a single concept.

Output:
Return the list of clusters with the newly added terms. Each cluster must be on a new line.

Example Output:

term1; term2
term3
term4; term5; term7
term6; term8

Instructions:
* Group terms by meaning, not form. Use their lemma.
* Focus on their meaning within the municipal waste management context.
* Cluster all {batch_size} terms, if a term does not belong to an existing cluster, insert it in a new cluster.
* Do not remove terms from existing clusters.
* Your response must contain only the clustered terms in the specified format. Do not add any introductory text or explanations.

"""

In [None]:
# Load data
# Strip the typed path so stray whitespace does not break the extension check
filename = input("Please enter the path to the input file: ").strip()
# load_data returns a set, whose iteration order for strings changes between
# Python sessions. Sorting makes the batches sent to the model reproducible.
data = sorted(load_data(filename))

In [None]:
from tqdm.notebook import tqdm

# Raw text of every model response, in the order the requests were made
response_list = []
cluster_header = "CLUSTERS:\n"
term_header = "\nUNCLUSTERED TERMS:\n"
# Clusters built so far; starts as just the header (no clusters yet)
user_prompt_cluster = cluster_header


def _cluster_batch(clusters, terms):
  """Asks the model to merge `terms` into `clusters`.

  Args:
    clusters: The "CLUSTERS:" prompt section built so far.
    terms: The unclustered terms to add, one batch.

  Returns:
    The updated "CLUSTERS:" prompt section built from the model's answer.
  """
  # Every request carries both headers, so the model can always tell the
  # existing clusters apart from the new terms.
  user_prompt = clusters + term_header + "\n".join(terms)
  response = model.generate_content(
      f"System: {system_prompt}\nUser: {user_prompt}"
      )
  answer = response.text
  response_list.append(answer)
  # Trim surrounding blank lines so they are not read as empty clusters
  return cluster_header + answer.strip()


pending_terms = []
# iterate over terms
for term in tqdm(data):
  pending_terms.append(term)
  # When batch size is reached, send the batch and start a new one
  if len(pending_terms) == batch_size:
    user_prompt_cluster = _cluster_batch(user_prompt_cluster, pending_terms)
    pending_terms = []

# Process the remaining data; skip the request when nothing is left over
if pending_terms:
  user_prompt_cluster = _cluster_batch(user_prompt_cluster, pending_terms)

In [None]:
# Parse the final answer: one cluster per line, terms separated by ';'.
# The first line is the "CLUSTERS:" header and is skipped.
out_data = []
for cluster_id, cluster in enumerate(user_prompt_cluster.split('\n')[1:], start=1):
  for term in cluster.split(';'):
    term = term.strip()
    # Skip blank lines and stray separators instead of emitting '' as a term
    if term:
      out_data.append((term, cluster_id))

if len(out_data) != len(data):
  print("Output data was not the same length as input data.")
else:
  print("Output data is the same length as input data.")

# Add unclustered terms from input, each in its own new cluster.
# This runs even when the lengths match, because equal lengths do not
# guarantee that every input term is present in the output.
clustered_terms = {term for term, _ in out_data}
# Start fallback ids at 999 or above the highest id in use, so they never
# collide with a cluster returned by the model.
next_cluster_id = max([999] + [cluster_id + 1 for _, cluster_id in out_data])
for term in data:
  if term not in clustered_terms:
    out_data.append((term, next_cluster_id))
    clustered_terms.add(term)
    next_cluster_id += 1

In [None]:
import codecs
import json

# Build the output payload: one {"term", "cluster"} record per clustered
# term, all held under a top-level "data" key
records = [{"term": term, "cluster": cluster} for term, cluster in out_data]
json_data = {"data": records}

# Write UTF-8 JSON, keeping non-ASCII characters readable in the file
with codecs.open('baseline_b_1.json', 'w', 'utf-8') as out_file:
  json.dump(json_data, out_file, ensure_ascii=False, indent=4)