# Span Detection: UA-only implementation

1. Import dependencies

In [1]:
import csv
import pandas as pd
import multiprocess as mp
import time

from google import genai
from google.genai import types
from tqdm import tqdm

from src.definitions import (
    RAW_DATA_FOLDER,
)

2. Load dataset

In [2]:
# Read the span-detection dataset and keep only the rows whose
# `manipulative` column is True.
df = pd.read_parquet(RAW_DATA_FOLDER / "span-detection.parquet").query(
    "manipulative == True"
)

3. Extract input

In [3]:
def convert_to_input(df, i):
    """Return the ``(techniques, spans)`` pair for the row at position ``i``.

    Args:
        df: DataFrame with ``techniques`` and ``trigger_words`` columns.
        i: Positional (``iloc``) index of the row to convert.

    Returns:
        A tuple ``(techniques, spans)``: ``techniques`` is the row's value as
        stored, and ``spans`` is a list with one tuple per ``trigger_words``
        entry, built from that entry's ``.tolist()``.
    """
    record = df.iloc[i].to_dict()

    span_tuples = []
    for bounds in record["trigger_words"]:
        # Each entry exposes `.tolist()` (presumably a NumPy array); turn it
        # into a plain tuple of Python values.
        span_tuples.append(tuple(bounds.tolist()))

    return record["techniques"], span_tuples

In [4]:
# Build one (techniques, spans) pair per dataset row, in row order.
inputs = list(map(lambda row_pos: convert_to_input(df, row_pos), range(len(df))))

In [5]:
def split(a, n):
    """Cut ``a`` into ``n`` contiguous chunks whose lengths differ by at most one.

    The first ``len(a) % n`` chunks receive one extra element. When ``n`` is
    larger than ``len(a)`` the trailing chunks are empty.

    Args:
        a: Sliceable sequence to partition.
        n: Number of chunks to produce.

    Returns:
        A list of ``n`` slices of ``a`` that together cover it in order.
    """
    base_size, remainder = divmod(len(a), n)

    chunks = []
    start = 0
    for chunk_idx in range(n):
        # The leading `remainder` chunks absorb the leftover elements.
        stop = start + base_size + (1 if chunk_idx < remainder else 0)
        chunks.append(a[start:stop])
        start = stop
    return chunks

# Number of folds the inputs are partitioned into.
folds_count = 8
# Contiguous, near-equal chunks of `inputs`, one per fold.
input_folds = split(a=inputs, n=folds_count)

4. Make requests to the model

In [6]:
# Back-off schedule: seconds to wait before each successive retry.
retries = [4, 16, 32, 64, 128]


def call_generate_text_api(client, model, contents, config, fold):
    """Call ``client.models.generate_content`` with retries and return its text.

    The request is attempted once and then retried once per entry in the
    module-level ``retries`` schedule, sleeping for that many seconds before
    each retry. Every delay in the schedule is therefore used, and an empty
    schedule still results in exactly one attempt.

    Args:
        client: Object exposing ``models.generate_content(model=, contents=, config=)``.
        model: Model identifier forwarded to the API.
        contents: Request contents forwarded to the API.
        config: Generation config forwarded to the API.
        fold: Fold identifier, used only to tag the log messages.

    Returns:
        The ``text`` attribute of the first successful response.

    Raises:
        Exception: The error of the final attempt is re-raised once the
            retry schedule is exhausted.
    """
    # One initial attempt plus one retry per configured delay.
    max_attempts = len(retries) + 1
    for attempt in range(max_attempts):
        try:
            result = client.models.generate_content(
                model=model,
                contents=contents,
                config=config,
            )

            return result.text
        except Exception as err:
            print(f"[Fold {fold}] Unexpected {err=}, {type(err)=}")
            if attempt >= len(retries):
                # NOTE: despite the wording of the message, the error is
                # propagated to the caller rather than skipped.
                print(f"[Fold {fold}] Max retries reached. Skipping request.")
                # Bare `raise` keeps the original traceback intact.
                raise
            wait_time = retries[attempt]
            print(f"Retrying in {wait_time} seconds...")
            time.sleep(wait_time)


def generate(
    model_id,
    split_name,
    fold_index,
    *,
    project="17146036994",
    location="europe-north1",
):
    """Query the model for every input of one fold and append the results to a CSV.

    For each ``(techniques, spans)`` item in ``input_folds[fold_index]`` a
    prompt of the form ``"{techniques} ## {spans}"`` is sent to the endpoint
    and one row ``[techniques, spans, generated_text]`` is appended to
    ``RAW_DATA_FOLDER / "fine-tuned-gemini-ds-{split_name}-{fold_index}.csv"``.

    Args:
        model_id: Endpoint ID, inserted into the endpoint resource path.
        split_name: Label used in the output file name.
        fold_index: Index into ``input_folds`` selecting the fold to process;
            also used in the output file name and in log/progress messages.
        project: Project ID used for the client and the endpoint path.
        location: Region used for the client and the endpoint path.

    Raises:
        Exception: Whatever ``call_generate_text_api`` raises for a request;
            rows written before the failure remain in the file.
    """
    client = genai.Client(
        vertexai=True,
        project=project,
        location=location,
    )

    model = f"projects/{project}/locations/{location}/endpoints/{model_id}"
    config = types.GenerateContentConfig(
        temperature=1,
        top_p=0.95,
        max_output_tokens=8192,
        response_modalities=["TEXT"],
    )

    output_path = (
        RAW_DATA_FOLDER / f"fine-tuned-gemini-ds-{split_name}-{fold_index}.csv"
    )
    # newline="" is what the csv module requires so that line breaks embedded
    # in the generated text are written correctly on every platform; the
    # explicit UTF-8 encoding keeps non-ASCII output independent of the
    # platform default. Append mode means re-running a fold adds rows to the
    # existing file.
    with open(output_path, "a", newline="", encoding="utf-8") as the_file:
        writer = csv.writer(the_file)
        for techniques, spans in tqdm(
            input_folds[fold_index], desc=f"Fold {fold_index}"
        ):
            contents = [f"{techniques} ## {spans}"]
            text = call_generate_text_api(client, model, contents, config, fold_index)

            # Serialise each span as "start,end"; spans are joined with "|".
            span_str = [f"{s},{f}" for s, f in spans]

            writer.writerow([",".join(techniques), "|".join(span_str), text])

In [7]:
# Endpoint ID and split label for the RU run.
ru_model = "5871383296230817792"
ru_split = "ru"
# One (model_id, split_name, fold_index) argument tuple per fold.
params = [(ru_model, ru_split, fold) for fold in range(folds_count)]

# Process the folds in a pool with one worker process per fold.
with mp.Pool(processes=folds_count) as pool:
    pool.starmap(generate, params)

Fold 7: 100%|██████████| 323/323 [09:59<00:00,  1.86s/it]
Fold 2: 100%|██████████| 324/324 [10:21<00:00,  1.92s/it]
Fold 0: 100%|██████████| 324/324 [10:22<00:00,  1.92s/it]
Fold 3: 100%|██████████| 324/324 [10:40<00:00,  1.98s/it]
Fold 5: 100%|██████████| 323/323 [11:03<00:00,  2.06s/it]
Fold 6: 100%|██████████| 323/323 [11:10<00:00,  2.08s/it]
Fold 4: 100%|██████████| 324/324 [11:11<00:00,  2.07s/it]
Fold 1: 100%|██████████| 324/324 [11:35<00:00,  2.15s/it]


In [8]:
# Endpoint ID and split label for the UA run.
# NOTE(review): the `ru_` prefix looks like a leftover from the previous cell;
# these variables hold the values for the "ua" split.
ru_model = "5526857924736974848"
ru_split = "ua"
# One (model_id, split_name, fold_index) argument tuple per fold.
params = [(ru_model, ru_split, fold) for fold in range(folds_count)]

# Process the folds in a pool with one worker process per fold.
with mp.Pool(processes=folds_count) as pool:
    pool.starmap(generate, params)

Fold 1:   7%|▋         | 24/324 [00:48<09:40,  1.93s/it]

[Fold 6] Unexpected err=ServerError("500 INTERNAL. {'error': {'code': 500, 'message': 'Internal error encountered.', 'status': 'INTERNAL'}}"), type(err)=<class 'google.genai.errors.ServerError'>
Retrying in 4 seconds...


Fold 5:  20%|██        | 65/323 [02:40<12:25,  2.89s/it]  

[Fold 1] Unexpected err=ServerError("500 INTERNAL. {'error': {'code': 500, 'message': 'Internal error encountered.', 'status': 'INTERNAL'}}"), type(err)=<class 'google.genai.errors.ServerError'>
Retrying in 4 seconds...


Fold 4:  31%|███       | 99/324 [04:16<09:31,  2.54s/it]] 

[Fold 1] Unexpected err=ServerError("500 INTERNAL. {'error': {'code': 500, 'message': 'Internal error encountered.', 'status': 'INTERNAL'}}"), type(err)=<class 'google.genai.errors.ServerError'>
Retrying in 4 seconds...


Fold 3:  43%|████▎     | 138/324 [05:38<05:47,  1.87s/it]

[Fold 7] Unexpected err=ServerError("500 INTERNAL. {'error': {'code': 500, 'message': 'Internal error encountered.', 'status': 'INTERNAL'}}"), type(err)=<class 'google.genai.errors.ServerError'>
Retrying in 4 seconds...


Fold 3:  55%|█████▌    | 179/324 [07:10<04:42,  1.95s/it]

[Fold 4] Unexpected err=ServerError("500 INTERNAL. {'error': {'code': 500, 'message': 'Internal error encountered.', 'status': 'INTERNAL'}}"), type(err)=<class 'google.genai.errors.ServerError'>
Retrying in 4 seconds...


Fold 2: 100%|██████████| 324/324 [11:56<00:00,  2.21s/it]
Fold 0: 100%|██████████| 324/324 [12:13<00:00,  2.26s/it]
Fold 5: 100%|██████████| 323/323 [12:33<00:00,  2.33s/it]
Fold 7: 100%|██████████| 323/323 [12:56<00:00,  2.41s/it]
Fold 3: 100%|██████████| 324/324 [12:57<00:00,  2.40s/it]
Fold 4: 100%|██████████| 324/324 [13:12<00:00,  2.45s/it]
Fold 1: 100%|██████████| 324/324 [13:44<00:00,  2.54s/it]
Fold 6: 100%|██████████| 323/323 [13:53<00:00,  2.58s/it]
