In [45]:
import pandas as pd
import numpy as np
from google import genai
from google.genai import types
import json
from tqdm import tqdm
from itertools import cycle
import time

inputdata_file = 'data/merged_dataset_metadata.csv'
outputdata_file ='data/merged_dataset_metadata_labelled.csv'

with open("data/apikeys.json") as f:
    config = json.load(f)
API_KEYS = config["GOOGLE_API_KEYS"]
API_KEYS_CYCLE = cycle(API_KEYS)

In [46]:
# Load dataset
df = pd.read_csv(inputdata_file, low_memory=False)
# Random sampling of 1000 rows
# df = df.sample(n=1000, random_state=42)

# Initialize output column
df['mobilitydata_labelled'] = None

# Assigning an index to each row based on the presence of content in specific cells

# Index 1:
df['iteration_index'] = np.where(
    (~df['dataset_description_DE'].isna() & (df['dataset_description_DE'].astype(str).str.strip() != '')) &
    (~df['dataset_title_DE'].isna() & (df['dataset_title_DE'].astype(str).str.strip() != '')),
    1,
    None
)
print(f"Index 1: Number of rows with only filled dataset_description_DE and dataset_title_DE: {(df['iteration_index'] == 1).sum()}")

# Index 2:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_DE'].isna() & (df['dataset_description_DE'].astype(str).str.strip() != '')),
    2,
    df['iteration_index']
)
print(f"Index 2: Remaining number of rows with only filled dataset_description_DE: {(df['iteration_index'] == 2).sum()}")

# Index 3:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_EN'].isna() & (df['dataset_description_EN'].astype(str).str.strip() != '')) &
    (~df['dataset_title_EN'].isna() & (df['dataset_title_EN'].astype(str).str.strip() != '')),
    3,
    df['iteration_index']
)
print(f"Index 3: Remaining number of rows with only filled dataset_description_EN and dataset_title_EN: {(df['iteration_index'] == 3).sum()}")

# Index 4:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_EN'].isna() & (df['dataset_description_EN'].astype(str).str.strip() != '')),
    4,
    df['iteration_index']
)
print(f"Index 4: Remaining number of rows with only filled dataset_description_EN: {(df['iteration_index'] == 4).sum()}")

# Index 5:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_FR'].isna() & (df['dataset_description_FR'].astype(str).str.strip() != '')) &
    (~df['dataset_title_FR'].isna() & (df['dataset_title_FR'].astype(str).str.strip() != '')),
    5,
    df['iteration_index']
)
print(f"Index 5: Remaining number of rows with only filled dataset_description_FR and dataset_title_FR: {(df['iteration_index'] == 5).sum()}")

# Index 6:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_FR'].isna() & (df['dataset_description_FR'].astype(str).str.strip() != '')),
    6,
    df['iteration_index']
)
print(f"Index 6: Remaining number of rows with only filled dataset_description_FR: {(df['iteration_index'] == 6).sum()}")

# Index 7:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_IT'].isna() & (df['dataset_description_IT'].astype(str).str.strip() != '')) &
    (~df['dataset_title_IT'].isna() & (df['dataset_title_IT'].astype(str).str.strip() != '')),
    7,
    df['iteration_index']
)
print(f"Index 7: Remaining number of rows with only filled dataset_description_IT and dataset_title_IT: {(df['iteration_index'] == 7).sum()}")

# Index 8:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_IT'].isna() & (df['dataset_description_IT'].astype(str).str.strip() != '')),
    8,
    df['iteration_index']
)
print(f"Index 8: Remaining number of rows with only filled dataset_description_IT: {(df['iteration_index'] == 8).sum()}")

# Index 9:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description'].isna() & (df['dataset_description'].astype(str).str.strip() != '')) &
    (~df['dataset_title'].isna() & (df['dataset_title'].astype(str).str.strip() != '')),
    9,
    df['iteration_index']
)
print(f"Index 9: Remaining number of rows with only filled dataset_description and dataset_title: {(df['iteration_index'] == 9).sum()}")

# Index 10:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description'].isna() & (df['dataset_description'].astype(str).str.strip() != '')),
    10,
    df['iteration_index']
)
print(f"Index 10: Remaining number of rows with only filled dataset_description: {(df['iteration_index'] == 10).sum()}")

# Index 11:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_title_DE'].isna() & (df['dataset_title_DE'].astype(str).str.strip() != '')),
    11,
    df['iteration_index']
)
print(f"Index 11: Remaining number of rows with only filled dataset_title_DE: {(df['iteration_index'] == 11).sum()}")

# Index 12:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_title_EN'].isna() & (df['dataset_title_EN'].astype(str).str.strip() != '')),
    12,
    df['iteration_index']
)
print(f"Index 12: Remaining number of rows with only filled dataset_title_EN: {(df['iteration_index'] == 12).sum()}")

# Index 13:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_title_FR'].isna() & (df['dataset_title_FR'].astype(str).str.strip() != '')),
    13,
    df['iteration_index']
)
print(f"Index 13: Remaining number of rows with only filled dataset_title_FR: {(df['iteration_index'] == 13).sum()}")

# Index 14:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_title_IT'].isna() & (df['dataset_title_IT'].astype(str).str.strip() != '')),
    14,
    df['iteration_index']
)
print(f"Index 14: Remaining number of rows with only filled dataset_title_IT: {(df['iteration_index'] == 14).sum()}")

# Index 15:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_title'].isna() & (df['dataset_title'].astype(str).str.strip() != '')),
    15,
    df['iteration_index']
)
print(f"Index 15: Remaining number of rows with only filled dataset_title: {(df['iteration_index'] == 15).sum()}")

# Index 16:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_UNKNOWN'].isna() & (df['dataset_description_UNKNOWN'].astype(str).str.strip() != '')) &
    (~df['dataset_title_UNKNOWN'].isna() & (df['dataset_title_UNKNOWN'].astype(str).str.strip() != '')),
    16,
    df['iteration_index']
)
print(f"Index 16: Remaining number of rows with only filled dataset_description_UNKNOWN and dataset_title_UNKNOWN: {(df['iteration_index'] == 16).sum()}")

# Index 17:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_description_UNKNOWN'].isna() & (df['dataset_description_UNKNOWN'].astype(str).str.strip() != '')),
    17,
    df['iteration_index']
)
print(f"Index 17: Remaining number of rows with only filled dataset_description_UNKNOWN: {(df['iteration_index'] == 17).sum()}")

# Index 18:
df['iteration_index'] = np.where(
    df['iteration_index'].isna() &
    (~df['dataset_title_UNKNOWN'].isna() & (df['dataset_title_UNKNOWN'].astype(str).str.strip() != '')),
    18,
    df['iteration_index']
)
print(f"Index 18: Remaining number of rows with only filled dataset_title_UNKNOWN: {(df['iteration_index'] == 18).sum()}")

# Check if all rows have been assigned an index.
if (df['iteration_index'] < 20).sum() < len(df):
    missing_rows = df[df['iteration_index'].isna()]
    print("ERROR: Not all rows have been assigned an iteration_index.")
    print(f"Number of rows without index: {len(missing_rows)}")
    print("Example rows without index:")
    print(missing_rows.head(5))
else:
    print(f"All rows ({(df['iteration_index'] < 20).sum()}/{len(df)}) have been successfully assigned an iteration_index.")

Index 1: Number of rows with only filled dataset_description_DE and dataset_title_DE: 20213
Index 2: Remaining number of rows with only filled dataset_description_DE: 68
Index 3: Remaining number of rows with only filled dataset_description_EN and dataset_title_EN: 500
Index 4: Remaining number of rows with only filled dataset_description_EN: 0
Index 5: Remaining number of rows with only filled dataset_description_FR and dataset_title_FR: 1304
Index 6: Remaining number of rows with only filled dataset_description_FR: 0
Index 7: Remaining number of rows with only filled dataset_description_IT and dataset_title_IT: 30
Index 8: Remaining number of rows with only filled dataset_description_IT: 0
Index 9: Remaining number of rows with only filled dataset_description and dataset_title: 4611
Index 10: Remaining number of rows with only filled dataset_description: 0
Index 11: Remaining number of rows with only filled dataset_title_DE: 1697
Index 12: Remaining number of rows with only filled da

In [47]:
key_count = len(API_KEYS)
chunk_size = 5 # Number of rows sent to the LLM at once (best results with 1, but not suitable for free access/trial limits)
requests_per_key = 15 # Number of requests per key in a minute 

current_key_index = 0
key_request_counter = 0
cycle_start_time = time.time()

# Different chunk lines depending on the iteration index
group_chunk_lines = {
    "1": lambda row: f"Titel: {row['dataset_title_DE']}\nBeschreibung: {row['dataset_description_DE']}",
    "2": lambda row: f"Beschreibung: {row['dataset_description_DE']}",
    "3": lambda row: f"Titel: {row['dataset_title_EN']}\nBeschreibung: {row['dataset_description_EN']}",
    "4": lambda row: f"Beschreibung: {row['dataset_description_EN']}",
    "5": lambda row: f"Titel: {row['dataset_title_FR']}\nBeschreibung: {row['dataset_description_FR']}",
    "6": lambda row: f"Beschreibung: {row['dataset_description_FR']}",
    "7": lambda row: f"Titel: {row['dataset_title_IT']}\nBeschreibung: {row['dataset_description_IT']}",
    "8": lambda row: f"Beschreibung: {row['dataset_description_IT']}",
    "9": lambda row: f"Titel: {row['dataset_title']}\nBeschreibung: {row['dataset_description']}",
    "10": lambda row: f"Beschreibung: {row['dataset_description']}",
    "11": lambda row: f"Titel: {row['dataset_title_DE']}",
    "12": lambda row: f"Titel: {row['dataset_title_EN']}",
    "13": lambda row: f"Titel: {row['dataset_title_FR']}",
    "14": lambda row: f"Titel: {row['dataset_title_IT']}",
    "15": lambda row: f"Titel: {row['dataset_title']}",
    "16": lambda row: f"Titel: {row['dataset_title_UNKNOWN']}\nBeschreibung: {row['dataset_description_UNKNOWN']}",
    "17": lambda row: f"Beschreibung: {row['dataset_description_UNKNOWN']}",
    "18": lambda row: f"Titel: {row['dataset_title_UNKNOWN']}",
}

relevant_columns = [
    'dataset_title_DE', 'dataset_description_DE',
    'dataset_title_EN', 'dataset_description_EN',
    'dataset_title_FR', 'dataset_description_FR',
    'dataset_title_IT', 'dataset_description_IT',
    'dataset_title', 'dataset_description',
    'dataset_title_UNKNOWN', 'dataset_description_UNKNOWN'
]

# Group by the 'index' column (or any other grouping criteria)
for group_name, group_df in df.groupby('iteration_index'):
    print(f"Processing group: {group_name} with {len(group_df)} entries")
    
    # Iterate in chunks within this group
    for i in tqdm(range(0, len(group_df), chunk_size)):
        if key_request_counter >= requests_per_key:
            current_key_index += 1
            key_request_counter = 0

            if current_key_index >= key_count:
                elapsed = time.time() - cycle_start_time
                if elapsed < 60:
                    wait_time = int(60 - elapsed)
                    print(f"Maximum requests per minute reached. Waiting {wait_time} seconds...")
                    time.sleep(wait_time + 1)
                current_key_index = 0
                cycle_start_time = time.time()

        CURRENT_API_KEY = API_KEYS[current_key_index]
        client = genai.Client(api_key=CURRENT_API_KEY)

        chunk_df = group_df.iloc[i:i + chunk_size][relevant_columns]

        formatter = group_chunk_lines[str(group_name)]
        chunk_lines = chunk_df.apply(formatter, axis=1).tolist()

        prompt = "Handelt es sich bei folgendem Inhalt um Verkehrs- oder Mobilitätsdaten? Antworte nur mit T (True) oder F (False) Zeilenweise.\n\n" + "\n\n".join(chunk_lines)

        # Submit the prompt to the Gemini model
        response = client.models.generate_content_stream(
            model="gemini-2.0-flash-lite-001",
            contents=[prompt],
            config=types.GenerateContentConfig(
                max_output_tokens=chunk_size * 2,
                temperature=0
            )
        )

        result_text = ""
        for chunk in response:
            result_text += chunk.text

        predictions = result_text.strip().splitlines()

        if len(predictions) != len(chunk_df):
            predictions = [] 
            df.loc[chunk_df.index, 'mobilitydata_labelled'] = 'ERROR'
            continue

        target_indices = chunk_df.index
        df.loc[target_indices, 'mobilitydata_labelled'] = predictions

        key_request_counter += 1
        time.sleep(0.8)
        

# After the main loop: retry failed attempts with chunk_size = 1
error_df = df[df['mobilitydata_labelled'] == 'ERROR']

if not error_df.empty:
    print(f"Retrying {len(error_df)} ERROR rows with chunk_size = 1")

    for group_name, group_df in error_df.groupby('iteration_index'):
        print(f"Retrying group {group_name} with {len(group_df)} rows")
        for i in tqdm(range(0, len(group_df), 1)):  # chunk_size = 1
            if key_request_counter >= requests_per_key:
                current_key_index += 1
                key_request_counter = 0

                if current_key_index >= key_count:
                    elapsed = time.time() - cycle_start_time
                    if elapsed < 60:
                        wait_time = int(60 - elapsed)
                        print(f"Maximum requests per minute reached. Waiting {wait_time} seconds...")
                        time.sleep(wait_time + 1)
                    current_key_index = 0
                    cycle_start_time = time.time()

            CURRENT_API_KEY = API_KEYS[current_key_index]
            client = genai.Client(api_key=CURRENT_API_KEY)

            chunk_df = group_df.iloc[i:i + 1][relevant_columns]
            formatter = group_chunk_lines[str(group_name)]
            chunk_lines = chunk_df.apply(formatter, axis=1).tolist()

            prompt = "Handelt es sich bei folgendem Inhalt um Verkehrs- oder Mobilitätsdaten? Antworte nur mit T (True) oder F (False) Zeilenweise.\n\n" + "\n\n".join(chunk_lines)

            response = client.models.generate_content_stream(
                model="gemini-2.0-flash-lite-001",
                contents=[prompt],
                config=types.GenerateContentConfig(
                    max_output_tokens=5,
                    temperature=0
                )
            )

            result_text = ""
            for chunk in response:
                result_text += chunk.text

            predictions = result_text.strip().splitlines()

            if len(predictions) != 1:
                df.loc[chunk_df.index, 'mobilitydata_labelled'] = 'ERROR'
                continue

            df.loc[chunk_df.index, 'mobilitydata_labelled'] = predictions
            key_request_counter += 1
            time.sleep(0.8)

Processing group: 1 with 20213 entries


  0%|          | 0/4043 [00:00<?, ?it/s]

  3%|▎         | 137/4043 [02:59<1:24:41,  1.30s/it]

Maximum requests per minute reached. Waiting 1 seconds...


  5%|▍         | 184/4043 [04:01<1:23:12,  1.29s/it]

Maximum requests per minute reached. Waiting 0 seconds...


  6%|▌         | 229/4043 [05:00<1:22:50,  1.30s/it]

Maximum requests per minute reached. Waiting 1 seconds...


  7%|▋         | 274/4043 [06:01<1:24:48,  1.35s/it]

Maximum requests per minute reached. Waiting 1 seconds...


  8%|▊         | 319/4043 [07:02<1:19:54,  1.29s/it]

Maximum requests per minute reached. Waiting 0 seconds...


  9%|▉         | 364/4043 [08:02<1:19:36,  1.30s/it]

Maximum requests per minute reached. Waiting 1 seconds...


 11%|█▏        | 455/4043 [10:05<1:19:57,  1.34s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 14%|█▎        | 547/4043 [12:05<1:16:29,  1.31s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 16%|█▌        | 639/4043 [14:06<1:14:48,  1.32s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 17%|█▋        | 685/4043 [15:07<1:11:59,  1.29s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 18%|█▊        | 730/4043 [16:07<1:09:11,  1.25s/it]

Maximum requests per minute reached. Waiting 1 seconds...


 19%|█▉        | 775/4043 [17:08<1:10:02,  1.29s/it]

Maximum requests per minute reached. Waiting 1 seconds...


 20%|██        | 821/4043 [18:09<1:08:50,  1.28s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 23%|██▎       | 915/4043 [20:11<1:04:16,  1.23s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 24%|██▎       | 960/4043 [21:10<1:06:14,  1.29s/it]

Maximum requests per minute reached. Waiting 1 seconds...


 25%|██▍       | 1006/4043 [22:11<1:05:15,  1.29s/it]

Maximum requests per minute reached. Waiting 1 seconds...


 26%|██▌       | 1051/4043 [23:12<1:04:14,  1.29s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 27%|██▋       | 1096/4043 [24:12<1:03:54,  1.30s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 28%|██▊       | 1141/4043 [25:12<1:02:24,  1.29s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 29%|██▉       | 1187/4043 [26:13<1:04:48,  1.36s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 32%|███▏      | 1279/4043 [28:15<59:17,  1.29s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 33%|███▎      | 1325/4043 [29:16<59:37,  1.32s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 34%|███▍      | 1371/4043 [30:15<57:18,  1.29s/it]  

Maximum requests per minute reached. Waiting 1 seconds...


 35%|███▌      | 1418/4043 [31:17<58:22,  1.33s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 36%|███▌      | 1464/4043 [32:18<56:44,  1.32s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 37%|███▋      | 1509/4043 [33:17<54:54,  1.30s/it]  

Maximum requests per minute reached. Waiting 1 seconds...


 38%|███▊      | 1555/4043 [34:18<53:01,  1.28s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 40%|███▉      | 1600/4043 [35:18<52:37,  1.29s/it]  

Maximum requests per minute reached. Waiting 1 seconds...


 41%|████      | 1645/4043 [36:19<51:47,  1.30s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 42%|████▏     | 1690/4043 [37:19<52:02,  1.33s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 44%|████▍     | 1782/4043 [39:21<49:50,  1.32s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 46%|████▋     | 1874/4043 [41:22<46:35,  1.29s/it]  

Maximum requests per minute reached. Waiting 0 seconds...


 48%|████▊     | 1921/4043 [42:23<46:00,  1.30s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 51%|█████     | 2059/4043 [45:24<43:07,  1.30s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 52%|█████▏    | 2104/4043 [46:24<43:23,  1.34s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 58%|█████▊    | 2332/4043 [51:27<37:05,  1.30s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 59%|█████▉    | 2377/4043 [52:27<36:24,  1.31s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 63%|██████▎   | 2562/4043 [56:29<32:33,  1.32s/it]

Maximum requests per minute reached. Waiting 0 seconds...


 67%|██████▋   | 2711/4043 [59:46<29:22,  1.32s/it]


ClientError: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.0-flash-lite'}, 'quotaValue': '1500'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '49s'}]}}

In [43]:
# Write dataframe in new csv-File
df.to_csv(outputdata_file, index=False)

print(f'The file has been successfully saved as {outputdata_file}.')

The file has been successfully saved as data/merged_dataset_metadata_labelled.csv.
