# Gemini-assisted prelabeling of LLM training data

**Purpose:** Use Gemini to prelabel LLM training data and store the output as JSON for import into Label Studio.
The prompt and JSON format are customized for extracting sentiment and keywords from game reviews.

---
**Copyright (c) 2025 Michael Powers.**

In [None]:
import json
import os
import random
import time

import google.generativeai as genai

# Gemini Config

In [None]:
# Name of the Gemini model intended for prelabeling. The helper functions in
# this file carry their own default model name and do not read this variable.
gemini_model = 'models/gemini-2.0-flash-lite'

# Prefer the GEMINI_API_KEY environment variable so a real key never has to be
# saved inside the notebook. When the variable is unset or empty, fall back to
# the placeholder below, which is not a real key and has to be replaced by hand.
api_key = os.environ.get("GEMINI_API_KEY") or "YOUR GEMINI API KEY"

# Prompt

In [None]:
# Instruction text placed in front of every review. It describes the analysis
# task and spells out the JSON schema the model is asked to follow. The pieces
# below are adjacent string literals, so they concatenate into one single-line
# string with no added separators.
base_prompt = (
    "You are an expert game review analyzer. "
    "Your task is to extract structured information from game reviews, "
    "outputting a precise JSON object with sentiment, specific keywords, "
    "and negative flags. "
    "Ensure the output is valid JSON, following this schema: "
    "{'sentiment': {'overall': 'positive|negative|neutral|mixed', "
    "'recommendation': true|false, "
    "'warning_anti_recommendation': true|false}, "
    "'specifics': {'positive_keywords': ['list', 'of', 'phrases'], "
    "'negative_keywords': ['list', 'of', 'phrases']}, "
    "'negative_tracker': {'ad_game_mismatch': true|false, "
    "'game_cheating_manipulating': true|false, "
    "'bugs_crashes_performance': true|false, "
    "'monetization': true|false, "
    "'live_ops_events': true|false}}"
)

# Helper Functions

In [None]:
def build_prompt(prompt, review):
    """Return the full prompt text for one review.

    The result is *prompt*, a blank line, and then *review* introduced by the
    label ``REVIEW: ``.
    """
    return "{}\n\nREVIEW: {}".format(prompt, review)

In [None]:
def ask_gemini_json(prompt, use_json=True, model='models/gemini-2.0-flash-lite'):
    """Send *prompt* to a Gemini model and return the response text.

    Args:
        prompt (str): The complete prompt to send.
        use_json (bool): When true, the request is made with
            ``response_mime_type="application/json"``; otherwise the request
            is made without a generation config.
        model (str): Name of the Gemini model to use.

    Returns:
        str: The ``text`` attribute of the response.

    The API key is read from the module-level ``api_key`` variable. Errors
    raised by the client library are not caught here.
    """
    # Kept as a local import so the function can run on its own.
    import google.generativeai as genai

    genai.configure(api_key=api_key)
    # Use a separate name for the client so the ``model`` argument (a string)
    # is not overwritten by the client object.
    client = genai.GenerativeModel(model)
    if not use_json:
        return client.generate_content(prompt).text

    json_config = genai.GenerationConfig(response_mime_type="application/json")
    return client.generate_content(prompt, generation_config=json_config).text

# Assemble into Label Studio compatible JSON

In [None]:
# One row per value extracted from the LLM output, in the order the results
# are emitted: (section key, field key, Label Studio from_name, kind).
# kind is "choice" (value passed through unchanged), "bool" (value mapped to
# the strings 'true'/'false') or "keywords" (value rendered as one text string).
_PREDICTION_FIELDS = (
    ("sentiment", "overall", "sentiment_overall", "choice"),
    ("sentiment", "recommendation", "sentiment_recommendation", "bool"),
    ("sentiment", "warning_anti_recommendation", "sentiment_warning", "bool"),
    ("specifics", "positive_keywords", "specifics_positive_keywords", "keywords"),
    ("specifics", "negative_keywords", "specifics_negative_keywords", "keywords"),
    ("negative_tracker", "ad_game_mismatch", "nt_ad_mismatch", "bool"),
    ("negative_tracker", "game_cheating_manipulating", "nt_cheating_manipulating", "bool"),
    ("negative_tracker", "bugs_crashes_performance", "nt_bugs_crashes_performance", "bool"),
    ("negative_tracker", "monetization", "nt_monetization", "bool"),
    ("negative_tracker", "live_ops_events", "nt_live_ops_events", "bool"),
)


def _bool_choice(val):
    """Map a flag from the LLM output to the choice string 'true' or 'false'."""
    if isinstance(val, str):
        # A non-empty string such as "false" is truthy, so strings are
        # compared by content instead of by truthiness.
        return 'true' if val.strip().lower() in ('true', 'yes', '1') else 'false'
    return 'true' if val else 'false'


def _keywords_text(val):
    """Render a keyword value as one comma-separated string, or None if unusable."""
    if isinstance(val, str):
        # Already a single string; joining it would split it into characters.
        return val
    if isinstance(val, (list, tuple)):
        return ", ".join(str(item) for item in val)
    return None


def create_label_studio_prediction(system_prompt_text, user_review_text, llm_output_json_string,
                                   model_version="your_llm_model_name_vX"):
    """
    Creates a Label Studio prediction structure from your LLM's output.

    Only fields that are present in the LLM output produce a result entry.
    A section that is missing or is not a JSON object is ignored, and a
    keyword field that is neither a list nor a string is skipped.

    Args:
        system_prompt_text (str): The system prompt text.
        user_review_text (str): The user review text.
        llm_output_json_string (str): The LLM's raw JSON string output.
        model_version (str): Value stored as the prediction's "model_version".

    Returns:
        dict: A dictionary formatted for Label Studio import, with data and predictions.
        None is returned (after printing an error) when the LLM output is not
        valid JSON or its top level is not a JSON object.
    """
    try:
        llm_parsed_output = json.loads(llm_output_json_string)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a non-string input such as None.
        print(f"Error: Invalid JSON string provided for LLM output: {llm_output_json_string}")
        return None

    if not isinstance(llm_parsed_output, dict):
        print(f"Error: Expected a JSON object for LLM output, got {type(llm_parsed_output).__name__}: {llm_output_json_string}")
        return None

    results = []
    for section_key, field_key, from_name, kind in _PREDICTION_FIELDS:
        section = llm_parsed_output.get(section_key)
        if not isinstance(section, dict) or field_key not in section:
            continue

        raw_value = section[field_key]
        if kind == "keywords":
            text = _keywords_text(raw_value)
            if text is None:
                continue
            result_type = "textarea"
            # NOTE(review): Label Studio's documented TextArea result appears to
            # use a list of strings for "text"; the plain string used here is
            # kept unchanged -- confirm that it imports as intended.
            value = {"text": text}
        else:
            choice = raw_value if kind == "choice" else _bool_choice(raw_value)
            result_type = "choices"
            value = {"choices": [choice]}

        results.append({
            "from_name": from_name,
            "to_name": "user_review_text",
            "type": result_type,
            "value": value
        })

    return {
        "data": {
            "system_prompt": system_prompt_text,
            "user_review": user_review_text
        },
        "predictions": [
            {
                "model_version": model_version,
                "result": results
            }
        ]
    }

# Write a Label Studio JSON Entry

In [None]:
def add_review_to_json(task, filepath="./tasks_with_predictions.json"):
    """Append one task to the JSON array stored in *filepath*.

    The file is created when it does not exist or is empty. If it holds a
    single JSON value that is not a list, that value becomes the first
    element of the new list.

    Args:
        task: The task to append. A falsy value (for example None) is
            reported and nothing is written.
        filepath (str): Path of the JSON file to update.

    Side effects:
        * If the existing file is not valid JSON it is moved to
          ``<filepath>.corrupt`` instead of being silently overwritten.
        * The new content is written to ``<filepath>.tmp`` and then moved
          over *filepath*, so a failure while serializing leaves the
          previous file untouched.
    """
    if not task:
        print("Prediction was empty")
        return

    data = []
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            # Keep the unreadable content for inspection rather than losing it.
            backup_path = f"{filepath}.corrupt"
            os.replace(filepath, backup_path)
            print(f"Warning: {filepath} is not valid JSON; moved it to {backup_path} and starting a new file")
            data = []
        else:
            if not isinstance(data, list):
                data = [data]

    data.append(task)

    # Opening the target with 'w' would truncate it before json.dump runs;
    # writing to a temporary file first avoids losing earlier entries.
    tmp_path = f"{filepath}.tmp"
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, filepath)
    print("--Prediction added to file")

In [None]:
def process_reviews(file_path="./game_reviews.txt", out_path="./tasks_with_predictions.json"):
    """Prelabel every review in *file_path* and append the results to *out_path*.

    The input file is split on the ``---END_SNIPPET---`` delimiter. Each
    non-blank segment is sent to the LLM together with ``base_prompt``, the
    response is converted with ``create_label_studio_prediction``, and the
    resulting task is appended with ``add_review_to_json``.

    Args:
        file_path (str): Text file containing the delimited reviews.
        out_path (str): JSON file that receives the tasks.

    This function makes one API call per review with no rate limiting and
    no retry.
    """
    delimiter = "---END_SNIPPET---"

    # NOTE(review): the file is read with the platform default encoding while
    # the output is written as UTF-8 -- confirm the encoding of the input file.
    with open(file_path, "r") as f:
        content = f.read()

    # A trailing delimiter (or two delimiters in a row) yields a segment with
    # no text; skip those so no API call is spent on an empty review. The
    # text of the remaining segments is passed on unchanged.
    snippets = [snippet for snippet in content.split(delimiter) if snippet.strip()]

    # Get the response from the LLM, put it in the right format, write it to file.
    for number, snippet in enumerate(snippets, start=1):
        print(f'Processing review: {number}')
        response = ask_gemini_json(build_prompt(base_prompt, snippet))
        task_with_prediction = create_label_studio_prediction(base_prompt, snippet, response)
        add_review_to_json(task_with_prediction, out_path)
    print("--------------------------------------------")
    print(f"Done processing file: {file_path}")


In [1]:
def process_reviews_rate_limited(input_filepath="./game_reviews.txt",
                                output_filepath="./predictions.json",
                                progress_filepath="./gamereview_progress.txt",
                                num_max=10,
                                model='models/gemini-2.0-flash-lite'):
    """Prelabel reviews with request throttling, retries and resumable progress.

    The input file is split on the ``---END_SNIPPET---`` delimiter. Segments
    are processed in order, starting at the index stored in
    *progress_filepath* (0 when that file is missing or does not contain an
    integer). After each processed or abandoned segment the index of the next
    segment is written back to the progress file.

    Args:
        input_filepath (str): Text file containing the delimited reviews.
        output_filepath (str): JSON file that receives the tasks.
        progress_filepath (str): File holding the index of the next segment.
        num_max (int): Maximum number of reviews to process in this call;
            a value of 0 or less disables the limit.
        model (str): Gemini model name passed to ``ask_gemini_json``.

    Throttling and retries:
        At most ``RPM_LIMIT`` requests are started per 60-second window. A
        failing attempt is retried up to ``MAX_RETRIES`` times with an
        exponentially growing sleep plus random jitter; after that the
        segment is skipped.
    """
    delimiter = "---END_SNIPPET---"
    RPM_LIMIT = 15
    MAX_RETRIES = 5
    BASE_SLEEP_TIME = 4.5
    start_from_line = 0

    try:
        with open(progress_filepath, 'r') as f:
            start_from_line = int(f.read().strip())
        print(f"Resuming. Starting from line {start_from_line} in the input file.")
    except (FileNotFoundError, ValueError):
        # ValueError: the file exists but is empty or does not hold an integer.
        print("Progress file not found or empty. Starting from the beginning.")

    session_reviews_processed = 0
    print(f"Starting generation. Goal: Process up to {num_max} reviews.")

    # Counter for API calls made within the current minute
    requests_in_minute = 0
    start_time_minute = time.time()

    with open(input_filepath, "r") as f:
        content = f.read()
    snippets = content.split(delimiter)

    for i, snippet in enumerate(snippets):
        if i < start_from_line:
            continue
        if not snippet.strip():
            # Blank segment (for example after a trailing delimiter): there is
            # no review to label. Indices still refer to the unfiltered list,
            # so the stored progress stays valid.
            continue
        if num_max > 0 and session_reviews_processed >= num_max:
            print(f'Done: Reached processing limit of {num_max} original reviews for this session.')
            break

        formatted_prompt = build_prompt(base_prompt, snippet)

        for attempt in range(1, MAX_RETRIES + 1):
            # Check RPM limit
            current_time = time.time()
            if current_time - start_time_minute >= 60:
                requests_in_minute = 0
                start_time_minute = current_time

            if requests_in_minute >= RPM_LIMIT:
                wait_time = 60 - (current_time - start_time_minute)
                print(f"Rate limit hit. Waiting for {wait_time:.2f} seconds...")
                time.sleep(wait_time + 1)
                requests_in_minute = 0
                start_time_minute = time.time()

            try:  # LLM CALL
                print(f'Making LLM Call: {session_reviews_processed}')
                # Count the request before it is sent so that failed attempts
                # are also charged against the per-minute budget.
                requests_in_minute += 1
                response = ask_gemini_json(formatted_prompt, use_json=True, model=model)

                # Clean the response string: drop a surrounding Markdown code fence
                response = response.strip()
                if response.startswith("```json") and response.endswith("```"):
                    response = response[len("```json"): -len("```")].strip()
                elif response.startswith("```") and response.endswith("```"):
                    response = response[len("```"): -len("```")].strip()

                task_with_prediction = create_label_studio_prediction(base_prompt, snippet, response)
                add_review_to_json(task_with_prediction, output_filepath)
                session_reviews_processed += 1
                with open(progress_filepath, 'w') as prog_f:
                    prog_f.write(str(i + 1))

                break  # Success, leave the retry loop
            except Exception as e:
                # Best-effort boundary: any failure of this attempt is reported
                # and retried after a backoff.
                sleep_duration = BASE_SLEEP_TIME * (2 ** (attempt - 1)) + random.uniform(0, 1)
                print(f"API Error for input line {i}: {e}. Retrying in {sleep_duration:.2f}s... (Attempt {attempt}/{MAX_RETRIES})")
                time.sleep(sleep_duration)
        else:
            # Reached only when every attempt failed (the loop never hit break).
            print(f"Failed to process review from input line {i} after {MAX_RETRIES} retries. Skipping.")
            # We still update the progress file to avoid getting stuck on a failing entry
            with open(progress_filepath, 'w') as prog_f:
                prog_f.write(str(i + 1))

    print("------DONE-----")


# Run to get first-pass reviews

**Note: This code does not handle rate limiting. Use `process_reviews_rate_limited` when API rate limits apply.**

In [None]:
# Label every review in the input file and write the tasks for Label Studio import.
process_reviews(
    file_path="./game_reviews.txt",
    out_path="./predictions_for_label_studio_import.json",
)