# gemini annotation to jsonl


In [21]:
import json
import os
import google.generativeai as genai
from datetime import datetime
import pandas as pd
import time
import random

In [2]:
# Default Gemini model id for this notebook (previously used: 'models/gemini-2.0-flash-lite').
gemini_model = 'gemini-2.5-flash-lite-preview-06-17'

# SECURITY: the API key must never be committed to source. It is read from the
# environment instead (GEMINI_API_KEY first, then GOOGLE_API_KEY). The key that
# used to be hard-coded on this line should be treated as leaked and revoked.
# The value is None when neither variable is set.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")

In [3]:
# System prompt sent with every review: role description, task statement, and
# the schema the model is asked to follow. The pieces below are joined by
# implicit string concatenation into one single-line prompt.
base_prompt = (
    "You are an expert game review analyzer. "
    "Your task is to extract structured information from game reviews, "
    "outputting a precise JSON object with sentiment, specific keywords, "
    "and negative flags. "
    "Ensure the output is valid JSON, following this schema: "
    "{'sentiment': {'overall': 'positive|negative|neutral|mixed', "
    "'recommendation': true|false, "
    "'warning_anti_recommendation': true|false}, "
    "'specifics': {'positive_keywords': ['list', 'of', 'phrases'], "
    "'negative_keywords': ['list', 'of', 'phrases']}, "
    "'negative_tracker': {'ad_game_mismatch': true|false, "
    "'game_cheating_manipulating': true|false, "
    "'bugs_crashes_performance': true|false, "
    "'monetization': true|false, "
    "'live_ops_events': true|false}}"
)

In [4]:
def build_prompt(prompt, review):
    """Return *prompt* followed by a blank line and the review under a ``REVIEW:`` label."""
    template = '{}\n\nREVIEW: {}'
    return template.format(prompt, review)

In [5]:
def ask_gemini_json(prompt, use_json=True, model='models/gemini-2.0-flash-lite'):
    """Send *prompt* to a Gemini model and return the response text.

    Args:
        prompt: Full prompt passed to ``generate_content``.
        use_json: When true, the request carries a generation config with
            ``response_mime_type="application/json"``.
        model: Model identifier handed to ``genai.GenerativeModel``.

    Returns:
        The ``text`` attribute of the response.

    The client is configured on every call from the module-level ``api_key``.
    """
    import google.generativeai as genai

    genai.configure(api_key=api_key)
    client = genai.GenerativeModel(model)

    # Only attach a generation config when JSON output was requested.
    extra_kwargs = {}
    if use_json:
        extra_kwargs["generation_config"] = genai.GenerationConfig(
            response_mime_type="application/json"
        )

    return client.generate_content(prompt, **extra_kwargs).text

In [33]:
def response_to_jsonl(system_prompt, user_review, structured_output, output_jsonl_path):
    """Append one prompt/review/output record to a JSONL file.

    Args:
        system_prompt: Stored under the ``system_prompt`` key.
        user_review: Stored under the ``user_review`` key.
        structured_output: JSON text produced by the model. It is validated
            with ``json.loads`` but stored unchanged, as a string, under
            ``target_json_output``.
        output_jsonl_path: File to append to (opened in UTF-8 append mode).

    Returns:
        A one-row ``pandas.DataFrame`` holding the record, or ``None`` when
        *structured_output* is not valid JSON (nothing is written then).
    """
    # Validation only: the parsed value itself is not used.
    try:
        json.loads(structured_output)
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON string provided for LLM output: {structured_output}")
        return None

    record = {
        "system_prompt": system_prompt,
        "user_review": user_review,
        "target_json_output": structured_output,
    }

    # One JSON object per line; non-ASCII characters are written as-is.
    with open(output_jsonl_path, 'a', encoding='utf-8') as out_file:
        out_file.write(json.dumps(record, ensure_ascii=False) + '\n')

    print("Test data added")
    return pd.DataFrame([record])

In [40]:
def _load_progress(progress_filepath):
    """Return the snippet index to resume from, or 0 if no usable progress file exists."""
    try:
        with open(progress_filepath, 'r') as f:
            start_index = int(f.read().strip())
    except (FileNotFoundError, ValueError):
        # ValueError covers an empty or corrupted progress file.
        print("Progress file not found or empty. Starting from the beginning.")
        return 0
    print(f"Resuming. Starting from line {start_index} in the input file.")
    return start_index


def _save_progress(progress_filepath, next_index):
    """Record the index of the next snippet to process."""
    with open(progress_filepath, 'w') as prog_f:
        prog_f.write(str(next_index))


def _strip_code_fences(text):
    """Strip whitespace and a surrounding Markdown code fence (``` or ```json) from *text*."""
    text = text.strip()
    if text.startswith("```json") and text.endswith("```"):
        return text[len("```json"): -len("```")].strip()
    if text.startswith("```") and text.endswith("```"):
        return text[len("```"): -len("```")].strip()
    return text


def process_reviews_rate_limited(input_filepath="./game_reviews.txt", 
                                output_filepath="./predictions.json",
                                progress_filepath="./gamereview_progress.txt",
                                num_max=10, 
                                model='models/gemini-2.0-flash-lite',
                                rpm_limit=15):
    """Annotate review snippets with Gemini and append the results to a JSONL file.

    The input file is split on the ``---END_SNIPPET---`` delimiter. Each
    snippet is combined with ``base_prompt``, sent through ``ask_gemini_json``
    and stored via ``response_to_jsonl``. The index of the next snippet is
    written to *progress_filepath* after every snippet that is stored or given
    up on, so a later call resumes where this one stopped.

    Args:
        input_filepath: Text file with delimiter-separated reviews
            (read as UTF-8 — assumed to match how the files were written).
        output_filepath: JSONL file that ``response_to_jsonl`` appends to.
        progress_filepath: File holding the index of the next snippet.
        num_max: Maximum number of snippets to store in this call; values
            <= 0 disable the limit.
        model: Model identifier forwarded to ``ask_gemini_json``.
        rpm_limit: Maximum number of API requests per 60-second window.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    delimiter = "---END_SNIPPET---"
    MAX_RETRIES = 5
    BASE_SLEEP_TIME = 4.5

    start_from_line = _load_progress(progress_filepath)

    session_reviews_processed = 0
    print(f"Starting generation. Goal: Process up to {num_max} reviews.")

    # Counter for API calls made within the current 60-second window.
    requests_in_minute = 0
    start_time_minute = time.time()

    # Explicit encoding: the output is written as UTF-8, so read the same way
    # instead of depending on the platform's locale default.
    with open(input_filepath, "r", encoding="utf-8") as f:
        snippets = f.read().split(delimiter)

    for i, snippet in enumerate(snippets):
        if i < start_from_line:
            continue
        if num_max > 0 and session_reviews_processed >= num_max:
            print(f'Done: Reached processing limit of {num_max} original reviews for this session.')
            break
        if not snippet.strip():
            # Whitespace-only pieces (e.g. the tail after the last delimiter)
            # are not reviews; do not spend an API call on them.
            continue

        formatted_prompt = build_prompt(base_prompt, snippet)

        for attempt in range(1, MAX_RETRIES + 1):
            # Start a fresh window once 60 seconds have elapsed.
            current_time = time.time()
            if current_time - start_time_minute >= 60:
                requests_in_minute = 0
                start_time_minute = current_time

            if requests_in_minute >= rpm_limit:
                wait_time = 60 - (current_time - start_time_minute)
                print(f"Rate limit hit. Waiting for {wait_time:.2f} seconds...")
                time.sleep(wait_time + 1)
                requests_in_minute = 0
                start_time_minute = time.time()

            try:  # LLM CALL
                print(f'Making LLM Call: {session_reviews_processed}')
                # Count the request before sending it: calls that fail
                # (e.g. with a 429) still consume quota.
                requests_in_minute += 1
                response = ask_gemini_json(formatted_prompt, use_json=True, model=model)
                response = _strip_code_fences(response)

                # response_to_jsonl returns None when the text is not valid
                # JSON and nothing was written; treat that as a failed attempt.
                if response_to_jsonl(base_prompt, snippet, response, output_filepath) is None:
                    raise ValueError("model response was not valid JSON")
            except Exception as e:
                # Broad on purpose: this is the retry boundary for any API,
                # network or parsing failure of a single snippet.
                if attempt < MAX_RETRIES:
                    # Exponential backoff with a little jitter.
                    sleep_duration = BASE_SLEEP_TIME * (2 ** (attempt - 1)) + random.uniform(0, 1)
                    print(f"API Error for input line {i}: {e}. Retrying in {sleep_duration:.2f}s... (Attempt {attempt}/{MAX_RETRIES})")
                    time.sleep(sleep_duration)
                else:
                    # No point sleeping when there is no further attempt.
                    print(f"API Error for input line {i}: {e}. (Attempt {attempt}/{MAX_RETRIES})")
                continue

            # Success: record it and leave the retry loop.
            session_reviews_processed += 1
            _save_progress(progress_filepath, i + 1)
            break
        else:
            print(f"Failed to process review from input line {i} after {MAX_RETRIES} retries. Skipping.")
            # We still update the progress file to avoid getting stuck on a failing entry
            _save_progress(progress_filepath, i + 1)

    print("------DONE-----")


In [41]:
# Batch run: annotate each game's review file and append everything to one
# shared JSONL output. Each entry is
# (input file, progress file, max reviews for this run, model id).
#
# An earlier run (kept for reference, not executed) used
# "./pokemongo_reviews.txt" with "./synthetic_progress.txt", num_max=100 and
# model 'models/gemini-2.5-flash'.
shared_output_path = "./synthetic_testdata.jsonl"

batch_jobs = [
    # 'gemini-2.5-flash-lite-preview-06-17': 1000 reviews in total, 200 per game
    ("./bingoblitz_reviews.txt", "./progress_bingoblitz.txt", 200, 'gemini-2.5-flash-lite-preview-06-17'),
    ("./totalbattle_reviews.txt", "./progress_totalbattle.txt", 200, 'gemini-2.5-flash-lite-preview-06-17'),
    ("./jackpotparty_reviews.txt", "./progress_jackpot.txt", 200, 'gemini-2.5-flash-lite-preview-06-17'),
    ("./clashofclans_reviews.txt", "./progress_clashofclans.txt", 200, 'gemini-2.5-flash-lite-preview-06-17'),
    ("./cod_reviews.txt", "./progress_cod.txt", 200, 'gemini-2.5-flash-lite-preview-06-17'),
    # 'models/gemini-2.0-flash-lite': 200 reviews in total, 100 per game
    ("./roblox_reviews.txt", "./progress_roblox.txt", 100, 'models/gemini-2.0-flash-lite'),
    ("./royal_kingdom_reviews.txt", "./progress_royalkingdom.txt", 100, 'models/gemini-2.0-flash-lite'),
    # 'models/gemini-2.0-flash': 200 reviews
    ("./evony_reviews.txt", "./progress_evony.txt", 200, 'models/gemini-2.0-flash'),
    # 'models/gemini-2.5-flash': 200 reviews
    ("./freefire_reviews.txt", "./progress_freefire.txt", 200, 'models/gemini-2.5-flash'),
    # 'models/gemini-2.5-pro': 100 reviews
    ("./raid_reviews.txt", "./progress_raid.txt", 100, 'models/gemini-2.5-pro'),
]

for game_number, (reviews_path, progress_path, review_limit, model_name) in enumerate(batch_jobs, start=1):
    process_reviews_rate_limited(input_filepath=reviews_path,
                                 output_filepath=shared_output_path,
                                 progress_filepath=progress_path,
                                 num_max=review_limit,
                                 model=model_name)
    print(f'------------------GAME{game_number} DONE--------------')

print('DONE DONE')








Progress file not found or empty. Starting from the beginning.
Starting generation. Goal: Process up to 100 reviews.
Making LLM Call: 0
Test data added
Making LLM Call: 1
Test data added
Making LLM Call: 2
Test data added
Making LLM Call: 3
Test data added
Making LLM Call: 4
Test data added
Making LLM Call: 5
Test data added
Making LLM Call: 6
Test data added
Making LLM Call: 7
Test data added
Making LLM Call: 8
Test data added
Making LLM Call: 9
Test data added
Making LLM Call: 10
Test data added
Making LLM Call: 11
Test data added
Making LLM Call: 12
API Error for input line 12: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 53
}
]. Retrying in 4.54s... (Attempt 1/5)
Making LLM Call: 12
Test d