# Export & Conversion to JSONL of LLM training data from Label Studio

**Purpose:** Convert a JSON-MIN export of Label Studio training data into JSONL for use in LLM training.
The output JSON format is customized for extracting sentiment and keywords from game reviews.

---
**Copyright (c) 2025 Michael Powers.**

# Imports

In [None]:
import json
import os
import pandas as pd

In [None]:
# Maps each output key of "negative_tracker" to the Label Studio field it is read from.
_NEGATIVE_TRACKER_FIELDS = (
    ("ad_game_mismatch", "nt_ad_mismatch"),
    ("game_cheating_manipulating", "nt_cheating_manipulating"),
    ("bugs_crashes_performance", "nt_bugs_crashes_performance"),
    ("monetization", "nt_monetization"),
    ("live_ops_events", "nt_live_ops_events"),
)


def _parse_bool_flag(value, default=None):
    """Convert a "true"/"false" choice value into a real boolean.

    Strings are compared case-insensitively against "true"; actual booleans
    are passed through; anything else (None, missing, other types) yields
    *default*.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        return value.strip().lower() == 'true'
    return default


def _split_keywords(raw):
    """Return the trimmed, non-empty, comma-separated keywords found in *raw*.

    *raw* may be a plain string, a dict of the form {'text': ...}, or a list
    of strings (a dict's 'text' value may itself be a string or a list).
    Unsupported or empty values produce an empty list.
    """
    if isinstance(raw, dict):
        raw = raw.get('text')

    if isinstance(raw, str):
        parts = [raw]
    elif isinstance(raw, (list, tuple)):
        parts = [part for part in raw if isinstance(part, str)]
    else:
        return []

    keywords = []
    for part in parts:
        keywords.extend(kw.strip() for kw in part.split(',') if kw.strip())
    return keywords


def convert_label_studio_to_training(label_studio_json_path, output_jsonl_path, from_n=0, to_n=None):
    """Convert a Label Studio JSON export into a JSONL training file.

    Each exported task becomes one JSON line holding the system prompt, the
    user review and the structured target output (sentiment, keyword lists
    and negative-tracker flags).

    Args:
        label_studio_json_path: Path of the JSON file to read (a list of task dicts).
        output_jsonl_path: Path of the JSONL file to write (overwritten).
        from_n: 0-based index of the first task to export (inclusive).
        to_n: 0-based index of the last task to export (inclusive), or None
            to export through the end. E.g. from_n=0, to_n=99 selects the
            first 100 tasks and from_n=100 selects everything after them.

    Returns:
        A pandas DataFrame with one row per exported sample.

    Tasks without a 'user_review' are skipped with a printed warning; they
    still occupy their position in the from_n/to_n numbering.
    """
    with open(label_studio_json_path, 'r', encoding='utf-8') as f:
        label_studio_raw_data = json.load(f)

    intermediate_data = []
    for index, task_data in enumerate(label_studio_raw_data):
        if index < from_n:
            continue
        if to_n is not None and index > to_n:
            # Everything past the upper bound is out of range; stop early.
            break

        # Extract original user review
        user_review = task_data.get('user_review')
        if not user_review:
            print(f"Warning: Skipping task {task_data.get('id')} due to missing 'user_review'.")
            continue

        structured_output = {
            "sentiment": {
                "overall": task_data.get('sentiment_overall'),
                # Unknown/missing sentiment flags stay None rather than False.
                "recommendation": _parse_bool_flag(task_data.get('sentiment_recommendation')),
                "warning_anti_recommendation": _parse_bool_flag(task_data.get('sentiment_warning')),
            },
            "specifics": {
                "positive_keywords": _split_keywords(task_data.get('specifics_positive_keywords', '')),
                "negative_keywords": _split_keywords(task_data.get('specifics_negative_keywords', '')),
            },
            # Negative-tracker flags default to False when absent.
            "negative_tracker": {
                output_key: _parse_bool_flag(task_data.get(source_key), default=False)
                for output_key, source_key in _NEGATIVE_TRACKER_FIELDS
            },
        }

        # Add the system prompt and user review
        intermediate_data.append({
            "system_prompt": task_data.get('system_prompt'),
            "user_review": user_review,
            "target_json_output": structured_output,
        })

    # Save as JSONL
    with open(output_jsonl_path, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry, ensure_ascii=False) + '\n' for entry in intermediate_data)

    print(f"Converted {len(intermediate_data)} samples to {output_jsonl_path}")

    return pd.DataFrame(intermediate_data)

# RUNNING the format conversion

In [None]:
# Input: the Label Studio export file. Output: the JSONL file that will hold the TEST DATA split.
label_studio_export_file = "./training data/label_studio_export.json"
output_jsonl_file = "./training data/test_data.jsonl"
# Unrestricted variant (no from_n/to_n limits), kept for reference:
#convert_label_studio_to_training(label_studio_export_file, output_jsonl_file)
convert_label_studio_to_training(label_studio_export_file, output_jsonl_file, from_n=0, to_n=99)
# USE from_n and to_n to export only a certain range of rows.
# The call above is intended to export the first 100 rows for saving as TEST DATA.
# NOTE(review): check the printed "Converted N samples" count to confirm the range
# boundaries behave as expected.
# To export JUST the TRAINING DATA, call the same function again with a different
# output_jsonl_file and from_n=100.