# Fine-tuned LLM Scoring

**Purpose:** Test fine-tuned LLMs with JSONL test data against a custom scoring function.
The notebook can test the fine-tuned LLM, a baseline model, and a Gemini model (a commercial baseline).
The scoring function is customized to evaluate a specialized JSON LLM response.
Scores can optionally be exported to a .csv file.

---
**Copyright (c) 2025 Michael Powers.**

# Imports

In [None]:
import json
import os
import asyncio
from typing import List, Dict, Any, Optional, Callable, Awaitable

import pandas as pd
from sklearn.metrics import f1_score, accuracy_score

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# LlamaIndex imports
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage, MessageRole, ChatResponse
from llama_index.core.evaluation import EvaluationResult, BaseEvaluator

# Google Generative AI direct import
import google.generativeai as genai

# Test Data

In [None]:
def load_my_jsonl_data(file_path) -> List[dict]:
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path of the file to read; every non-blank line must hold
            one JSON value.

    Returns:
        The parsed values, in file order. Blank or whitespace-only lines
        (for example a trailing newline at the end of the file) are skipped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    data = []
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # json.loads("") raises, so blank lines must be skipped.
                continue
            try:
                data.append(json.loads(stripped))
            except json.JSONDecodeError as err:
                # Same exception type as before, with the location added.
                raise json.JSONDecodeError(
                    f"{file_path}, line {line_number}: {err.msg}", err.doc, err.pos
                ) from err
    return data


# Prompts

In [None]:
def create_chat_messages(data_item: Dict) -> List[ChatMessage]:
    """Build the two-message conversation sent to the LLM for one test record.

    Args:
        data_item: A test record; its "system_prompt" and "user_review"
            entries become the SYSTEM and USER messages respectively.

    Returns:
        A list holding the SYSTEM message followed by the USER message.
    """
    system_message = ChatMessage(
        role=MessageRole.SYSTEM,
        content=data_item["system_prompt"],
    )
    user_message = ChatMessage(
        role=MessageRole.USER,
        content=data_item["user_review"],
    )
    return [system_message, user_message]

# LLM Callers

In [None]:
gemini_model = 'models/gemini-2.0-flash-lite'
# Placeholder only: substitute a real key locally and never commit it.
api_key="YOUR_API_KEY"
def ask_gemini_json(messages, gemini_model):
    """Send a system/user exchange to a Gemini model and return its JSON text.

    Args:
        messages: Chat messages. The content of the last SYSTEM message and
            of the last USER message are used; other roles are ignored.
        gemini_model: Gemini model name handed to ``genai.GenerativeModel``.

    Returns:
        ``response.text`` of the generation call, requested with the
        ``application/json`` response MIME type.
    """
    import google.generativeai as genai
    genai.configure(api_key=api_key)
    model = genai.GenerativeModel(gemini_model)
    generation_config = genai.GenerationConfig(response_mime_type="application/json")

    system_instruction = None
    user_message = None

    for msg in messages:
        if msg.role == MessageRole.SYSTEM:
            system_instruction = msg.content
        elif msg.role == MessageRole.USER:
            user_message = msg.content

    # Join only the parts that are present. Formatting a missing part into
    # the prompt would send the literal text "None" to the model. With both
    # parts present this yields exactly "<system>\n <user>", as before.
    prompt = "\n ".join(
        part for part in (system_instruction, user_message) if part is not None
    )

    response = model.generate_content(prompt, generation_config=generation_config)

    return response.text

In [None]:
def ask_llama3_json(messages, model_name):
    """Query an Ollama-served model in JSON mode and return the reply text.

    Each call assigns a new ``Ollama`` client to the global ``Settings.llm``.
    The last SYSTEM and last USER message contents are wrapped in
    ``<|start_header_id|>...<|eot_id|>`` markers before being handed to a
    ``SimpleChatEngine``; both wrapped strings are printed.

    Args:
        messages: Chat messages; only SYSTEM and USER roles are read.
        model_name: Model name passed to ``Ollama``.

    Returns:
        The ``response`` attribute of the chat engine's reply.
    """
    from llama_index.core.chat_engine import SimpleChatEngine
    from llama_index.core import Settings

    Settings.llm = Ollama(
        model=model_name,
        request_timeout=10000.0,
        temperature=0.0,
        json_mode=True,
    )

    system_text = ""
    user_text = ""
    for entry in messages:
        if entry.role == MessageRole.SYSTEM:
            system_text = entry.content
        if entry.role == MessageRole.USER:
            user_text = entry.content

    wrapped_system_prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|> {system_text} <|eot_id|>"
    wrapped_prompt = f"<|start_header_id|>user<|end_header_id|>{user_text}<|eot_id|>"

    # Always echo the exact text that is sent to the model.
    print(f'---System Prompt: {wrapped_system_prompt}')
    print(f'---prompt: {wrapped_prompt}')

    engine = SimpleChatEngine.from_defaults(system_prompt=wrapped_system_prompt)
    reply = engine.chat(wrapped_prompt)
    return reply.response

In [None]:
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# Build the embedding model once: it is kept in `embed_model` for the keyword
# scorer and also registered as the LlamaIndex-wide default.
embed_model = Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

# Evaluator

In [None]:
def calculate_f1_for_keywords_embedding_lenient(true_keywords_list, pred_keywords_list, embed_model=embed_model, similarity_threshold=0.7):
    """
    Calculates F1 score for keyword extraction with lenient matching using embeddings.

    Each predicted keyword is greedily paired with the most similar
    ground-truth keyword that has not been paired yet. The pair counts as a
    true positive when its cosine similarity reaches ``similarity_threshold``;
    otherwise the prediction is a false positive. Ground-truth keywords left
    unpaired are false negatives.

    Args:
        true_keywords_list (list | set): Ground truth keywords.
        pred_keywords_list (list | set): Predicted keywords.
        embed_model: Object exposing ``get_text_embedding(text)``; defaults to
            the module-level ``embed_model``.
        similarity_threshold (float): Minimum cosine similarity for a match.

    Returns:
        float: F1 score. 1.0 when both inputs are empty, 0.0 when exactly one
        is empty or when an embedding call raises.
    """
    if not true_keywords_list and not pred_keywords_list:
        return 1.0 # Both empty, perfect score for this case

    if not true_keywords_list and pred_keywords_list:
        print("not true_keywords_list and pred_keywords_list")
        return 0.0 # No true keywords but predictions, all false positives

    if true_keywords_list and not pred_keywords_list:
        print("true_keywords_list and not pred_keywords_list")
        return 0.0 # True keywords but no predictions, all false negatives

    def _as_ordered_list(keywords):
        # The greedy matching below depends on iteration order. Set order can
        # differ between interpreter runs, so sets are sorted to keep the
        # score reproducible; sequences keep the caller's order.
        if isinstance(keywords, (set, frozenset)):
            return sorted(keywords, key=str)
        return list(keywords)

    true_keywords = _as_ordered_list(true_keywords_list)
    pred_keywords = _as_ordered_list(pred_keywords_list)

    # Keywords are embedded one at a time (no batch call is used).
    try:
        true_vectors = [embed_model.get_text_embedding(kw) for kw in true_keywords]
        pred_vectors = [embed_model.get_text_embedding(kw) for kw in pred_keywords]
    except Exception as e:
        # Best-effort scoring: an embedding failure scores this case as 0.
        print(f"Error getting embeddings: {e}")
        return 0.0 # Handle cases where embedding fails

    # One call yields the full (n_pred, n_true) similarity matrix instead of
    # one sklearn call per keyword pair.
    similarity = cosine_similarity(np.array(pred_vectors), np.array(true_vectors))

    matched_true_indices = [False] * len(true_keywords)
    tp = 0
    fp = 0

    for pred_row in similarity:
        best_similarity = -1.0 # Cosine similarity ranges from -1 to 1
        best_true_idx = -1

        for j, sim in enumerate(pred_row):
            # Only still-unmatched true keywords may be claimed.
            if not matched_true_indices[j] and sim > best_similarity:
                best_similarity = sim
                best_true_idx = j

        # A true positive must claim an actual true keyword, so the index
        # check guards the case where every truth is already matched.
        if best_true_idx != -1 and best_similarity >= similarity_threshold:
            tp += 1
            matched_true_indices[best_true_idx] = True
        else:
            fp += 1

    fn = matched_true_indices.count(False)

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return f1

In [None]:
class EnhancedJsonOutputEvaluator(BaseEvaluator):
    """
    Scores an LLM's JSON output against a reference JSON.

    Three components are computed and blended with weights 0.4 / 0.3 / 0.3:
    a weighted F1 over the "sentiment" fields, the accuracy over the
    "negative_tracker" flags, and an embedding-lenient F1 over the
    "specifics" keyword lists.

    NOTE: ``aevaluate`` returns a 4-tuple
    ``(EvaluationResult, sentiment_f1, keywords_f1, flag_correctness)`` on
    every path, not a bare ``EvaluationResult``; the caller in this file
    unpacks that tuple.
    """

    # Boolean flags read from the "negative_tracker" object.
    NEGATIVE_FLAGS = (
        'ad_game_mismatch', 'game_cheating_manipulating',
        'bugs_crashes_performance', 'monetization', 'live_ops_events',
    )
    # Label universe handed to f1_score for the sentiment fields.
    SENTIMENT_LABELS = ['positive', 'negative', 'neutral', 'mixed', 'true', 'false']
    # Cosine-similarity threshold used for keyword matching.
    KEYWORD_SIMILARITY_THRESHOLD = 0.75
    # A result passes when its combined score is strictly above this value.
    PASSING_SCORE = 0.7

    def __init__(self, name: str = "EnhancedJsonOutputEvaluator"):
        super().__init__()
        # Kept so the constructor argument is no longer silently dropped.
        self.name = name

    def _get_prompts(self) -> Dict[str, Any]:
        """Return the evaluator's prompts; this evaluator has none."""
        return {}

    def _update_prompts(self, prompts_dict: Dict[str, Any]) -> None:
        """Update the evaluator's prompts; a no-op because there are none."""
        pass

    def _normalize_bool_to_string(self, value: Any) -> Optional[str]:
        """Map True/False or 'true'/'false' strings (any case, surrounding
        whitespace ignored) to 'true' or 'false'; anything else to None."""
        if value is True:
            return 'true'
        if value is False:
            return 'false'
        if isinstance(value, str):
            lower_val = value.strip().lower()
            if lower_val in ('true', 'false'):
                return lower_val
        return None

    @staticmethod
    def _normalize_label(value: Any) -> Optional[str]:
        """Lower-case and strip a string label; non-strings become None."""
        return value.strip().lower() if isinstance(value, str) else None

    @staticmethod
    def _strip_code_fence(text: str):
        """Remove a surrounding ```json or ``` fence.

        Returns ``(cleaned_text, was_fenced)``.
        """
        for opener in ("```json", "```"):
            if text.startswith(opener) and text.endswith("```"):
                return text[len(opener): -len("```")].strip(), True
        return text, False

    def _invalid_outcome(self, query, response, feedback, reason):
        """Build the zero-score 4-tuple returned when the response is unusable."""
        result = EvaluationResult(
            query=query,
            response=response,
            passing=False,
            score=0.0,
            feedback="\n".join(feedback),
            invalid_result=True,
            invalid_reason=reason,
        )
        flags = {f'nt_{flag}_correct': False for flag in self.NEGATIVE_FLAGS}
        return result, 0.0, 0.0, flags

    def _score_sentiment(self, reference_json, parsed_llm_output) -> float:
        """Weighted F1 over the overall / recommendation / warning fields."""
        true_sentiment = reference_json.get("sentiment", {})
        pred_sentiment = parsed_llm_output.get("sentiment", {})

        pairs = [
            (self._normalize_label(true_sentiment.get("overall")),
             self._normalize_label(pred_sentiment.get("overall"))),
            (self._normalize_bool_to_string(true_sentiment.get("recommendation")),
             self._normalize_bool_to_string(pred_sentiment.get("recommendation"))),
            (self._normalize_bool_to_string(true_sentiment.get("warning_anti_recommendation")),
             self._normalize_bool_to_string(pred_sentiment.get("warning_anti_recommendation"))),
        ]
        # A field is scored only when both sides yield a usable label; fields
        # missing on either side are skipped rather than counted as errors.
        # NOTE(review): earlier code built per-field weights (0.5/0.25/0.25)
        # but never passed them to f1_score. The fields stay equally weighted
        # here so scores remain comparable with previous runs.
        kept = [(t, p) for t, p in pairs if t is not None and p is not None]
        if not kept:
            return 0.0

        y_true = [t for t, _ in kept]
        y_pred = [p for _, p in kept]
        return float(f1_score(y_true, y_pred,
                              labels=self.SENTIMENT_LABELS,
                              average='weighted', zero_division=0))

    def _score_negative_tracker(self, reference_json, parsed_llm_output):
        """Return (accuracy over the flags, per-flag correctness dict)."""
        true_tracker = reference_json.get("negative_tracker", {})
        pred_tracker = parsed_llm_output.get("negative_tracker", {})

        correctness = {}
        true_labels = []
        pred_labels = []
        for flag in self.NEGATIVE_FLAGS:
            true_norm = self._normalize_bool_to_string(true_tracker.get(flag, False))
            pred_norm = self._normalize_bool_to_string(pred_tracker.get(flag, False))
            correctness[f'nt_{flag}_correct'] = (true_norm == pred_norm)
            # For the accuracy, anything that is not 'true' counts as 0.
            true_labels.append(int(true_norm == 'true'))
            pred_labels.append(int(pred_norm == 'true'))

        return float(accuracy_score(true_labels, pred_labels)), correctness

    def _score_keywords(self, reference_json, parsed_llm_output) -> float:
        """Mean of the lenient F1 for positive and for negative keywords."""
        true_specifics = reference_json.get("specifics", {})
        pred_specifics = parsed_llm_output.get("specifics", {})

        scores = []
        for field in ("positive_keywords", "negative_keywords"):
            # De-duplicate, then sort: the greedy matcher is order-dependent
            # and raw set order is not stable between runs.
            true_keywords = sorted(set(true_specifics.get(field, [])), key=str)
            pred_keywords = sorted(set(pred_specifics.get(field, [])), key=str)
            scores.append(calculate_f1_for_keywords_embedding_lenient(
                true_keywords,
                pred_keywords,
                embed_model,
                self.KEYWORD_SIMILARITY_THRESHOLD,
            ))
        return float(np.mean(scores))

    async def aevaluate(
        self,
        query: Optional[str] = None,
        response: Optional[str] = None,
        reference: Optional[str] = None, # The target JSON, serialized as a string
        **kwargs: Any,
    ) -> tuple:
        """
        Evaluate the generated JSON response against the reference JSON.

        Args:
            query: The user review; stored on the result.
            response: Raw LLM output; a surrounding ``` fence is tolerated.
            reference: The target JSON as a string (a dict is also accepted).

        Returns:
            ``(EvaluationResult, sentiment_f1, keywords_f1, flag_correctness)``.
            When ``response`` is missing, is not valid JSON, or is not a JSON
            object, the result is marked invalid, both scores are 0.0 and
            every flag is False.
        """
        feedback = []

        # 1. Validate JSON structure
        if response is None:
            reason = "Invalid JSON output: no response"
            feedback.append(reason)
            return self._invalid_outcome(query, response, feedback, reason)

        clean_response, was_fenced = self._strip_code_fence(response.strip())
        if was_fenced:
            feedback.append("json needed cleaning.")

        try:
            parsed_llm_output = json.loads(clean_response)
        except json.JSONDecodeError as e:
            reason = f"Invalid JSON output: {e}"
            feedback.append(reason)
            return self._invalid_outcome(query, response, feedback, reason)

        if not isinstance(parsed_llm_output, dict):
            # The field lookups below need a JSON object.
            reason = (
                "Invalid JSON output: expected a JSON object, got "
                f"{type(parsed_llm_output).__name__}"
            )
            feedback.append(reason)
            return self._invalid_outcome(query, response, feedback, reason)

        # 2. Score the three components
        reference_json = reference if isinstance(reference, dict) else json.loads(reference)

        sentiment_f1 = 0.0
        negative_tracker_f1 = 0.0
        keywords_f1 = 0.0
        # Pre-filled so every expected key exists even if scoring fails.
        review_flag_correctness = {
            f'nt_{flag}_correct': False for flag in self.NEGATIVE_FLAGS
        }

        # --- Overall Sentiment - Weighted F1 ---
        try:
            sentiment_f1 = self._score_sentiment(reference_json, parsed_llm_output)
            feedback.append(f"Overall Sentiment F1: {sentiment_f1:.2f}")
        except Exception as e:
            feedback.append(f"Error calculating sentiment F1: {e}")
            sentiment_f1 = 0.0
            print(f"DEBUG: Exception caught for sentiment F1: {e}")

        # --- Negative Tracker Flags - Accuracy ---
        try:
            negative_tracker_f1, flag_results = self._score_negative_tracker(
                reference_json, parsed_llm_output
            )
            review_flag_correctness.update(flag_results)
            feedback.append(f"Negative Tracker Accuracy: {negative_tracker_f1:.2f}")
        except Exception as e:
            feedback.append(f"Error calculating negative tracker: {e}")

        # --- Keyword Extraction - embedding-lenient F1 ---
        try:
            keywords_f1 = self._score_keywords(reference_json, parsed_llm_output)
            feedback.append(f"Keyword Extraction F1 (Set Matching): {keywords_f1:.2f}")
        except Exception as e:
            print(f"Error calculating keyword F1: {e}")
            feedback.append(f"Error calculating keyword F1: {e}")
            keywords_f1 = 0.0

        # 3. Blend the components
        overall_score = float(
            0.4 * sentiment_f1 +
            0.3 * negative_tracker_f1 +
            0.3 * keywords_f1
        )
        feedback.append(f"Combined Weighted F1: {overall_score:.2f}")
        print(f'Score: {overall_score}')

        eval_result = EvaluationResult(
            query=query,
            response=response,
            # Plain bool/float so no numpy scalar reaches the result model.
            passing=bool(overall_score > self.PASSING_SCORE),
            score=overall_score,
            feedback="\n".join(feedback),
            invalid_result=False,
            invalid_reason=None,
        )
        return eval_result, sentiment_f1, keywords_f1, review_flag_correctness
        


# Running a test

In [None]:
async def run_structured_evaluation(
    test_data: List[Dict],
    llm_callable: Callable[[List[ChatMessage], str], str],
    model_name_for_logging: str = "LLM",
    num_tests_to_run: Optional[int] = None,
    DISPLAY_RESULTS = False
) -> pd.DataFrame:
    """
    Runs evaluation for structured JSON output using a given LLM callable function.

    Args:
        test_data: Test records; each supplies "user_review",
            "target_json_output" and the prompt fields read by
            ``create_chat_messages``.
        llm_callable: Called synchronously as ``llm_callable(messages,
            model_name_for_logging)``; must return the raw response text.
        model_name_for_logging: Model name; passed to ``llm_callable`` and
            written to the "LLM" column.
        num_tests_to_run: If given, only the first N records are evaluated.
        DISPLAY_RESULTS: When true, print the review, target JSON and raw
            output of every case.

    Returns:
        One row per evaluated test case. A case that raises is recorded with
        a zero score and a "Runtime Error: ..." feedback instead of aborting
        the run.
    """
    flag_columns = [
        "nt_ad_game_mismatch_correct",
        "nt_game_cheating_manipulating_correct",
        "nt_bugs_crashes_performance_correct",
        "nt_monetization_correct",
        "nt_live_ops_events_correct",
    ]

    evaluator = EnhancedJsonOutputEvaluator()
    results = []

    print(f"\n--- Starting Evaluation for Model: {model_name_for_logging} ---")

    # Limit the number of tests if num_tests_to_run is specified
    data_to_evaluate = test_data[:num_tests_to_run] if num_tests_to_run is not None else test_data

    for i, item in enumerate(data_to_evaluate):
        print(f"\n--- Evaluating Test Case {i+1}/{len(data_to_evaluate)} ---")
        user_review = item["user_review"]
        target_json_output = item["target_json_output"]

        llm_output_text = ""
        try:
            # Inside the try so one malformed record yields an error row
            # instead of aborting the whole run.
            messages = create_chat_messages(item)

            if DISPLAY_RESULTS:
                print(f"Review: {user_review[:100]}...")
                print(f"Target JSON: {json.dumps(target_json_output, indent=2)}")

            # Call the passed llm_callable directly
            chat_response = llm_callable(messages, model_name_for_logging)
            llm_output_text = chat_response.strip()

            if DISPLAY_RESULTS:
                print(f"LLM Raw Output: {llm_output_text}")

            outcome = await evaluator.aevaluate(
                query=user_review,
                response=llm_output_text,
                reference=json.dumps(target_json_output)
            )
            if isinstance(outcome, tuple):
                eval_result, sentiment_f1, keywords_f1, review_flag_correctness = outcome
            else:
                # A bare EvaluationResult (no per-component scores) is
                # treated as all-zero components, not as an unpacking error.
                eval_result = outcome
                sentiment_f1 = 0.0
                keywords_f1 = 0.0
                review_flag_correctness = {}

            result_entry = {
                "LLM": model_name_for_logging,
                "test_case_id": i,
                "user_review": user_review,
                "target_json": target_json_output,
                "llm_raw_output": llm_output_text,
                "is_valid_json": not eval_result.invalid_result,
                "score": eval_result.score,
                "feedback": eval_result.feedback,
                "f1_overall_sentiment": sentiment_f1,
                "f1_keywords": keywords_f1,
            }
            # A flag missing from the evaluator's dict counts as incorrect.
            for column in flag_columns:
                result_entry[column] = review_flag_correctness.get(column, False)

            results.append(result_entry)

        except Exception as e:
            print(f"Error processing test case {i}: {e}")
            error_entry = {
                "LLM": model_name_for_logging,
                "test_case_id": i,
                "user_review": user_review,
                "target_json": target_json_output,
                "llm_raw_output": llm_output_text if llm_output_text else f"Error: {e}",
                "is_valid_json": False,
                "score": 0.0,
                "feedback": f"Runtime Error: {e}",
                "f1_overall_sentiment": 0.0,
                "f1_keywords": 0.0,
            }
            # False (not 0.0) keeps the flag columns boolean.
            for column in flag_columns:
                error_entry[column] = False
            results.append(error_entry)
    return pd.DataFrame(results)


# Final scoring of test data

In [None]:
#1. negative tracker scoring
#2. final overall score
def calculate_final_metrics(df: pd.DataFrame) -> dict:
    """
    Aggregate per-test-case results into run-level metrics.

    Args:
        df (pd.DataFrame): Per-test-case results. Columns read:
            - 'f1_overall_sentiment' (float)
            - 'f1_keywords' (float)
            - 'nt_ad_game_mismatch_correct'
            - 'nt_game_cheating_manipulating_correct'
            - 'nt_bugs_crashes_performance_correct'
            - 'nt_monetization_correct'
            - 'nt_live_ops_events_correct'

    Returns:
        dict: 'final_overall_score', 'final_f1_overall_sentiment',
        'final_f1_negative_tracker' and 'final_f1_keywords'. All four are 0.0
        (and a warning is printed) when ``df`` is empty.
    """
    if df.empty:
        print("Warning: Input DataFrame is empty. Returning default zero metrics.")
        return dict.fromkeys(
            (
                'final_overall_score',
                'final_f1_overall_sentiment',
                'final_f1_negative_tracker',
                'final_f1_keywords',
            ),
            0.0,
        )

    # Sentiment and keyword components: plain mean over the test cases.
    sentiment_component = df['f1_overall_sentiment'].mean()
    keywords_component = df['f1_keywords'].mean()

    # Negative-tracker component: the share of correct flags, pooled over
    # every flag of every test case (an accuracy, despite the key name).
    tracker_flags = (
        'ad_game_mismatch',
        'game_cheating_manipulating',
        'bugs_crashes_performance',
        'monetization',
        'live_ops_events',
    )
    tracker_columns = [f'nt_{flag}_correct' for flag in tracker_flags]
    pooled_flags = df[tracker_columns].values.flatten()
    if len(pooled_flags) > 0:
        tracker_component = np.mean(pooled_flags)
    else:
        tracker_component = 0.0

    # Blend: 0.4 sentiment, 0.3 negative tracker, 0.3 keywords.
    blended_score = (
        0.4 * sentiment_component +
        0.3 * tracker_component +
        0.3 * keywords_component
    )

    return {
        'final_overall_score': blended_score,
        'final_f1_overall_sentiment': sentiment_component,
        'final_f1_negative_tracker': tracker_component,
        'final_f1_keywords': keywords_component,
    }


# Load test data

In [None]:
# LOAD DATA ----------------------------------------------

# JSONL test set: one JSON record per line.
jsonl_file_path = "test_data_NEW.jsonl"
test_data_structured = load_my_jsonl_data(jsonl_file_path)
print('Loaded {} test cases.'.format(len(test_data_structured)))
#print("Example data:")
#print(test_data_structured[0])

# TESTING MY CODE

In [None]:
# Models available for a run, plus the number of test cases per run.
test_model = "hf.co/MrMike42/GameReview-llama3.1-8b-v9-Q4_K_M-GGUF:latest"  # fine-tuned model
base_model = "llama3.1:8b"  # baseline model
gemini_model = 'models/gemini-2.0-flash-lite'  # Gemini model
num_tests = 20  # passed as num_tests_to_run

In [None]:

#GEMINI
#test_results_df = await run_structured_evaluation(test_data_structured, ask_gemini_json, gemini_model, num_tests)

#LLAMA3
#test_results_df = await run_structured_evaluation(test_data_structured, ask_llama3_json, base_model, num_tests)

#FINETUNE
test_results_df = await run_structured_evaluation(test_data_structured, ask_llama3_json, test_model, num_tests, True)


results_df = test_results_df

test_final_metrics = calculate_final_metrics(results_df)
print('Results data:')
print(results_df[['test_case_id', 'is_valid_json',  'f1_overall_sentiment',  'f1_keywords']])
print(f'-----\nFinal results:\n')
print(test_final_metrics)
test_results_df.to_csv("test9_results.csv", index=False)


