In [None]:
import google.generativeai as genai
from openai import OpenAI
import json
import time
import pandas as pd
import os
import re
import anthropic
import csv

In [None]:
# API credentials. Each key is read from the environment variable of the same
# name when it is set, so real secrets never have to be typed into (or
# committed with) the notebook. The "YOUR_KEY" placeholder is kept as the
# fallback so the notebook behaves exactly as before when nothing is exported.
GROK_KEY = os.environ.get("GROK_KEY", "YOUR_KEY")
CHAT_GPT_KEY = os.environ.get("CHAT_GPT_KEY", "YOUR_KEY")
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "YOUR_KEY")
CLAUDE_KEY = os.environ.get("CLAUDE_KEY", "YOUR_KEY")

# Output file names used by this notebook.
ANSWERS_JSON = "answers.json"
FINAL_RESULTS_JSON = "gemini_judge.json"

In [None]:
import os, json

# Directory that holds one JSON file per video.
DATASET_DIR = "dataset"

# Parsed content of every *.json file found in DATASET_DIR.
youtube_data = []

# os.listdir() returns entries in arbitrary order; sort the names so the order
# of `youtube_data` is the same on every run and on every machine.
for fname in sorted(os.listdir(DATASET_DIR)):
    if fname.endswith(".json"):
        with open(os.path.join(DATASET_DIR, fname), encoding="utf-8") as f:
            youtube_data.append(json.load(f))


In [1]:
import csv
import os
import time
import json
import regex as re
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


In [None]:
# JSON file that judge results are appended to. It is the default
# `final_json_path` of gemini_judge_single / gemini_judge_all.
# NOTE: this rebinds the FINAL_RESULTS_JSON name assigned in the earlier
# configuration cell ("gemini_judge.json").
FINAL_RESULTS_JSON = "../results/example_final_results.json"
# CSV file read by gemini_judge_all (its default `csv_path`).
CSV_PATH = "../answers/answers.csv"

def parse_json_safe(text):
    """Extract and decode the JSON object embedded in ``text``.

    The candidate span runs from the first ``{`` to the last ``}`` in the
    text. Decoding starts at that first brace and stops at the end of the
    first complete JSON value, so trailing prose that happens to contain
    braces (e.g. ``{"a": 1} -- see {notes}``) no longer makes parsing fail.

    Args:
        text: Text expected to contain a JSON object.

    Returns:
        The decoded object on success. Otherwise a dict with an ``"error"``
        message and the original input under ``"raw"``; this function never
        raises.
    """
    try:
        match = re.search(r'\{.*\}', text, re.DOTALL)
        if match:
            # raw_decode() parses one JSON value from the start of the string
            # and ignores whatever follows it, unlike json.loads(), which
            # rejects any trailing non-whitespace ("Extra data").
            parsed, _end = json.JSONDecoder().raw_decode(match.group())
            return parsed
        return {"error": "Không tìm thấy cấu trúc JSON", "raw": text}
    except Exception as e:
        return {"error": f"Lỗi parse: {str(e)}", "raw": text}

def clean_text(text):
    """Strip simple Markdown decoration from ``text``.

    Removes every ``**``, every run of ``#`` followed by one whitespace
    character, and leading ``*`` / ``-`` bullet markers at the start of a
    line, then trims surrounding whitespace. Non-string input is returned
    unchanged.
    """
    if isinstance(text, str):
        cleaned = text.replace("**", "")
        # (pattern, flags) pairs applied in order: heading hashes, then bullets.
        substitutions = (
            (r'#+\s', 0),
            (r'^\s*[\*\-]\s+', re.MULTILINE),
        )
        for pattern, flags in substitutions:
            cleaned = re.sub(pattern, '', cleaned, flags=flags)
        return cleaned.strip()
    return text

def call_gemini(prompt_text, max_retries=10, retry_delay=20):
    """Send ``prompt_text`` to the Gemini judge model and return its text reply.

    Errors whose message contains "429" are treated as rate limiting and are
    retried after ``retry_delay`` seconds. The original loop retried forever;
    retries are now capped so a permanently exhausted quota cannot hang the
    notebook.

    Args:
        prompt_text: Prompt passed to ``generate_content``.
        max_retries: Maximum number of retries after the first attempt when a
            "429" error is seen. Pass ``None`` to retry without limit (the
            previous behaviour).
        retry_delay: Seconds to sleep before each retry.

    Returns:
        The response text, or the string ``"Error Gemini Judge: <error>"`` when
        the call fails with a non-429 error or the retry budget is used up.
        This function never raises.
    """
    retries_done = 0
    while True:
        try:
            genai.configure(api_key = GEMINI_API_KEY)
            # Judge model. Replace the name below to use another Gemini model
            # (for example 'gemini-3-flash-preview').
            model = genai.GenerativeModel('gemini-3-pro-preview')
            response = model.generate_content(prompt_text)
            return response.text
        except Exception as e:
            can_retry = max_retries is None or retries_done < max_retries
            if "429" in str(e) and can_retry:
                retries_done += 1
                print(f"Wait {retry_delay}s ....")
                time.sleep(retry_delay)
                continue
            return f"Error Gemini Judge: {e}"

def call_gpt(prompt_text):
    """Send ``prompt_text`` as a single user message to the "gpt-5-nano" model.

    Returns the content of the first choice in the response. Any exception
    raised while building the client, sending the request or reading the reply
    is converted into the string ``"Error ChatGPT: <error>"``.
    """
    try:
        chat_messages = [{"role": "user", "content": prompt_text}]
        completion = OpenAI(api_key=CHAT_GPT_KEY).chat.completions.create(
            model="gpt-5-nano",
            messages=chat_messages,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        return f"Error ChatGPT: {err}"

def call_grok(prompt_text):
    """Send ``prompt_text`` as a single user message to "grok-4-1-fast-reasoning".

    The request goes through the OpenAI client pointed at
    ``https://api.x.ai/v1``. Returns the content of the first choice; any
    exception is converted into the string ``"Error Grok: <error>"``.
    """
    try:
        xai_client = OpenAI(api_key=GROK_KEY, base_url="https://api.x.ai/v1")
        user_turn = {"role": "user", "content": prompt_text}
        completion = xai_client.chat.completions.create(
            model="grok-4-1-fast-reasoning",
            messages=[user_turn],
        )
        return completion.choices[0].message.content
    except Exception as err:
        return f"Error Grok: {err}"

def call_claude(prompt_text):
    """Send ``prompt_text`` as a single user message to "claude-3-5-haiku-20241022".

    The reply is limited to 1024 tokens. Returns the text of the first content
    item in the response; any exception is converted into the string
    ``"Error Claude: <error>"``.
    """
    try:
        request = {
            "model": "claude-3-5-haiku-20241022",
            "max_tokens": 1024,
            "messages": [{"role": "user", "content": prompt_text}],
        }
        reply = anthropic.Anthropic(api_key=CLAUDE_KEY).messages.create(**request)
        return reply.content[0].text
    except Exception as err:
        return f"Error Claude: {err}"

def save_json_data(filename, new_entry):
    """Append ``new_entry`` to the JSON list stored in ``filename``.

    The file is created when it does not exist. If the existing content cannot
    be decoded, or decodes to something other than a list, it is discarded and
    the file is rewritten as a one-element list (best effort, matching
    ``append_json``).

    Args:
        filename: Path of the JSON file to update.
        new_entry: JSON-serialisable object to append.
    """
    data = []
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            try:
                loaded = json.load(f)
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError are both
                # ValueError subclasses. A bare `except:` would also swallow
                # KeyboardInterrupt and SystemExit.
                loaded = []
        # A non-list payload (e.g. a dict) has no usable .append(); start over.
        if isinstance(loaded, list):
            data = loaded
    data.append(new_entry)
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)



def append_json(path, obj):
    """Append ``obj`` to the JSON list stored at ``path``, creating the file if needed.

    Missing parent directories are created. Existing content that cannot be
    read, or that is not a JSON list, is replaced by a fresh list.

    Args:
        path: Destination file; may be a bare file name or include directories.
        obj: JSON-serialisable object to append.
    """
    parent_dir = os.path.dirname(path)
    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    data = []
    if os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
                if not isinstance(data, list):
                    data = []
        except Exception:
            # Best effort: an unreadable or corrupt file is overwritten.
            data = []
    data.append(obj)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def _get_field(row, candidates):
    """Trả về giá trị trường trong row theo danh sách tên khả dĩ."""
    for k in candidates:
        if k in row and row[k] is not None:
            return row[k].strip()
    return ""


def build_judge_prompt(number, question_text, answer_a, answer_b, answer_c):
    """Build the judge prompt for one question.

    The study description and scoring rubric are fixed text; only ``number``,
    ``question_text`` and the three answers are substituted in.

    Prompt-text fixes relative to the previous version:
      * the compared model is named "Kimi K2" (was the typo "Like K2");
      * the introductory sentence of the Coverage section asks about coverage
        (it was a copy of the relevance sentence);
      * the third pointwise rubric (question 12) is headed "Coverage"
        (it was headed "Relevance" a second time).

    Args:
        number: Question number shown to the judge.
        question_text: The query being evaluated.
        answer_a: Response labelled "A".
        answer_b: Response labelled "B".
        answer_c: Response labelled "C".

    Returns:
        The complete prompt string.
    """
    return f"""
I'm doing scientific research, I'll describe the research, please help me be a judge to grade it. Description:
Objective. This project aims to evaluate the performance of large language models (LLMs) in responding to real-world queries based on YouTube comments, particularly in the context of online educational videos.
Context. Viewer comments on YouTube not only express emotions but also reflect learners’ understanding, interests, and learning experiences. Querying and synthesizing information from these comments can help educators and content managers better capture learner feedback and improve instructional quality.
Data. The queries were constructed based on user comments from 9 videos in the Stanford CS230: Deep Learning (Autumn 2025), 
Methodology. The study uses a set of 80 queries representing diverse tasks such as: Information extraction, Sentiment analysis, Topic identification, Inference, Summarization and judgment, etc.
The three LLMs being compared are:
ChatGPT 5.1, Kimi K2, and Grok 4.1 (however, to ensure fairness, I will hide the names of the responding models and replace them with random A, B, C)
Each model's responses are evaluated according to three key criteria:
Accuracy - Correctness of information and avoidance of unsupported hallucinations
Relevance - Alignment with the query's intent and requirements
Coverage - Breadth and completeness in addressing key aspects mentioned or implied
✍️ How to Evaluate
Each response will be evaluated using two methods:
Pairwise Comparison: Choose the better response between two models
Pointwise scoring: Rate each model independently on a scale from 1 to 10
Now I will provide comments on each of the 9 videos. You can start when you have commented on all 9 and I signal you to rate them.
—--------




Now let's begin the scoring. Based on the detailed data from the 9 videos I've provided, please act as the judge and score the following 12 questions:
Part 1: Pair Matching - Choose the better response between two models
Question {number}: {question_text}
Answer A [{answer_a}];
Answer B [{answer_b}];
Answer C:[{answer_c}].
Accuracy
Accuracy assesses the extent to which answers accurately reflect the information contained in the comments, avoiding false information, unfounded speculation, and fabricated content not supported by comment data.
1. Based on the two answers to question {number}, if considering accuracy alone, which answer, A or B, more accurately reflects the information in the comments, with fewer errors or unfounded speculations?
☐ A 
☐ B 
☐ A and B are tied
2. Based on the two answers to question {number}, if considering accuracy alone, which answer, A or C, more accurately reflects the information in the comments, with fewer errors or unfounded speculations?
☐ A 
☐ C 
☐ A and C are tied
3. Based on the two answers to question {number}, considering accuracy alone, which answer, B or C, more accurately reflects the information in the comments, with fewer errors or unfounded speculations?
☐ B 
☐ C
☐ B and C are tied
Relevance
Relevance assesses the degree to which an answer closely addresses the query's requirements. An answer is considered highly relevant if it directly addresses the core question, is concise, doesn't stray from the topic, and doesn't omit any key elements of the query.
Based on the two answers to question {number}, considering only their relevance, which answer more accurately addresses the core of the query, avoiding digressions or omissions of the main point?
4. Based on the two answers to question {number}, considering only their relevance, which answer, A or B, addresses the core of the query more accurately, avoiding digressions or omissions of the main point?
☐ A 
☐ B 
☐ A and B are tied
5. Based on the two answers to question {number}, considering only their relevance, which answer, A or C, addresses the core of the query more accurately, avoiding digressions or omissions of the main point?
☐ A 
☐ C
☐ A and C are tied
6. Based on the two answers to question {number}, considering only their relevance, which answer, B or C, addresses the core of the query more accurately, avoiding digressions or omissions of the main point?
☐ B 
☐ C
☐ B and C are tied
3. Coverage
Coverage assesses the extent to which an answer fully addresses the important aspects requested in the query. A highly comprehensive answer will synthesize multiple dimensions of information contained in the comments, without omitting any key points or necessary aspects.
Based on the two answers to question {number}, considering only the coverage, which answer provides a more complete picture of the key aspects of the query?
7. Based on the two answers to question {number}, considering only the coverage, which answer, A or B, provides a more complete picture of the key aspects of the query?
☐ A 
☐ B 
☐ A and B are tied
8. Based on the two answers to question {number}, considering only the coverage, which answer, A or C, provides a more complete picture of the key aspects of the query?
☐ A 
☐ C
☐ A and C are tied
9. Based on the two answers to question {number}, considering only the coverage, which answer, B or C, provides a more complete picture of the key aspects of the query?
☐ B 
☐ C
☐ B and C are tied
Part 2: Scoring on a 10-point scale
Accuracy
10. To what extent do the answers accurately reflect the information contained in the comments, and do they avoid misinformation or unfounded speculation?
Score, Description of criteria
1 – 2
Serious discrepancies:The answer contains largely false, fabricated (hallucination), or directly contradictory information that contradicts the input data. It is harmful to or misleads the user.
3 – 4
Weak:While some information is correct, it is mixed with a lot of misinformation or unfounded speculation. Users cannot trust this answer without verifying it.
5 – 6
Medium:The basic information is correct, but there are some minor errors in figures, proper names, or supplementary details. There are no serious errors in terms of logic/background knowledge.
7 – 8
Good:The information is accurate and reliable. There are no factual errors. Any inferences (if any) are logically based on the data.
9 – 10
Excellent:Absolutely 100% accurate. Every statement is true. There is no ambiguity whatsoever regarding its correctness.
Please assign points in order: How many points for A, how many points for B, and how many points for C?
Relevance
11. To what extent does the answer closely address the query's requirements? Does it stay focused, avoiding rambling or omitting key points?
Score, Description of criteria
1 – 2
Digress:The answer is irrelevant to the question or addresses a different question altogether. Completely useless.
3 – 4
Weak:The topic is mentioned but the response is roundabout, contains too much unnecessary information (filler words), or repeats the question verbatim without addressing the issue.
5 – 6
Medium:The answers are reasonably focused, but there are still rambling passages or disorganized sentence structures that make it difficult for the reader to grasp the main points.
7 – 8
Good:It directly answers the question. The structure is clear and easy to understand. It eliminates most of the distracting information.
9 – 10
Excellent:The answer is concise, brief, yet valuable. It gets straight to the point from the first sentence. The formatting (bullet point, bold/light) is optimized for readability.
Please assign points in order: How many points for A, how many points for B, and how many points for C?
Coverage
12. To what extent does the answer address all the important aspects raised in the query? Are there any significant points omitted?
Score, Description of criteria
1 – 2
Very flawed:Ignoring most of the main requirements of the question. Only answering a very small or insignificant part of it.
3 – 4
Lack:Omitting at least one important/core aspect of the question. For example: Asking about advantages and disadvantages but only mentioning the advantages.
5 – 6
Medium:The main points are mentioned, but the discussion lacks depth and is superficial. It is missing necessary supporting details or illustrative examples.
7 – 8
Good:The response addresses all aspects of the query, leaving no key points unanswered. The depth of the response is good.
9 – 10
Excellent:Comprehensive and thorough. It not only answers all the questions but also provides context, insightful perspectives, or exceptions (if needed). Exceeds expectations in terms of detail.
Please assign points in order: How many points for A, how many points for B, and how many points for C?
Please help me answer the following 12 questions."""
    
def gemini_judge_single(row, index, youtube_data=None, final_json_path=FINAL_RESULTS_JSON):
    """Judge one CSV row: build the prompt, call Gemini, store and return the result.

    Args:
        row: Mapping of CSV column name to value for one question.
        index: Row position, used as the question number when the row has no
            number column.
        youtube_data: Accepted for interface compatibility; not read here.
        final_json_path: JSON list file the result item is appended to.

    Returns:
        Dict with the question number, question text, the three answers, the
        raw judge reply and the cleaned reply split into lines.
    """
    # Column-name variants accepted for each answer label.
    answer_columns = (
        ("A", ['GPT 5.1', 'GPT5.1', 'GPT_5.1', 'GPT 5.1 ']),
        ("B", ['Grok 4.1', 'Grok4.1', 'Grok_4.1', 'Grok 4.1 ']),
        ("C", ['Kimi K2', 'KimiK2', 'Kimi_K2', 'Kimi K2 ' , 'Kimi']),
    )

    number = _get_field(row, ['number', 'Number', 'No', 'no']) or str(index)
    question_text = _get_field(
        row, ['Question (English)', 'Question', 'question', 'Question_English']
    ) or ""
    answers = {label: _get_field(row, names) or "" for label, names in answer_columns}

    judge_prompt = build_judge_prompt(
        number, question_text, answers["A"], answers["B"], answers["C"]
    )
    print(judge_prompt)

    try:
        raw = call_gemini(judge_prompt)
    except Exception as e:
        print(f"[ERROR] call_gemini failed for row {index} (question {number}): {e}")
        raw = f"CALL_GEMINI_ERROR: {e}"

    clean_result = clean_text(raw) if raw else ""

    item = {
        "question_number": number,
        "question": question_text,
        "answers": answers,
        "raw_judge": raw,
        "clean_judge_lines": clean_result.splitlines() if clean_result else [],
    }

    # A failed write is reported but does not stop the run.
    try:
        append_json(final_json_path, item)
    except Exception as e:
        print(f"[WARN] Failed to append result for question {number}: {e}")

    # Short pause between consecutive judge calls.
    time.sleep(1)
    return item

def gemini_judge_all(csv_path=CSV_PATH, youtube_data=None, final_json_path=FINAL_RESULTS_JSON, max_rows=2):
    """Run the Gemini judge over the rows of the answers CSV.

    Args:
        csv_path: CSV file with one question per row.
        youtube_data: Forwarded unchanged to ``gemini_judge_single``.
        final_json_path: JSON list file each result is appended to.
        max_rows: Maximum number of rows to judge, counted from the top of the
            CSV. The default of 2 keeps the previous hard-coded testing cap;
            pass ``None`` to process every row.

    Returns:
        List of result items, one per processed row, or ``None`` when the CSV
        has no data rows.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    with open(csv_path, newline='', encoding='utf-8') as csvfile:
        rows = list(csv.DictReader(csvfile))

    total = len(rows)
    if total == 0:
        print("CSV empty - no rows to process.")
        return

    print(f"Loaded {total} rows from {csv_path}. Starting to call gemini for each row...")

    # Apply the optional cap; a negative value is treated as 0 rather than
    # being interpreted as a "drop the last N rows" slice.
    selected = rows if max_rows is None else rows[:max(max_rows, 0)]
    planned = len(selected)
    if planned < total:
        print(f"Limiting this run to the first {planned} rows (max_rows={max_rows}).")

    results = []
    for i, row in enumerate(selected, start=1):
        # Report progress against the rows that will actually be processed,
        # not against the full CSV length.
        print(f"Processing ({i}/{planned}) ...")
        item = gemini_judge_single(row, i, youtube_data=youtube_data, final_json_path=final_json_path)
        results.append(item)
    print(f"Done. Processed {len(results)} rows. Results saved to {final_json_path}")
    return results

# Entry point: run the judge over CSV_PATH and append the results to
# FINAL_RESULTS_JSON.
if __name__ == "__main__":
    # NOTE(review): this rebinds the module-level `youtube_data` to None,
    # replacing whatever the dataset-loading cell stored in it. The value is
    # passed through to gemini_judge_single, which does not read it.
    youtube_data = None
    gemini_judge_all(csv_path=CSV_PATH, youtube_data=youtube_data, final_json_path=FINAL_RESULTS_JSON)

Loaded 80 rows from ../answers/answers.csv. Starting to call gemini for each row...
Processing (1/80) ...

I'm doing scientific research, I'll describe the research, please help me be a judge to grade it. Description:
Objective. This project aims to evaluate the performance of large language models (LLMs) in responding to real-world queries based on YouTube comments, particularly in the context of online educational videos.
Context. Viewer comments on YouTube not only express emotions but also reflect learners’ understanding, interests, and learning experiences. Querying and synthesizing information from these comments can help educators and content managers better capture learner feedback and improve instructional quality.
Data. The queries were constructed based on user comments from 9 videos in the Stanford CS230: Deep Learning (Autumn 2025), 
Methodology. The study uses a set of 80 queries representing diverse tasks such as: Information extraction, Sentiment analysis, Topic identif

In [None]:
# def export_json_to_excel(json_file, excel_file):
#     if not os.path.exists(json_file):
#         print(f"Error: File not found {json_file}")
#         return

#     with open(json_file, 'r', encoding='utf-8') as f:
#         data = json.load(f)

#     df = pd.DataFrame(data)
#     with pd.ExcelWriter(excel_file, engine='xlsxwriter') as writer:
#         df.to_excel(writer, index=False, sheet_name='Compare Results')

#         workbook  = writer.book
#         worksheet = writer.sheets['Compare Results']

#         wrap_format = workbook.add_format({
#             'text_wrap': True,
#             'valign': 'top',
#             'border': 1
#         })

#         worksheet.set_column('A:A', 60, wrap_format) # Cột Câu hỏi
#         worksheet.set_column('B:C', 60, wrap_format) # Cột GPT & Grok Answer
#         worksheet.set_column('D:D', 20, wrap_format) # Cột Timestamp
#         worksheet.set_column('E:E', 60, wrap_format) # Cột Gemini Judgment
# if __name__ == "__main__":
#     export_json_to_excel("gemini_judge.json", "AI_Judge.xlsx")