In [None]:
import os
import sys
from os.path import *
import importlib

import yaml
import jsonlines
import pickle
import wandb
import logging
import tqdm

import chess
import re

%matplotlib inline    
import matplotlib.pyplot as plt
from IPython.display import display


from sklearn import svm
import numpy as np
import tensorflow as tf 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

sys.path.append(f"{os.getcwd()}/lczeroTraining/tf/")

from stockfish import Stockfish
from tfprocess import TFProcess
from lcztools import LeelaBoard as leelaBoard

probing_svm = __import__("01_probing_svm", fromlist="*")

%matplotlib inline    
import matplotlib.pyplot as plt
from IPython.display import display


In [None]:
import ast

# Parse the pipeline log. Every non-empty line is a Python literal that unpacks
# into six fields: position, full move text, vanilla response, engine response
# and two concept responses.
log_file = "results/pipeline_v9.log"
target_fens = []
target_moves = []
base_responses = []
engine_responses = []
concept_responses = []

with open(log_file, "r") as f:
    for line in f:
        record = line.strip()
        if not record:
            # A blank line (e.g. a trailing newline at the end of the log) is not a
            # record; ast.literal_eval("") would raise SyntaxError and abort the load.
            continue

        data_tuple = ast.literal_eval(record)
        target_position, target_move_full, vanilla_response, engine_response, response1, response2 = data_tuple

        target_fens.append(target_position)
        target_moves.append(target_move_full)
        # The leading None keeps the response text at index 1 (and 2), which is
        # where the cells below read it from.
        base_responses.append((None, vanilla_response.replace("vanilla ", "")))
        engine_responses.append((None, engine_response.replace("engine ", "")))
        concept_responses.append((None, response1, response2))

# Second whitespace-separated token of each full move string.
target_moves_strip = [move.split()[1] for move in target_moves]

In [None]:
idx = 0

# Replace the "- -" pair in the FEN with "KQkq -" before plotting
# (presumably the empty castling / en-passant fields — confirm).
# When the FEN contains no "- -" pair there is nothing to patch, so the FEN is
# used unchanged instead of failing with IndexError on the missing second part.
fen_parts = target_fens[idx].split("- -")
if len(fen_parts) > 1:
    target_position = fen_parts[0] + "KQkq -" + fen_parts[1]
else:
    target_position = target_fens[idx]
svg = plot_board(target_position, target_moves_strip[idx], is_san=True)

# Reference comment first, then the vanilla, engine and concept responses.
print(ref_list[idx].replace("\n", " "))
print(base_responses[idx][1].replace("comment: ", "").replace("\n", " "))
print(engine_responses[idx][1].replace("comment: ", "").replace("\n", " "))
print(concept_responses[idx][1])
print(concept_responses[idx][2])

In [None]:
from getpass import getpass

from openai import OpenAI

# Never store the API key in the notebook. Use the key already present in the
# environment, and only prompt for it (without echoing) when it is missing.
# OpenAI() reads OPENAI_API_KEY from the environment.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
client = OpenAI()

In [None]:
# Unconditional stop: keeps "Run All" from continuing into the scoring cells
# below, which send requests to the OpenAI API. The message makes it clear that
# this is not a genuine failure.
raise TypeError("Stopped on purpose: the cells below call the OpenAI API; run them manually.")

In [None]:
# Fixed subset of 50 sample indices (positions into target_fens / target_moves).
# The completeness and clarity scoring loops iterate over this subset.
idx_list = [
    6, 11, 16, 22, 51, 54, 59, 60, 66, 67,
    74, 77, 78, 80, 83, 84, 95, 97, 100, 123,
    127, 130, 131, 136, 137, 138, 139, 165, 169, 170,
    173, 175, 176, 177, 181, 186, 188, 189, 200, 202,
    206, 210, 212, 219, 225, 226, 227, 231, 232, 238,
]

In [None]:
# relevance
#
# Ask gpt-4o to rate how relevant each comment source is to the move, with the
# reference comment and the engine evaluation supplied as hints. The score is
# the probability-weighted mean of the tokens '1'..'5' found among the top
# logprobs of the first generated token.

RELEVANCE_SYSTEM_PROMPT = """You will be given two comments about a chess move.

        Your task is to rate the comment on one metric.

        Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

        Evaluation Criteria:

        Relevance (1-5) - Relevence of the target comment to important aspects of the chess move. The comment should include only information relevant to the chess move or reasoning for taking or not taking the chess move. An engine evaluation result and a reference comment is given as a hint. 

        Evaluation Steps:

        1. Read the commment and the reference comment carefully.
        2. Read the chess position and move carefully, and find out important aspects based on the reference.
        2. Assess if every expressen of the comment is relevant to the important information about the chess move.
        3. Assign a Relevance score from 1 to 5.

        """
# Alternative criteria wording (not in use):
# Relevance (1-5) - Relevence of a target comment to a reference comment. The comment should include only information relevant to the chess move or reasoning for taking or not taking the chess move. An engine evaluation result is given as a hint.

relevance_scores = {
    "ref": [],
    # "gac": [],
    "gpt": [],
    "gpt_engine": [],
    "gpt_concept": [],
    "gpt_concept_wrong": [],
}

# The per-source comment lists do not depend on idx, so they are built once
# instead of being rebuilt on every iteration.
relevance_targets = [
    ("ref", ref_list),
    # ("gac", gac_list),
    ("gpt", [r[1].replace("comment: ", "") for r in base_responses]),
    ("gpt_engine", [r[1].replace("comment: ", "") for r in engine_responses]),
    ("gpt_concept", [r[2].split("comment: ")[1] for r in concept_responses]),
]

for idx in range(len(target_fens)):
    text_ref = ref_list[idx].replace("\n", " ")

    target_position = target_fens[idx]
    target_move = target_moves_strip[idx]
    target_move_full = target_moves[idx]
    if "O-O" in target_move:
        # Castling move: swap the "- -" pair in the FEN for "KQkq -". The patch is
        # skipped when the pair is absent rather than failing with IndexError.
        fen_parts = target_position.split("- -")
        if len(fen_parts) > 1:
            target_position = fen_parts[0] + "KQkq -" + fen_parts[1]

    # NOTE(review): evaluated once per position instead of once per source, and the
    # get_all_attacks(...) call whose result was never used has been dropped. This
    # assumes both helpers depend only on their arguments and have no required
    # side effects — confirm.
    engine_eval = get_engine_evalutaion(target_position, target_move).replace("evaluation: ", "")

    for target, target_list in relevance_targets:
        text_to_eval = target_list[idx].replace("\n", " ")
        print(text_ref)
        print(text_to_eval)

        prompt = [
            {"role": "system", "content": RELEVANCE_SYSTEM_PROMPT},
            {"role": "user", "content": (
                f"position:\n{target_position}\n\n"
                f"move:\n{target_move_full}\n\n"
                f"target comment:\n\n{text_to_eval}\n\n"
                f"reference comment:\n\n{text_ref}\n\n"
                f"engine evaluation:\n\n{engine_eval}\n\n"
                "Score(1-5, score ONLY): "
            )},
        ]
        print(prompt)
        response = client.chat.completions.create(model="gpt-4o", messages=prompt, logprobs=True, top_logprobs=10)

        # Use the entries actually returned for the first token; indexing a fixed
        # range(10) fails with IndexError when fewer alternatives come back.
        top_entries = response.choices[0].logprobs.content[0].top_logprobs
        score_probs = [
            (int(entry.token), np.exp(entry.logprob))
            for entry in top_entries
            if entry.token in ('1', '2', '3', '4', '5')
        ]
        if not score_probs:
            # The recorded value stays 0 as before, but it is no longer silent: a 0
            # lies outside the 1-5 scale and lowers the mean.
            print(f"WARNING: no 1-5 token among the top logprobs (idx={idx}, target={target}); recording 0")
        norm_factor = sum(prob for _, prob in score_probs)
        score = sum(value * prob / norm_factor for value, prob in score_probs)

        print(score)
        relevance_scores[target].append(score)

print(relevance_scores)
print("")

In [None]:
# completeness
#
# Ask gpt-4o to rate how completely each comment source covers the position,
# with the engine evaluation supplied as a hint. The score is the
# probability-weighted mean of the tokens '1'..'5' found among the top logprobs
# of the first generated token.

COMPLETENESS_SYSTEM_PROMPT = """You will be given single comment about a chess move.

        Your task is to rate the comment on one metric.

        Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

        Evaluation Criteria:

        Completeness (1-5) - Completeness of a comment. The comment should cover all critical points on the chess board, ensuring that no important factors are overlooked. An engine evaluation result is given as a hint.

        Evaluation Steps:

        1. Read the two commments carefully.
        2. Assess how well the comment addresses the important information, and how well the comment covers the entire important information without missing any.
        3. Assign a Completeness score from 1 to 5.

        """

completeness_scores = {
    "ref": [],
    # "gac": [],
    "gpt": [],
    "gpt_engine": [],
    "gpt_concept": [],
    "gpt_concept_wrong": [],
}

# The per-source comment lists do not depend on idx, so they are built once
# instead of being rebuilt on every iteration.
completeness_targets = [
    ("ref", ref_list),
    # ("gac", gac_list),
    ("gpt", [r[1].replace("comment: ", "") for r in base_responses]),
    ("gpt_engine", [r[1].replace("comment: ", "") for r in engine_responses]),
    ("gpt_concept", [r[2].split("comment: ")[1] for r in concept_responses]),
]

# Scored on the idx_list subset; use range(len(target_fens)) for the full set.
for idx in idx_list:
    target_position = target_fens[idx]
    target_move = target_moves_strip[idx]
    target_move_full = target_moves[idx]
    if "O-O" in target_move:
        # Castling move: swap the "- -" pair in the FEN for "KQkq -". The patch is
        # skipped when the pair is absent rather than failing with IndexError.
        fen_parts = target_position.split("- -")
        if len(fen_parts) > 1:
            target_position = fen_parts[0] + "KQkq -" + fen_parts[1]

    # NOTE(review): evaluated once per position instead of once per source, and the
    # get_all_attacks(...) call whose result was never used has been dropped. This
    # assumes both helpers depend only on their arguments and have no required
    # side effects — confirm.
    engine_eval = get_engine_evalutaion(target_position, target_move)

    for target, target_list in completeness_targets:
        text_to_eval = target_list[idx].replace("\n", " ")
        print(text_to_eval)

        prompt = [
            {"role": "system", "content": COMPLETENESS_SYSTEM_PROMPT},
            {"role": "user", "content": (
                f"position:\n{target_position}\n\n"
                f"move:\n{target_move_full}\n\n"
                f"comment:\n\n{text_to_eval}\n\n"
                f"engine evaluation:\n\n{engine_eval}\n\n"
                "Score(1-5, score ONLY): "
            )},
        ]
        print(prompt)
        response = client.chat.completions.create(model="gpt-4o", messages=prompt, logprobs=True, top_logprobs=10)

        # Use the entries actually returned for the first token; indexing a fixed
        # range(10) fails with IndexError when fewer alternatives come back.
        top_entries = response.choices[0].logprobs.content[0].top_logprobs
        score_probs = [
            (int(entry.token), np.exp(entry.logprob))
            for entry in top_entries
            if entry.token in ('1', '2', '3', '4', '5')
        ]
        if not score_probs:
            # The recorded value stays 0 as before, but it is no longer silent: a 0
            # lies outside the 1-5 scale and lowers the mean.
            print(f"WARNING: no 1-5 token among the top logprobs (idx={idx}, target={target}); recording 0")
        norm_factor = sum(prob for _, prob in score_probs)
        score = sum(value * prob / norm_factor for value, prob in score_probs)

        print(score)
        completeness_scores[target].append(score)

print(completeness_scores)
print("")

In [None]:
# clarity
#
# Ask gpt-4o to rate how clear each comment source is for the given position
# and move. The score is the probability-weighted mean of the tokens '1'..'5'
# found among the top logprobs of the first generated token.

CLARITY_SYSTEM_PROMPT = """You will be given single comment about a chess move.

        Your task is to rate the comment on one metric.

        Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

        Evaluation Criteria:

        Clarity (1-5) - Clarity of a comment. The comment should be clear and detailed, without vague or ambiguous statements.

        Evaluation Steps:

        1. Read the commment carefully.
        2. Assess how the comment is clear and detailed, without vague or ambiguous statements.
        3. Assign a Clarity score from 1 to 5.

        """

clarity_scores = {
    "ref": [],
    # "gac": [],
    "gpt": [],
    "gpt_engine": [],
    "gpt_concept": [],
}

# The per-source comment lists do not depend on idx, so they are built once
# instead of being rebuilt on every iteration.
clarity_targets = [
    ("ref", ref_list),
    # ("gac", gac_list),
    ("gpt", [r[1].replace("comment: ", "") for r in base_responses]),
    ("gpt_engine", [r[1].replace("comment: ", "") for r in engine_responses]),
    ("gpt_concept", [r[2].split("comment: ")[1] for r in concept_responses]),
]

# Scored on the idx_list subset; use range(len(target_fens)) for the full set.
for idx in idx_list:
    target_position = target_fens[idx]
    target_move = target_moves_strip[idx]
    target_move_full = target_moves[idx]
    if "O-O" in target_move:
        # Castling move: swap the "- -" pair in the FEN for "KQkq -". The patch is
        # skipped when the pair is absent rather than failing with IndexError.
        fen_parts = target_position.split("- -")
        if len(fen_parts) > 1:
            target_position = fen_parts[0] + "KQkq -" + fen_parts[1]

    # NOTE(review): the get_all_attacks(...) and get_engine_evalutaion(...) calls
    # were dropped because neither result was used in this prompt. This assumes
    # the helpers have no required side effects — confirm.

    for target, target_list in clarity_targets:
        text_to_eval = target_list[idx].replace("\n", " ")
        print(text_to_eval)

        prompt = [
            {"role": "system", "content": CLARITY_SYSTEM_PROMPT},
            {"role": "user", "content": (
                f"position:\n{target_position}\n\n"
                f"move:\n{target_move_full}\n\n"
                f"comment:\n\n{text_to_eval}\n\n"
                "Score(1-5, score ONLY): "
            )},
        ]
        print(prompt)
        response = client.chat.completions.create(model="gpt-4o", messages=prompt, logprobs=True, top_logprobs=10)

        # Use the entries actually returned for the first token; indexing a fixed
        # range(10) fails with IndexError when fewer alternatives come back.
        top_entries = response.choices[0].logprobs.content[0].top_logprobs
        score_probs = [
            (int(entry.token), np.exp(entry.logprob))
            for entry in top_entries
            if entry.token in ('1', '2', '3', '4', '5')
        ]
        if not score_probs:
            # The recorded value stays 0 as before, but it is no longer silent: a 0
            # lies outside the 1-5 scale and lowers the mean.
            print(f"WARNING: no 1-5 token among the top logprobs (idx={idx}, target={target}); recording 0")
        norm_factor = sum(prob for _, prob in score_probs)
        score = sum(value * prob / norm_factor for value, prob in score_probs)

        print(score)
        clarity_scores[target].append(score)

print(clarity_scores)
print("")

In [None]:
# fluency
#
# Ask gpt-4o to rate the fluency of each comment source. Only the comment text
# is sent. The score is the probability-weighted mean of the tokens '1'..'5'
# found among the top logprobs of the first generated token.

FLUENCY_SYSTEM_PROMPT = """You will be given one comment written for a chess move.

        Your task is to rate the comment on one metric.

        Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.


        Evaluation Criteria:

        Fluency (1-5): Fluency of a comment.

        1. Read the commment carefully.
        2. Assess the sentences of comment is coherently organized. The comment should contain well-structured language and coherent transitions.
        3. Assign a Fluency score from 1 (not readable) to 5 (very fluent).
        
        """

fluency_scores = {
    "ref": [],
    # "gac": [],
    "gpt": [],
    "gpt_engine": [],
    "gpt_concept": [],
}

# The per-source comment lists do not depend on idx, so they are built once
# instead of being rebuilt on every iteration.
fluency_targets = [
    ("ref", ref_list),
    # ("gac", gac_list),
    ("gpt", [r[1].replace("comment: ", "") for r in base_responses]),
    ("gpt_engine", [r[1].replace("comment: ", "") for r in engine_responses]),
    ("gpt_concept", [r[2].split("comment: ")[1] for r in concept_responses]),
]

# NOTE(review): the FEN castling patch and the get_all_attacks(...) /
# get_engine_evalutaion(...) calls were dropped. None of their results reached
# this prompt, yet they could still abort the loop. This assumes the helpers
# have no required side effects — confirm.

# Scored on every sample; use idx_list instead to score only the subset.
for idx in range(len(target_fens)):
    for target, target_list in fluency_targets:
        text_to_eval = target_list[idx].replace("\n", " ")
        print(text_to_eval)

        prompt = [
            {"role": "system", "content": FLUENCY_SYSTEM_PROMPT},
            {"role": "user", "content": (
                f"target comment:\n\n{text_to_eval}\n\n"
                "Score(1-5, score ONLY): "
            )},
        ]
        print(prompt)
        response = client.chat.completions.create(model="gpt-4o", messages=prompt, logprobs=True, top_logprobs=10)

        # Use the entries actually returned for the first token; indexing a fixed
        # range(10) fails with IndexError when fewer alternatives come back.
        top_entries = response.choices[0].logprobs.content[0].top_logprobs
        score_probs = [
            (int(entry.token), np.exp(entry.logprob))
            for entry in top_entries
            if entry.token in ('1', '2', '3', '4', '5')
        ]
        if not score_probs:
            # The recorded value stays 0 as before, but it is no longer silent: a 0
            # lies outside the 1-5 scale and lowers the mean.
            print(f"WARNING: no 1-5 token among the top logprobs (idx={idx}, target={target}); recording 0")
        norm_factor = sum(prob for _, prob in score_probs)
        score = sum(value * prob / norm_factor for value, prob in score_probs)

        print(score)
        fluency_scores[target].append(score)

print(fluency_scores)
print("")

In [None]:
def _mean_per_source(score_table):
    """Return {source: mean score} for every source with at least one score."""
    return {
        source: sum(values) / len(values)
        for source, values in score_table.items()
        if len(values) != 0
    }


# One line per metric, in the order the metrics were computed above.
print(_mean_per_source(relevance_scores))
print(_mean_per_source(completeness_scores))
print(_mean_per_source(clarity_scores))
print(_mean_per_source(fluency_scores))
