In [None]:
%cd /content/drive/MyDrive/Colab Notebooks/thesis
%ls -l
# The listing above should include 'Create_dataset.ipynb' and 'raw_data.csv'.
# If it does not, change the path given to %cd so it points at the folder that contains them.

In [None]:
%pip install -qU unstructured langchain-text-splitters langchain-community langgraph langchain-mistralai nltk

In [3]:
import os
from google.colab import userdata
# Copy the Mistral API key into the process environment. ChatMistralAI is
# constructed later without an explicit key, so it presumably reads
# MISTRAL_API_KEY from the environment (confirm against the client docs).
#
# On Google Colab: add a secret named 'MISTRAL_API_KEY' to the notebook's
# secrets and give this notebook access to it; userdata.get() reads it.
# Outside Colab: the google.colab import is unavailable, so remove it and
# provide the key another way (for example, export MISTRAL_API_KEY before
# starting the kernel). Avoid pasting the key into the notebook itself.
os.environ["MISTRAL_API_KEY"] = userdata.get("MISTRAL_API_KEY")

# Creation of the LLM Agent
Using [MistralAI](https://mistral.ai/) and [LangChain](https://www.langchain.com/)

In [None]:
import time
import httpx
from langchain_mistralai import ChatMistralAI
from langchain.schema import (
    BaseMessage,
    SystemMessage,
    HumanMessage,
    AIMessage
)
from tenacity import retry, wait_exponential, stop_after_attempt

class Agent:
  """Stateful chat wrapper around a Mistral chat model.

  The conversation is kept in ``self.logs`` with the system message first,
  and the whole list is sent to the model on every call to ``ask``.
  """

  def __init__(self, sys_msg: str):
    """Create the chat model and start the history with the system prompt.

    Args:
      sys_msg: Text of the system message that opens every conversation.
    """
    self.chat_model = ChatMistralAI(model="mistral-large-latest")
    self.sys_msg = SystemMessage(content=sys_msg)
    self.logs = [self.sys_msg]

  def ask(self, question):
    """Send ``question`` together with the history and return the reply content.

    The model call is retried with exponential back-off (up to 6 attempts).
    On success both the question and the reply are appended to the history.

    Args:
      question: Text of the human message to send.

    Returns:
      The ``content`` of the model's reply.

    Raises:
      Exception: whatever the retry wrapper raises once all attempts have
        failed. The history is left exactly as it was before the call.
    """
    msg = HumanMessage(content=question)
    self.logs.append(msg)

    @retry(wait=wait_exponential(min=1, max=60), stop=stop_after_attempt(6))
    def call_llm():
      return self.chat_model.invoke(self.logs)

    try:
      ai_msg = call_llm()
    except Exception:
      # Remove the unanswered question: otherwise the next ask() would send
      # two consecutive human messages and the failed prompt would be
      # replayed on every later call.
      self.logs.pop()
      raise

    self.logs.append(AIMessage(content=ai_msg.content))
    return ai_msg.content

  def reset(self):
    """Forget the whole conversation but keep the system prompt."""
    # An empty list would silently drop the instructions given at construction.
    self.logs = [self.sys_msg]

  def soft_reset(self):
    """Trim a long history down to the system prompt plus the 4 newest messages.

    Does nothing while the history holds 10 messages or fewer.
    """
    if len(self.logs) > 10:
      # Skip the leading system message, then keep the newest four entries
      # (two complete question/answer exchanges when the history alternates).
      self.logs = [self.sys_msg] + self.logs[1:][-4:]

# Smoke test: build an agent and send one question to confirm that the API
# key and the model are reachable before starting the long-running loops.
a = Agent(sys_msg="You are a helpful assistant.")
print(a.ask(question="Who are you?"))

# Filter comments

In [None]:
import json

# Load the resume checkpoint. A first run has no state.json yet, so start
# from an empty state instead of failing; the filtering loop creates the file.
try:
    with open("state.json", "r") as f:
        state_data = json.load(f)
except FileNotFoundError:
    state_data = {}
# A missing or null entry means no row has been filtered yet.
last_row_filtered_index = state_data.get("last_row_filtered_index") or 0

In [None]:
import pandas as pd
from tqdm.auto import tqdm
from datetime import datetime


raw_data_path = os.path.join(".", "raw_data.csv")
raw_data = pd.read_csv(raw_data_path)

# Upper bound on how many times one row is re-asked when the model's reply
# cannot be parsed. Without a bound a model that never answers in the
# expected format would keep the loop (and the API bill) running forever.
MAX_FILTER_ATTEMPTS = 3

agent = Agent(
  """
  Your job is to evaluate if a comment made about a chess moves indicate that the move is a mistake AND explains why it's a mistake.
  When Writing your answer, it is VERY IMPORTANT that you write RES: followed by 1 if you think the comment indicate that the move is a mistake AND explains why it's a mistake or 0 if you think it doesn't.
  After that, write a single sentence explaining your reasoning.
  Don't write anything else after.
  """
)

# Every run writes to a fresh timestamped CSV: the header goes out first and
# each accepted row is appended as soon as it is classified, so an
# interrupted run keeps everything processed so far.
filename = f"filtered_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df_filtered = pd.DataFrame(columns=list(raw_data.columns) + ["good", "reasoning"])
df_filtered.to_csv(filename, index=False)

# A missing/null checkpoint means "start from the first row".
starting_index = last_row_filtered_index or 0
# NOTE(review): the agent keeps its conversation across rows, so every request
# resends the earlier prompts and replies; history is only trimmed via
# soft_reset() after a failure. Confirm this is intended.
for row_index, row in tqdm(raw_data.iloc[starting_index:].iterrows(), total=len(raw_data) - starting_index):
  comment = row['comment']
  print()
  print(comment)

  prompt = f"""
  Context: {row['context']}
  Here's the comment made after the last move:
  {row['comment']}

  The engine evaluate the position as followed: {row['engine_eval']}
  And it think that the best continuation is {row['engine_best_line']}.
  And instead of the move played, the engine think that the player should have played {row['engine_best_alternative']} (If the move played is the same as this one, then the comment is wrong and therefore doens't indicate that the move is a mistake).

  Do you think that the comment made about the last move played indicate that the move is a mistake AND explains why it's a mistake ?
  Write RES: followed by 1 if you think it does and 0 if you think it doesn't.
  When writing the number after RES, don't put any other characters.
  After that, write a single sentence explaining your reasoning.
  Don't write anything else after.
  """

  for _attempt in range(MAX_FILTER_ATTEMPTS):
    try:
      output = agent.ask(prompt)
      print(output)
    except Exception as e:
      # Best effort: the request already failed after its own retries, so
      # trim the history, report the error and move on to the next row.
      agent.soft_reset()
      print(e)
      break
    if "RES:" not in output:
      continue
    after_res = output.split("RES:")[1].strip()
    # Slice instead of indexing: the reply may end right after "RES:",
    # leaving an empty string that after_res[0] would fail on.
    res = after_res[:1]
    if res in ("0", "1"):
      row_dict = row.to_dict()
      row_dict["good"] = res == "1"
      row_dict["reasoning"] = after_res[1:]
      pd.DataFrame([row_dict]).to_csv(filename, mode="a", header=False, index=False)
      break
  else:
    # Reached only when every attempt produced an unparseable reply.
    print(f"No valid RES: answer after {MAX_FILTER_ATTEMPTS} attempts; skipping row {row_index}")

  # Checkpoint after every row (written or skipped) so a restart resumes
  # at the next row.
  state_data["last_row_filtered_index"] = row_index + 1
  with open("state.json", "w") as f:
      json.dump(state_data, f, indent=4)

filtered_data_path = os.path.join(".", filename)

In [None]:
# Rewind the filtering checkpoint so the next run starts again at the first row.
state_data.update({"last_row_filtered_index": 0})
with open("state.json", mode="w") as state_file:
    state_file.write(json.dumps(state_data, indent=4))

# Reformulate good comments

In [None]:
import json
# Load the resume checkpoint. If state.json does not exist yet, start from
# an empty state instead of failing; the reformulation loop writes the file.
try:
    with open("state.json", "r") as f:
        state_data = json.load(f)
except FileNotFoundError:
    state_data = {}
# A missing or null entry means no row has been reformulated yet.
last_row_reformulated_index = state_data.get("last_row_reformulated_index") or 0
print(last_row_reformulated_index)

2312


In [None]:
import pandas as pd
from tqdm.auto import tqdm
from datetime import datetime


# NOTE(review): this file name does not follow the "filtered_data_<timestamp>.csv"
# pattern that the filtering step writes; confirm it points at the intended file.
filtered_data = pd.read_csv("filtered_20250829_163943.csv")
# Boolean filtering keeps the original row labels, so the index of
# filtered_good_data has gaps wherever a row had good == False.
filtered_good_data = filtered_data[filtered_data["good"]]

agent = Agent(
    """
    Your job is to reformulate a comment made about a chess move.
    The comment should be explaining why the move made is a mistake.

    Reformulate that comment to only keep the part that explains the mistake.
    You may use some of the engine's information when reformulating but try to keep it as close to the original comment as possible.
    While reformulating do the following too:
    - The reformulated comment should only contain an explanation of the mistake.
    - If the comment doens't suggest alternative lines, use the one provided by the engine.
    - When using a pronoun to refer to a player, only use they/them/their.
    - NEVER mention a player's name. Use either 'black' or 'white' according to the player's color.
    - If the original comment is talking about something or someone unrelated to the game, do not mention it.
    - If the comment isn't in english, translate the reformulation to english.
    """
)

# Every run writes to a fresh timestamped CSV: header first, then one row
# appended per reformulated comment.
filename = f"reformulated_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
df_reformulated = pd.DataFrame(columns=list(filtered_good_data.columns) + ["reformulated"])
df_reformulated.to_csv(filename, index=False)

# A missing/null checkpoint means "start from the first row".
starting_index = last_row_reformulated_index or 0
# The checkpoint stores an index *label* (row_index + 1). Because the labels
# have gaps, resuming by position (.iloc) would skip rows that were never
# processed; select by label so the saved value and the resume point agree.
remaining_rows = filtered_good_data[filtered_good_data.index >= starting_index]
# NOTE(review): the agent keeps its conversation across rows, so every request
# resends the earlier prompts and replies; history is only trimmed via
# soft_reset() after a failure. Confirm this is intended.
for row_index, row in tqdm(remaining_rows.iterrows(), total=len(remaining_rows)):
  comment = row['comment']
  print("-"*40)
  print(comment)

  prompt = f"""
  Context: {row['context']}
  Engine evaluation: {row['engine_eval']}
  Engine best line: {row['engine_best_line']}
  Engine best alternative line: {row['engine_best_alternative']}
  Here's the reason why this comment was picked: {row['reasoning']}

  Here's the comment to reformulate:
  {row['comment']}

  Keep in mind while reformulating that:
  - The reformulated comment should only contain an explanation of the mistake.
  - If the comment doens't suggest alternative lines, use the one provided by the engine.
  - When using a pronoun to refer to a player, only use they/them/their.
  - NEVER mention a player's name. Use either 'black' or 'white' according to the player's color.
  - If the original comment is talking about something or someone unrelated to the game, do not mention it.
  - If the comment isn't in english, translate the reformulation to english.

  Only answer with the reformulated comment and nothing else.
  """

  try:
    output = agent.ask(prompt)
    print(output)
  except Exception as e:
    # Best effort: the request already failed after its own retries, so
    # trim the history, report the error and move on to the next row.
    agent.soft_reset()
    print(e)
  else:
    row_dict = row.to_dict()
    row_dict["reformulated"] = output

    pd.DataFrame([row_dict]).to_csv(filename, mode="a", header=False, index=False)

  # Checkpoint after every row (written or skipped) so a restart resumes
  # with the next label.
  state_data["last_row_reformulated_index"] = row_index + 1
  with open("state.json", "w") as f:
      json.dump(state_data, f, indent=4)


reformulated_data_path = os.path.join(".", filename)

In [None]:
# Rewind the reformulation checkpoint so the next run starts again at the first row.
state_data.update({"last_row_reformulated_index": 0})
with open("state.json", mode="w") as state_file:
    state_file.write(json.dumps(state_data, indent=4))