<a href="https://colab.research.google.com/github/mlabonne/chess-llm/blob/main/Chess_LLM_Trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

from google.colab import userdata

# Configure Weights & Biases through environment variables: the API key is
# read from the Colab userdata entry named "wandb", and runs are logged to the
# "chess" project.
os.environ.update(
    {
        "WANDB_API_KEY": userdata.get("wandb"),
        "WANDB_PROJECT": "chess",
    }
)

In [1]:
from datasets import load_dataset
from transformers import AutoConfig, AutoTokenizer

from chessllm.train import TokenizeMap, combine_columns_map

# Short aliases for the data files that can be requested from the dataset repo.
data_files = dict(
    small="smaller_pgn_file.csv",
    lc_100mb="lichess_100mb.zip",
    stkfsh="stockfish_dataset.zip",
)

# Fetch only the small CSV and show its first raw record.
dataset = load_dataset(
    "adamkarvonen/chess_games",
    data_files=data_files["small"],
)
print(dataset["train"][0])

# Run combine_columns_map over every record, discarding the original columns.
# NOTE(review): presumably this merges the metadata and transcript columns into
# a single field -- confirm in chessllm.train. `dataset_1` is not referenced by
# any later cell visible here; verify that is intended.
dataset_1 = dataset.map(
    combine_columns_map,
    remove_columns=dataset["train"].column_names,
)

# Print the first raw record again, followed by the first transformed record.
print(dataset["train"][0], dataset_1["train"][0], sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


{'WhiteElo': 1601, 'BlackElo': 1793, 'Result': '0-1', 'transcript': '1.e4 e5 2.Nf3 Nc6 3.d4 exd4 4.Nxd4 Nxd4 5.Qxd4 d6 6.Nc3 h6 7.e5 Be7 8.exd6 Bxd6 9.Qxg7 Qe7+ 10.Be2 Be5 11.Nd5 Bxg7 12.Nxe7 Nxe7 13.O-O Be6 14.Re1 O-O-O 15.c4 Kb8 16.c5 Bd4 17.Be3 Bxb2 18.Rab1 Bf6 19.c6 Nxc6 20.Bf3 Bd5 21.Bxd5 Rxd5 22.a4 Rhd8 23.Bxh6 Bc3 24.Rec1 Bd2 25.Bxd2 Rxd2 26.h3 Ne5 27.Kh2 Rxf2 28.Rd1 Rxd1 29.Rxd1 b6 30.Rd8+ Kb7 31.Re8 f6 32.Kg3 Ra2 33.Rf8 Nd7 34.Rf7 Kc6 35.h4 Rxa4 36.h5 Ne5 37.h6 Nxf7 38.h7 Rb4 39.Kh3 a5 40.g4 a4 41.Kh4 a3 42.Kh5 a2 43.Kg6 Nh8+ 44.Kg7 a1=Q 45.Kxh8 f5+ 46.Kg8 Rxg4+ 47.Kf7 Qg7+ 48.Ke6 f4 49.Kf5 f3 50.Ke6 f2 51.Kf5 f1=Q+ 52.Ke6 Qff6# 0-1'}
{'WhiteElo': 1601, 'BlackElo': 1793, 'Result': '0-1', 'transcript': '1.e4 e5 2.Nf3 Nc6 3.d4 exd4 4.Nxd4 Nxd4 5.Qxd4 d6 6.Nc3 h6 7.e5 Be7 8.exd6 Bxd6 9.Qxg7 Qe7+ 10.Be2 Be5 11.Nd5 Bxg7 12.Nxe7 Nxe7 13.O-O Be6 14.Re1 O-O-O 15.c4 Kb8 16.c5 Bd4 17.Be3 Bxb2 18.Rab1 Bf6 19.c6 Nxc6 20.Bf3 Bd5 21.Bxd5 Rxd5 22.a4 Rhd8 23.Bxh6 Bc3 24.Rec1 Bd2 25.Bxd2 Rxd2 

In [None]:
# Checkpoint whose tokenizer (and, later, weights) are used.
# Alternative checkpoint: "openai-community/gpt2-medium"
model_id = "EleutherAI/pythia-70m-deduped"

# Length passed to TokenizeMap together with the tokenizer.
context_length = 128

# Load the tokenizer and reuse its EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

tmap = TokenizeMap(tokenizer, context_length)

# Tokenize in batches and drop the original columns from the result.
# NOTE(review): this maps over `dataset`, not the `dataset_1` produced by
# combine_columns_map -- confirm which input TokenizeMap.tokenize expects.
raw_column_names = dataset["train"].column_names
tokenized_dataset = dataset.map(
    tmap.tokenize,
    batched=True,
    remove_columns=raw_column_names,
)

In [None]:
from transformers import AutoModelForCausalLM, DataCollatorForLanguageModeling

# Load model
# Seed for the train/eval split so the held-out set is the same on every run.
SPLIT_SEED = 42

# Build the config from the checkpoint, overriding the BOS/EOS ids with the
# tokenizer's so the model and tokenizer use the same special-token ids.
# NOTE(review): `n_ctx` is a GPT-2-style option name; confirm it has any effect
# on the config class that `model_id` resolves to.
config = AutoConfig.from_pretrained(
    model_id,
    # vocab_size=len(tokenizer),
    n_ctx=1024,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# Load the pretrained weights for `model_id` with the adjusted config.
model = AutoModelForCausalLM.from_pretrained(model_id, config=config)

# Hold out 1% of the tokenized training data for evaluation.
# The guard keeps this cell idempotent: `tokenized_dataset` is rebound to the
# split result, so re-running the cell without the guard would split the
# already-reduced training set again and discard the previous test split.
if "test" not in tokenized_dataset:
    tokenized_dataset = tokenized_dataset["train"].train_test_split(
        test_size=0.01,
        seed=SPLIT_SEED,
    )

# Batch collator for language modeling with masked-LM disabled (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
from transformers import Trainer, TrainingArguments

# Every training hyperparameter in one mapping so it can be scanned (and
# tweaked) in a single place before being handed to TrainingArguments.
training_options = {
    # Output and reporting
    "output_dir": "results",
    "report_to": "wandb",
    "logging_steps": 1,
    # Schedule and batch sizes
    "num_train_epochs": 5,
    "per_device_train_batch_size": 100,
    "per_device_eval_batch_size": 100,
    "gradient_accumulation_steps": 1,
    # Optimisation (mixed precision is left off: fp16 is not enabled)
    "learning_rate": 5e-5,
    "weight_decay": 0.1,
    "warmup_steps": 0,
    "lr_scheduler_type": "cosine",
    # Evaluation cadence.
    # NOTE(review): a fractional eval_steps is presumably read as a ratio of
    # total training steps -- confirm against the installed transformers version.
    "evaluation_strategy": "steps",
    "eval_steps": 0.01,
    # Keep all dataset columns when batching -- fix for Pythia.
    "remove_unused_columns": False,
    # Hub settings: no automatic pushing; the repo is flagged private.
    "push_to_hub": False,
    "hub_private_repo": True,
}
args = TrainingArguments(**training_options)

# Wire the model, tokenizer, collator and the train/test splits together.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Start training; the return value is left as the cell's displayed output.
trainer.train()

In [None]:
# Explicit, manual push of the trainer's results to the Hub. Automatic pushing
# is switched off in the TrainingArguments above (push_to_hub=False), so this
# call is the only upload step in the notebook.
trainer.push_to_hub()