<a href="https://colab.research.google.com/github/mlabonne/chess-llm/blob/main/Chess_LLM_Trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -qqq transformers datasets accelerate wandb --progress-bar off

import os
from google.colab import userdata

# Weights & Biases settings, exported as environment variables:
# the API key comes from the Colab secret named 'wandb', the project is "chess".
os.environ.update(
    {
        "WANDB_API_KEY": userdata.get('wandb'),
        "WANDB_PROJECT": "chess",
    }
)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Load the small CSV file from the "adamkarvonen/chess_games" dataset repository.
# Other files that can be passed as `data_files` instead:
#   "lichess_100mb.zip"
#   "stockfish_dataset.zip"
dataset = load_dataset(
    "adamkarvonen/chess_games",
    data_files="smaller_pgn_file.csv",
)

# Show one raw row to inspect the available columns.
first_raw_row = dataset['train'][0]
print(first_raw_row)

def combine_columns(example):
    """Normalise one row so its transcript starts at the first "1." marker.

    Args:
        example: mapping with a string under the 'transcript' key.

    Returns:
        A dict with a single 'transcript' key: "1." followed by everything after
        the first "1." in the input. If the input contains no "1.", the whole
        input text is kept and "1." is prepended to it.
    """
    marker = "1."
    before, found, after = example['transcript'].partition(marker)
    # `partition` leaves the full text in `before` when the marker is absent.
    moves = after if found else before
    return {'transcript': f"{marker}{moves}"}

# Run `combine_columns` over every row. All original columns are removed, so only
# the 'transcript' value returned by the function remains.
source_columns = dataset['train'].column_names
dataset = dataset.map(combine_columns, remove_columns=source_columns)

# Show the same first row after the transformation.
print(dataset['train'][0])



{'WhiteElo': 1601, 'BlackElo': 1793, 'Result': '0-1', 'transcript': '1.e4 e5 2.Nf3 Nc6 3.d4 exd4 4.Nxd4 Nxd4 5.Qxd4 d6 6.Nc3 h6 7.e5 Be7 8.exd6 Bxd6 9.Qxg7 Qe7+ 10.Be2 Be5 11.Nd5 Bxg7 12.Nxe7 Nxe7 13.O-O Be6 14.Re1 O-O-O 15.c4 Kb8 16.c5 Bd4 17.Be3 Bxb2 18.Rab1 Bf6 19.c6 Nxc6 20.Bf3 Bd5 21.Bxd5 Rxd5 22.a4 Rhd8 23.Bxh6 Bc3 24.Rec1 Bd2 25.Bxd2 Rxd2 26.h3 Ne5 27.Kh2 Rxf2 28.Rd1 Rxd1 29.Rxd1 b6 30.Rd8+ Kb7 31.Re8 f6 32.Kg3 Ra2 33.Rf8 Nd7 34.Rf7 Kc6 35.h4 Rxa4 36.h5 Ne5 37.h6 Nxf7 38.h7 Rb4 39.Kh3 a5 40.g4 a4 41.Kh4 a3 42.Kh5 a2 43.Kg6 Nh8+ 44.Kg7 a1=Q 45.Kxh8 f5+ 46.Kg8 Rxg4+ 47.Kf7 Qg7+ 48.Ke6 f4 49.Kf5 f3 50.Ke6 f2 51.Kf5 f1=Q+ 52.Ke6 Qff6# 0-1'}
{'transcript': '1.e4 e5 2.Nf3 Nc6 3.d4 exd4 4.Nxd4 Nxd4 5.Qxd4 d6 6.Nc3 h6 7.e5 Be7 8.exd6 Bxd6 9.Qxg7 Qe7+ 10.Be2 Be5 11.Nd5 Bxg7 12.Nxe7 Nxe7 13.O-O Be6 14.Re1 O-O-O 15.c4 Kb8 16.c5 Bd4 17.Be3 Bxb2 18.Rab1 Bf6 19.c6 Nxc6 20.Bf3 Bd5 21.Bxd5 Rxd5 22.a4 Rhd8 23.Bxh6 Bc3 24.Rec1 Bd2 25.Bxd2 Rxd2 26.h3 Ne5 27.Kh2 Rxf2 28.Rd1 Rxd1 29.Rxd1 b6 30.Rd8+ 

In [None]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Checkpoint identifier used for the tokenizer here and for the model/config later on.
# The commented-out line is an alternative checkpoint.
# model_id = "openai-community/gpt2-medium"
model_id = "EleutherAI/pythia-70m-deduped"

# Load tokenizer
# `context_length` is the number of tokens per training example; `tokenize` below
# keeps only chunks of exactly this length.
context_length = 128
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(element):
    """Tokenize a batch of transcripts into fixed-size windows of token ids.

    Each transcript is truncated to `context_length` tokens and the overflow is
    returned as additional windows (`return_overflowing_tokens=True`). Only
    windows whose reported length equals `context_length` are kept; shorter
    windows are discarded.

    Args:
        element: batch mapping whose "transcript" entry holds the texts to encode.

    Returns:
        A dict with one key, "input_ids", holding the list of kept windows.
    """
    encoded = tokenizer(
        element["transcript"],
        max_length=context_length,
        truncation=True,
        return_overflowing_tokens=True,
        return_length=True,
        return_tensors="np",
    )
    full_windows = [
        window_ids
        for window_len, window_ids in zip(encoded["length"], encoded["input_ids"])
        if window_len == context_length
    ]
    return {"input_ids": full_windows}

# Tokenize in batches. The existing columns are removed so the result only holds
# the "input_ids" produced by `tokenize`.
raw_columns = dataset['train'].column_names
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=raw_columns)

In [None]:
from transformers import DataCollatorForLanguageModeling
from transformers import AutoModelForCausalLM

# Load model
# Start from the checkpoint's own configuration and override the special-token ids
# with the ones reported by the tokenizer.
config = AutoConfig.from_pretrained(
    model_id,
    # vocab_size=len(tokenizer),
    # NOTE(review): `n_ctx` looks like a leftover from the GPT-2 setup hinted at by
    # the commented-out lines; confirm that the Pythia (GPT-NeoX) config reads it.
    n_ctx=1024,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
# model = GPT2LMHeadModel(config)
# Load the pretrained weights for `model_id` with the configuration above.
model = AutoModelForCausalLM.from_pretrained(model_id, config=config)

# Load dataset
# Hold out 1% of the tokenized examples for evaluation. The seed makes the split
# reproducible across runs. The guard keeps this cell idempotent: once a "test"
# split exists, re-running the cell no longer re-splits (and shrinks) "train".
if "test" not in tokenized_dataset:
    tokenized_dataset = tokenized_dataset["train"].train_test_split(
        test_size=0.01, seed=42
    )

# Collator
# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyperparameters plus logging/evaluation settings for the fine-tuning run.
args = TrainingArguments(
    output_dir="results",
    num_train_epochs=5,
    per_device_train_batch_size=100,
    per_device_eval_batch_size=100,
    learning_rate=5e-5,
    # `eval_strategy` is the current name of the former `evaluation_strategy`
    # argument; the old spelling is deprecated in transformers.
    eval_strategy="steps",
    # A float below 1 is interpreted as a fraction of the total training steps.
    eval_steps=0.01,
    logging_steps=1,
    gradient_accumulation_steps=1,
    weight_decay=0.1,
    warmup_steps=0,
    lr_scheduler_type="cosine",
    # fp16=True,
    report_to="wandb",
    remove_unused_columns=False, # Fix for Pythia
    # No automatic upload during training; if the model is pushed manually later,
    # the Hub repository is created as private.
    push_to_hub=False,
    hub_private_repo=True,
)

# Wire the model, training arguments, collator and the train/test splits together.
# NOTE(review): newer transformers releases appear to call the `tokenizer` argument
# `processing_class` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run the fine-tuning loop; the returned TrainOutput is shown as the cell output.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mmlabonne[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
1,2.852,3.107444
2,3.0923,2.387895
3,2.3371,2.102479
4,2.1166,1.976068
5,2.0538,1.844624
6,1.8972,1.746959
7,1.8356,1.661514
8,1.702,1.618741
9,1.6907,1.66264
10,1.5877,1.619191


TrainOutput(global_step=50, training_loss=1.49316641330719, metrics={'train_runtime': 72.014, 'train_samples_per_second': 65.751, 'train_steps_per_second': 0.694, 'total_flos': 162445110804480.0, 'train_loss': 1.49316641330719, 'epoch': 5.0})

In [None]:
# Upload the trained model and training arguments to the Hugging Face Hub.
# The returned CommitInfo is shown as the cell output.
trainer.push_to_hub()

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/282M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mlabonne/results/commit/011fd6ff404a9d5e4461aa0766412842d9f1f9e2', commit_message='End of training', commit_description='', oid='011fd6ff404a9d5e4461aa0766412842d9f1f9e2', pr_url=None, pr_revision=None, pr_num=None)