# Setup


In [1]:
run_in_colab = True

In [None]:
if run_in_colab:
  !pip install transformers
  !pip install wandb
  from google.colab import drive
  drive.mount('/content/drive')

In [3]:
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
import pickle
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import numpy as np
import torch
import logging
import os
import time

In [None]:
import wandb

wandb.login()

In [4]:
# if not run_in_colab:
#   os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
#   os.environ['CUDA_VISIBLE_DEVICES']='2'

In [5]:
if run_in_colab:
  BASE_PATH = '/content/drive/MyDrive/NLP/'
else:
  BASE_PATH = '/home/joberant/nlp_fall_2021/nofarm/chess/'

# GPT2

In [6]:
GPT2_TYPE = "gpt2"
BATCH_SIZE = 2
train_precentege = 0.9

In [None]:
tokenizer =  GPT2Tokenizer.from_pretrained(GPT2_TYPE)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

upgrade vocab

In [8]:
dataset_tokens = {'additional_special_tokens': ['<fen>', '<moves>', '<last move description>', '<legal moves>', '<attacked by>', '<attacks>', '<comment>']}

In [None]:
tokenizer.add_special_tokens(dataset_tokens)

In [10]:
board_notation = []
col_names = 'abcdefgh'
row_names = '87654321'
for col in col_names:
  for row in row_names:
    board_notation.append(col+row)

In [None]:
# moves = []
# pieces = 'KQRBN'
# for piece in pieces:
#   for cell in board_notation:
#     moves.append(piece+cell) # Ng4
#     moves.append(piece+"x"+cell) # Qxf6
#     for col in col_names:
#       moves.append(piece+col+cell) # Nbc6
#       moves.append(piece+col+"x"+cell) # Ngxe7
#     for target_cell in board_notation: 
#       if cell != target_cell:
#         moves.append(piece+cell+target_cell) # Ra5a6
#         moves.append(piece+cell+"x"+target_cell) # Re6xe7

In [None]:
# # check + checkmate
# check_moves = []
# for move in moves:
#   check_moves.append(move+"+") # Nxg2+
#   check_moves.append(move+"#") # Rd2#

In [None]:
# chess_vocab = board_notation + moves + check_moves + ["O-O"]

In [None]:
tokenizer.add_tokens(board_notation) 

In [None]:
# tokenizer.get_added_vocab()

model

In [None]:
configuration = GPT2Config.from_pretrained(GPT2_TYPE)
gpt_model = GPT2LMHeadModel.from_pretrained(GPT2_TYPE, config = configuration).cuda()
gpt_model.train()
gpt_model.resize_token_embeddings(len(tokenizer))

# Dataset

In [35]:
data_to_use = {'<fen>':True, '<moves>':True, '<last move description>':False, '<legal moves>':False, '<attacked by>':False, '<attacks>':False}

In [26]:
def convert_data_to_text(data_object, max_length=768, end_of_text_token= "<|endoftext|>"):
    (FEN, moves, last_move_desc, legal_moves, attackers_list, attacks_list, comment) = data_object
    (FEN, moves, last_move_desc, legal_moves, attackers_list, attacks_list, comment) = (FEN[:max_length], moves[:max_length], last_move_desc[:max_length], legal_moves[:max_length], \
                                                                                        attackers_list[:max_length], attacks_list[:max_length], comment[:max_length])
    token_to_data = {'<fen>':FEN, '<moves>':moves, '<last move description>':last_move_desc, '<legal moves>':legal_moves, 
                   '<attacked by>':attackers_list, '<attacks>':attacks_list}
    text = ""
    for token in token_to_data.keys():
      if data_to_use[token]:
        text += f"{token} {token_to_data[token]} "
    text += f"<comment> {comment} {end_of_text_token}"  # comment always included at the end + end token
    
    return text

In [30]:
class MovesDataset(Dataset):
    def __init__(self, paths,tokenizer, max_length=768):
        
        self.comment_encoding = tokenizer('<comment>')['input_ids'][0]  # 50266
        
        self.proccessed_data = []
        self.attn_masks = []
        self.labels = []

        for path in paths:
            with open(path, 'rb') as file:
              raw_data = pickle.load(file)
            for data_object in raw_data:
              text = convert_data_to_text(data_object)

              enc_text = tokenizer(text, truncation = True, max_length= max_length, padding = "max_length")

              inputs = enc_text['input_ids']
              label_idx = inputs.index(self.comment_encoding) +1
              labels = [-100] * label_idx + inputs[label_idx:]

              self.proccessed_data.append(torch.tensor(inputs))
              self.attn_masks.append(torch.tensor(enc_text['attention_mask']))
              self.labels.append(torch.tensor(labels))

    def __len__(self):
        return len(self.proccessed_data)

    def __getitem__(self, index):
        return self.proccessed_data[index], self.attn_masks[index], self.labels[index]

In [14]:
games_data_path = BASE_PATH + 'Data/NEW/games_data'

In [15]:
NUMER_OF_DATA_DIRS = 12

In [None]:
train_dataset = MovesDataset([f'{games_data_path}{i+1}.p' for i in range(NUMER_OF_DATA_DIRS-1)], tokenizer) 

In [31]:
# last pickle for test 5%
test_dataset = MovesDataset([f'{games_data_path}{NUMER_OF_DATA_DIRS}.p'], tokenizer) 

In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [32]:
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Train


In [None]:
run = wandb.init(project="LmChess", config={'batch size' : 2, 'lr' : 3e-5, 'epochs' : 100})

In [36]:
with open(f'{games_data_path}{NUMER_OF_DATA_DIRS}.p', 'rb') as file:
    validation_data = pickle.load(file)

In [37]:
textual_validation_data = convert_data_to_text(validation_data[0], 798).split('<comment>')

validation_target_text = textual_validation_data[1]
validation_input_text = textual_validation_data[0] +'<comment>'
validation_input_encoding  = tokenizer.encode(validation_input_text, return_tensors="pt").cuda()

wandb.log({"validation_target_text": wandb.Html(f'<p>{validation_target_text}</p>')})
wandb.log({"validation_input_text": wandb.Html(f'<p>{validation_input_text}</p>')})

In [40]:
optimizer = AdamW(gpt_model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=5000, num_training_steps=-1
)

In [41]:
saved_models_path = BASE_PATH + 'Models/'
loss = 0

In [None]:
epochs = 100

for epoch in range(epochs):
    with tqdm(total=len(train_dataset) / 2) as pbar:
        for idx,entry in enumerate(train_dataloader):
            if idx % 2000 == 0 and idx != 0:
              with torch.no_grad():
                  outputs = gpt_model.generate(validation_input_encoding,num_beams=2, no_repeat_ngram_size=2, max_length = 769)
                  output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
              wandb.log({"output_text": wandb.Html(f'<p>{output_text}</p>')})
            if idx % 50000 == 0:
              torch.save(gpt_model.state_dict(), f'{saved_models_path}{idx}_{time.time()}_{int(loss)}.bin')

            gpt_model.zero_grad()
            inputs = entry[0].cuda()
            attn_masks = entry[1].cuda()
            labels = entry[2].cuda()
            outputs = gpt_model(inputs, labels=labels, attention_mask = attn_masks)
            loss = outputs['loss']
            loss.backward()
            optimizer.step()
            scheduler.step()
            wandb.log({"epoch": epoch, "loss": loss})
            pbar.update(2)
        
        # save model after full epoch
        torch.save(gpt_model.state_dict(), f'{saved_models_path}{idx}_{time.time()}_{int(loss)}.bin')

In [None]:
# # Re-load the saved model
# output_model_file = BASE_PATH + 'models/0_1614526824.713789_0.bin'
# model = GPT2LMHeadModel(configuration)
# state_dict = torch.load(output_model_file)
# model.load_state_dict(state_dict)

# Evaluation

every t iterations calculate evaluation metrics for the current model and save the results

**bleurt**

https://huggingface.co/metrics/bleurt

In [None]:
!pip install git+https://github.com/google-research/bleurt.git

In [None]:
!pip install datasets

In [17]:
from datasets import load_metric

In [18]:
metric = load_metric("bleurt")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1895.0, style=ProgressStyle(description…




Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512').


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=405489453.0, style=ProgressStyle(descri…


INFO:tensorflow:Reading checkpoint /root/.cache/huggingface/metrics/bleurt/default/downloads/extracted/887f2dc36c17f53c287f696681b8f7c947278407c1cf9f226662e16c8c0dc417/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Performs basic checks...
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Loading model...
INFO:tensorflow:BLEURT initialized.


In [19]:
import tensorflow as tf
tf.compat.v1.flags.DEFINE_string('f','','')

<absl.flags._flagvalues.FlagHolder at 0x7fab04789310>

In [None]:
gen_text = "I am walking on the promenade today"
ref_text = "I am walking along the promenade on this sunny day"

In [None]:
metric.compute(predictions=[gen_text], references=[ref_text])

{'scores': [0.24435952305793762]}

In [23]:
with open(f'{games_data_path}{NUMER_OF_DATA_DIRS}.p', 'rb') as file:
    test_data = pickle.load(file)

In [None]:
input_texts = []
output_texts = []
for data in test_data[:3]:
  textual_data = convert_data_to_text(data, 798)
  input_texts.append(textual_data)
  textual_data = textual_data.split('<comment>')
  target_text = textual_data[1]
  input_text = textual_data[0] +'<comment> '
  input_encoding  = tokenizer.encode(input_text, return_tensors="pt").cuda()
  with torch.no_grad():
    outputs = gpt_model.generate(input_encoding,num_beams=2, no_repeat_ngram_size=2, max_length = 769)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  output_texts.append(output_text)

In [30]:
metric.compute(predictions=output_texts, references=input_texts)

{'scores': [-0.5330486297607422, -0.37327098846435547, -0.13228969275951385]}

**Perplexity** 

https://huggingface.co/transformers/perplexity.html

In [None]:
test_dataset = MovesDataset([f'{games_data_path}13.p'], tokenizer) # last pickle for test 5%
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
model = model.cuda()

In [None]:
eval_loss = 0 
with tqdm(total=len(test_dataset) / 2) as pbar:
  for idx,entry in enumerate(test_dataloader):
    with torch.no_grad():
      inputs = entry[0].cuda()
      attn_masks = entry[1].cuda()
      labels = entry[2].cuda()
      outputs = model(inputs, labels=labels, attention_mask = attn_masks)
    loss = outputs[0]
    eval_loss += loss.mean().item()
    pbar.update(2)

In [None]:
final_eval_loss = eval_loss / (len(test_dataset)/2)
perplexity = torch.exp(torch.tensor(final_eval_loss))

In [None]:
perplexity

**bleu**

This value indicates how similar the candidate text is to the reference texts

https://www.journaldev.com/46659/bleu-score-in-python

https://machinelearningmastery.com/calculate-bleu-score-for-text-python/

In [1]:
from nltk.translate.bleu_score import sentence_bleu

In [2]:
reference = [
    'this is a dog'.split(),
    'it is dog'.split(),
    'dog it is'.split(),
    'a dog, it is'.split() 
]
 
candidate = 'it is a dog'.split()
print('BLEU score -> {}'.format(sentence_bleu(reference, candidate)))

BLEU score -> 0.8408964152537145


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


**bleu2** ? 