# Setup


In [1]:
# Switch between the Google Colab setup (package install, Drive mount) and the
# local-machine setup (GPU selection, local data path) used by later cells.
run_in_colab = True

In [None]:
if run_in_colab:
  !pip install transformers
  !pip install wandb

In [3]:
import html
import logging
import os
import pickle
import time

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer,  AdamW, get_linear_schedule_with_warmup

In [4]:
# Outside Colab, restrict CUDA to one physical GPU: order devices by PCI bus id
# and expose only device '2' to this process.
if not run_in_colab:
  os.environ.update({
      'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
      'CUDA_VISIBLE_DEVICES': '2',
  })

In [None]:
import wandb

# Authenticate with Weights & Biases; the run itself is created later with `wandb.init`.
wandb.login()

# GPT2

In [6]:
# Checkpoint name passed to the tokenizer / config / model `from_pretrained` calls.
GPT2_TYPE = "gpt2"
# Number of `games_data<i>.p` pickles; NOTE: this value is reassigned in a later cell.
NUMER_OF_DATA_DIRS = 13
# Batch size used by both the train and the test DataLoader.
BATCH_SIZE = 2
# NOTE(review): not referenced anywhere else in the visible notebook.
train_precentege = 0.9

In [None]:
# Load the pretrained tokenizer and register '[PAD]' as its padding token so that
# `padding="max_length"` (used in MovesDataset) can pad every sample to a fixed length.
# This adds a token to the vocabulary, hence the embedding resize on the model below.
tokenizer =  GPT2Tokenizer.from_pretrained(GPT2_TYPE)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [None]:
# Load the pretrained configuration and weights and move the model to the GPU.
configuration = GPT2Config.from_pretrained(GPT2_TYPE)
gpt_model = GPT2LMHeadModel.from_pretrained(GPT2_TYPE, config = configuration).cuda()
# Put the model in training mode for the fine-tuning loop further down.
gpt_model.train()
# Resize the embedding matrix to the tokenizer's vocabulary, which now includes '[PAD]'.
gpt_model.resize_token_embeddings(len(tokenizer))

# Dataset

In [9]:
class MovesDataset(Dataset):
    """Language-modelling dataset built from pickled (board, moves, comment) records.

    Each record is rendered as
    ``<board>{board}<moves>{moves}<comment>{comment}<|endoftext|>`` and tokenized
    to exactly ``max_length`` ids (truncated or padded). The labels equal the
    input ids on the comment / end-of-text positions and ``-100`` everywhere
    else (prompt and padding), so only the comment contributes to the loss.

    Args:
        paths: iterable of pickle file paths. Each file must unpickle to a
            sequence of records whose items 0, 1 and 2 are the board state,
            the recent moves and the comment (sliceable strings).
        tokenizer: callable following the Hugging Face tokenizer call
            convention, i.e. ``tokenizer(text, truncation=..., max_length=...,
            padding=...)`` returning ``input_ids`` and ``attention_mask``.
        max_length: number of token ids per sample; also used as the maximum
            number of characters kept from each record field.
    """

    def __init__(self, paths,tokenizer, max_length=768):
        self.board_state_token = '<board>'
        self.recent_moves_token = '<moves>'
        self.comment_token = '<comment>'
        self.end_of_text_token = "<|endoftext|>"

        self.proccessed_data = []
        self.attn_masks = []
        self.labels = []
        for path in paths:
            with open(path, 'rb') as file:
                # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
                raw_data = pickle.load(file)
            for data_object in raw_data:
                text = self.convert_data_to_text(data_object, max_length)

                # Everything up to and including the comment marker is the prompt.
                prompt_end = text.find(self.comment_token) + len(self.comment_token)
                # Use the same max_length as the full text: with a different limit the
                # prompt length could exceed the sample length and misalign the labels.
                enc_prompt = tokenizer(text[:prompt_end], truncation=True,
                                       max_length=max_length, padding="max_length")
                # NOTE(review): assumes the prompt tokenizes to the same number of
                # tokens on its own as it does inside the full text — confirm.
                prompt_len = sum(enc_prompt['attention_mask'])

                enc_dict = tokenizer(text, truncation=True, max_length=max_length,
                                     padding="max_length")
                input_ids = enc_dict['input_ids']
                attention_mask = enc_dict['attention_mask']

                # -100 is ignored by the loss: mask the prompt and every padded position.
                labels = [
                    token_id if position >= prompt_len and mask == 1 else -100
                    for position, (token_id, mask) in enumerate(zip(input_ids, attention_mask))
                ]

                self.proccessed_data.append(torch.tensor(input_ids))
                self.attn_masks.append(torch.tensor(attention_mask))
                self.labels.append(torch.tensor(labels))

    def convert_data_to_text(self, data_object, max_length):
        """Render one record as a single marker-delimited string.

        Each of the three fields is cut to at most ``max_length`` characters
        before being joined with the board / moves / comment markers and the
        end-of-text token.
        """
        board = data_object[0][:max_length]
        moves = data_object[1][:max_length]
        comment = data_object[2][:max_length]
        return (
            f"{self.board_state_token}{board}"
            f"{self.recent_moves_token}{moves}"
            f"{self.comment_token}{comment}"
            f"{self.end_of_text_token}"
        )

    def __len__(self):
        """Number of encoded samples."""
        return len(self.proccessed_data)

    def __getitem__(self, index):
        """Return the ``(input_ids, attention_mask, labels)`` tensors of one sample."""
        return self.proccessed_data[index], self.attn_masks[index], self.labels[index]

In [10]:
# Root folder of the project data: the local disk by default, or the mounted
# Google Drive folder when running on Colab.
BASE_PATH = '/disk2/danielroich/NLP/'
if run_in_colab:
  from google.colab import drive
  drive.mount('/content/drive')
  BASE_PATH = '/content/drive/MyDrive/NLP/'

Mounted at /content/drive


In [11]:
# Common prefix of the pickled data files; file i is addressed as f'{games_data_path}{i}.p'.
games_data_path = BASE_PATH + 'Data/FEN/games_data'

In [12]:
# Overrides the value of 13 set in the GPT2 section: with 2, the training set built in the
# next cell (range(NUMER_OF_DATA_DIRS-1)) contains only `games_data1.p`.
NUMER_OF_DATA_DIRS =2

In [13]:
# Training set: pickles 1 .. NUMER_OF_DATA_DIRS-1, tokenized eagerly by MovesDataset.
dataset = MovesDataset([f'{games_data_path}{i+1}.p' for i in range(NUMER_OF_DATA_DIRS-1)], tokenizer) 
# The last pickle is left out of the range above and kept for testing
# (about 5% of the data, per the original author's note).

In [14]:
# Shuffled mini-batches of (input_ids, attention_mask, labels) tensors for training.
train_dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Train


In [None]:
# Start the W&B run; `config` records the hyper-parameters with the run.
# NOTE(review): these values are typed in by hand and duplicate BATCH_SIZE, the optimizer
# learning rate and `epochs` defined in other cells — keep them in sync.
run = wandb.init(project="LmChess", config={'batch size' : 2, 'lr' : 3e-5, 'epochs' : 100})

In [16]:
# Pick one fixed sample whose generated comment is logged periodically during training.
# NOTE(review): this sample is read from `games_data1.p`, which is also part of the
# training set built above, so it is not a held-out validation example.
with open(f'{games_data_path}1.p', 'rb') as file:
    # SECURITY: pickle.load can execute arbitrary code; only load trusted files.
    validation_data = pickle.load(file)[0]

# Split once at the first comment marker: the left part (plus the marker) is the prompt,
# the right part is the reference comment. maxsplit=1 keeps the whole comment even if the
# marker text appears inside it.
textual_validation_data = dataset.convert_data_to_text(validation_data, 798).split(dataset.comment_token, 1)
validation_target_text = textual_validation_data[1]
validation_input_text = textual_validation_data[0] + dataset.comment_token
validation_input_encoding  = tokenizer.encode(validation_input_text, return_tensors="pt").cuda()

# Escape the text before embedding it in HTML; otherwise the <board>/<moves>/<comment>
# markers are parsed as HTML tags instead of being displayed.
wandb.log({"validation_target_text": wandb.Html(f'<p>{html.escape(validation_target_text)}</p>')})
wandb.log({"validation_input_text": wandb.Html(f'<p>{html.escape(validation_input_text)}</p>')})


In [17]:
optimizer = AdamW(gpt_model.parameters(), lr=3e-5)

# The linear schedule decays the learning rate from its peak (reached after the warm-up)
# to 0 at `num_training_steps`. The previous value of -1 made the decay factor negative,
# which the schedule clamps to 0 — i.e. the learning rate dropped to 0 right after warm-up.
# The scheduler is stepped once per batch, so the horizon is batches-per-epoch * epochs.
# The epoch count is read from the W&B run config; it must match `epochs` in the training cell.
total_training_steps = len(train_dataloader) * run.config['epochs']
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=5000, num_training_steps=total_training_steps
)

In [18]:
# Folder that receives the model checkpoints written by the training loop.
saved_models_path = BASE_PATH + 'models/'

In [None]:
epochs = 100

# Fine-tuning loop: one optimizer + scheduler step per batch.
# Every 2000 batches a comment is generated for the fixed validation prompt and logged to
# W&B; every 50000 batches and at the end of each epoch a checkpoint is written.
for epoch in range(epochs):
    # The bar counts batches: total and increment must use the same unit.
    with tqdm(total=len(train_dataloader)) as pbar:
        for idx,entry in enumerate(train_dataloader):
            if idx % 2000 == 0 and idx != 0:
                # Generate in eval mode so dropout does not perturb the sample,
                # then switch back to training mode.
                gpt_model.eval()
                with torch.no_grad():
                    outputs = gpt_model.generate(validation_input_encoding,num_beams=2, no_repeat_ngram_size=2, max_length = 769)
                    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
                gpt_model.train()
                # Escape the text so the <board>/<moves>/<comment> markers are shown
                # instead of being parsed as HTML tags.
                wandb.log({"output_text": wandb.Html(f'<p>{html.escape(output_text)}</p>')})
            if idx % 50000 == 0 and idx != 0:
              # `loss` here is the loss of the previous batch.
              torch.save(gpt_model.state_dict(), f'{saved_models_path}{idx}_{time.time()}_{int(loss)}.bin')

            gpt_model.zero_grad()
            inputs = entry[0].cuda()
            attn_masks = entry[1].cuda()
            labels = entry[2].cuda()
            outputs = gpt_model(inputs, labels=labels, attention_mask = attn_masks)
            loss = outputs['loss']
            loss.backward()
            optimizer.step()
            scheduler.step()
            # Log a plain float rather than a tensor that is still attached to the graph.
            wandb.log({"epoch": epoch, "loss": loss.item()})
            pbar.update(1)
        
        # save model after full epoch
        torch.save(gpt_model.state_dict(), f'{saved_models_path}{idx}_{time.time()}_{int(loss)}.bin')

In [None]:
# # Re-load the saved model
# NOTE(review): the checkpoints are written from `gpt_model` after
# `resize_token_embeddings(len(tokenizer))`, so a model built from `configuration` alone
# presumably needs the same resize before `load_state_dict` — confirm before re-enabling.
# output_model_file = BASE_PATH + 'models/0_1614526824.713789_0.bin'
# model = GPT2LMHeadModel(configuration)
# state_dict = torch.load(output_model_file)
# model.load_state_dict(state_dict)

# Evaluation

Every t iterations, compute the evaluation metrics for the current model and save the results.

**bleurt**

https://huggingface.co/metrics/bleurt

In [None]:
!pip install datasets

In [None]:
from datasets import load_metric

In [None]:
# Load the BLEURT metric through the `datasets` metric loader.
metric = load_metric("bleurt")

In [None]:
import tensorflow as tf
# NOTE(review): presumably registers a dummy `f` flag so that flag parsing inside BLEURT
# does not fail on the `-f <connection file>` argument the Jupyter kernel is started
# with — confirm.
tf.compat.v1.flags.DEFINE_string('f','','')

In [None]:
# Toy prediction / reference pair used to smoke-test the metric in the next cell.
gen_text = "I am walking on the promenade today"
ref_text = "I am walking along the promenade on this sunny day"

In [None]:
# Score the toy pair; predictions and references are parallel lists of strings.
metric.compute(predictions=[gen_text], references=[ref_text])

In [None]:
# Load the raw records of pickle 13 as the test data.
# NOTE(review): the index 13 is hard-coded and independent of NUMER_OF_DATA_DIRS.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
with open(f'{games_data_path}13.p', 'rb') as file:
    test_data = pickle.load(file)

In [None]:
# Number of raw records in the test pickle.
len(test_data)

In [None]:
# Generate a comment for each selected test record. The full decoded sequence is printed;
# the generated continuation (without the prompt) is collected in `output_texts` so the
# metric cell below has predictions to score.
output_texts = []
# Generate in eval mode so dropout does not perturb the output.
gpt_model.eval()
for data in test_data[:1]:
  textual_data = dataset.convert_data_to_text(data, 798).split('<comment>')
  target_text = textual_data[1]
  input_text = textual_data[0] +'<comment>'
  input_encoding  = tokenizer.encode(input_text, return_tensors="pt").cuda()
  with torch.no_grad():
    outputs = gpt_model.generate(input_encoding,num_beams=2, no_repeat_ngram_size=2, max_length = 769)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The returned sequence starts with the prompt ids; keep only what follows them.
    generated_comment = tokenizer.decode(outputs[0][input_encoding.shape[-1]:], skip_special_tokens=True)
  print(output_text)
  output_texts.append(generated_comment)

In [None]:
# The references must be strings parallel to the predictions: use the comment field
# (item 2 of each record) of exactly the records that predictions were generated for,
# instead of the raw records of the whole test set.
reference_comments = [record[2] for record in test_data[:len(output_texts)]]
metric.compute(predictions=output_texts, references=reference_comments)

**Perplexity** 

https://huggingface.co/transformers/perplexity.html

In [None]:
# Tokenized test set built from pickle 13 (the held-out file, about 5% of the data per
# the original author's note).
test_dataset = MovesDataset([f'{games_data_path}13.p'], tokenizer) # last pickle for test 5%
# NOTE(review): shuffling does not change the averaged loss; shuffle=True is not needed here.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# `model` is only defined by the commented-out re-load cell, so on a fresh kernel the
# original `model = model.cuda()` raises NameError. Evaluate the fine-tuned `gpt_model`
# instead, on the GPU and in eval mode (dropout disabled).
model = gpt_model.cuda().eval()

In [None]:
# Accumulate the mean language-modelling loss of every test batch.
# Dropout must be off while measuring the loss.
model.eval()
eval_loss = 0 
# The bar counts batches: total and increment must use the same unit.
with tqdm(total=len(test_dataloader)) as pbar:
  for idx,entry in enumerate(test_dataloader):
    with torch.no_grad():
      inputs = entry[0].cuda()
      attn_masks = entry[1].cuda()
      # Score the same targets as in training (the dataset's label tensor, which masks the
      # prompt with -100) and additionally ignore every padded position; using the input
      # ids as labels would count padding and prompt tokens in the perplexity.
      labels = entry[2].cuda().masked_fill(attn_masks == 0, -100)
      outputs = model(inputs, labels=labels, attention_mask = attn_masks)
    loss = outputs[0]
    eval_loss += loss.mean().item()
    pbar.update(1)

In [None]:
# `eval_loss` is a sum of per-batch mean losses, so average over the real number of
# batches (this stays correct for any BATCH_SIZE and for a smaller last batch).
final_eval_loss = eval_loss / len(test_dataloader)
# Perplexity is the exponential of the average cross-entropy loss.
perplexity = torch.exp(torch.tensor(final_eval_loss))

In [None]:
# Display the test-set perplexity computed in the previous cell.
perplexity

**bleu**

The BLEU score indicates how similar the candidate text is to the reference texts: 1.0 means a perfect match and 0.0 means no overlap.

https://www.journaldev.com/46659/bleu-score-in-python

https://machinelearningmastery.com/calculate-bleu-score-for-text-python/

In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Toy BLEU example: one tokenized candidate scored against four tokenized references.
reference = [
    sentence.split()
    for sentence in ('this is a dog', 'it is dog', 'dog it is', 'a dog, it is')
]

candidate = 'it is a dog'.split()
print('BLEU score -> {}'.format(sentence_bleu(reference, candidate)))

**bleu2** ? 