In [14]:
# Clone the NativLang repository (the following cell changes into the cloned directory).
!git clone https://github.com/naafidasana/NativLang.git

In [15]:
# Work from the repository root so the relative paths used below
# (models/, utils/, configs/, ./data, ./checkpoints) resolve.
%cd NativLang/

Training...


In [None]:
# Mount Google Drive at /content/drive (Colab only). Checkpoints are loaded from
# and copied to a folder on this mount later in the notebook.
# NOTE(review): `files` is imported but not used in the visible cells.
from google.colab import drive, files
drive.mount("/content/drive")

In [None]:
from models.gpt2_model import GPTModel, GPTConfig
from utils.general_utils import Visualizer, MetricAccumulator, Timer, get_gpus, download_and_extract
from utils.dataloaders import get_gpt_batch, read_data_for_gpt
from utils.tokenizer import BPETokenizer

import torch
import torch.nn as nn
import numpy as np

import os
import shutil

In [None]:
# Read the training corpus from the dataset directory.
# NOTE(review): whether read_data_for_gpt downloads missing data is not visible here;
# download_and_extract is imported but never called in the visible cells — confirm
# that DATA_DIR is expected to exist already.
DATA_DIR = "./data/dag-sents-train"
text = read_data_for_gpt(DATA_DIR)

# Load the pretrained BPE tokenizer and record its vocabulary size
tokenizer = BPETokenizer.from_pretrained("configs/dagpt-base-uncased-tokenizer.json")
vocab_size = tokenizer.vocab_size

# Encode the whole corpus with the tokenizer and keep the result as a NumPy array
enc_text = np.array(tokenizer.encode(text))

In [None]:
# Training hyperparameters. `batch_size` is read as a global by train_gpt.
context_length = 128
batch_size = 52

# Create the model config (the model itself is built later by fetch_model).
# NOTE(review): vocab_size+1 presumably reserves one extra token id — confirm
# against the tokenizer and the checkpoint being loaded.
config = GPTConfig(vocab_size+1, context_length)

# Get the available devices and how many there are
devices, num_devices = get_gpus()

In [None]:
def fetch_model(config, checkpoint_path=None):
  """Build a GPTModel and move it to the first available device.

  Args:
    config: GPTConfig used to construct (or restore) the model.
    checkpoint_path: optional path passed to ``GPTModel.from_pretrained``;
      when ``None`` a freshly initialised model is created instead.

  Returns:
    The model after ``.to(devices[0])``.
  """
  if checkpoint_path is None:
    # No checkpoint given: start from a freshly constructed model.
    net = GPTModel(config)
  else:
    # Report only the last path component of the checkpoint being restored.
    ckpt_name = checkpoint_path.split('/')[-1]
    print(f"Loading from {ckpt_name} ...")
    net = GPTModel.from_pretrained(checkpoint_path, config)
  return net.to(devices[0])

In [None]:
def grad_clipping(model, theta):
    """Rescale gradients in place so their global L2 norm does not exceed ``theta``.

    Args:
        model: module whose trainable parameters' ``.grad`` tensors are clipped.
        theta: maximum allowed L2 norm over all gradients taken together.

    Parameters that are frozen or that have no gradient yet are ignored; if no
    parameter has a gradient the function is a no-op.
    """
    # `requires_grad` is a boolean attribute, not a method. Parameters that did
    # not take part in the last backward pass have `grad is None` and are skipped.
    grads = [p.grad for p in model.parameters()
             if p.requires_grad and p.grad is not None]
    if not grads:
        return

    # Global norm: square root of the sum of squared entries over every gradient.
    norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))
    if norm > theta:
        scale = theta / norm
        for g in grads:
            g.mul_(scale)


def get_gpt_batch_loss(model, input_sequences, targets):
    """Run a forward pass and return the summed loss.

    The model is called with both inputs and targets and is expected to return
    a ``(logits, loss)`` pair; only the loss is used. Summing reduces the loss
    to a single value even when the model returns more than one loss entry.
    """
    outputs = model(input_sequences, targets)
    _, batch_loss = outputs
    return batch_loss.sum()

In [None]:
def train_gpt(config, model, train_data, learning_rate, num_steps, checkpoint_every=1000):
    """Train ``model`` for ``num_steps`` optimizer steps with AdamW.

    Args:
        config: model config; ``config.context_length`` sets the sequence length
            requested from ``get_gpt_batch``.
        model: the model to train. It is moved to ``devices[0]`` and wrapped in
            ``nn.DataParallel`` when two or more devices are available.
        train_data: encoded training corpus handed to ``get_gpt_batch``.
        learning_rate: AdamW learning rate.
        num_steps: number of optimizer steps to run.
        checkpoint_every: save a checkpoint (and copy it to Google Drive) every
            this many steps.

    Relies on the notebook globals ``devices``, ``num_devices`` and ``batch_size``.
    Checkpoints are written to ``./checkpoints`` and then copied to the Google
    Drive folder, which must already exist (Drive mounted).
    """
    # Local checkpoint directory; created on first use.
    checkpoint_path = "./checkpoints"
    os.makedirs(checkpoint_path, exist_ok=True)

    # Get and move model to device.
    if num_devices >= 2:
        # Use parallel processing on multiple GPUs.
        model = nn.DataParallel(model, device_ids=devices).to(devices[0])
    else:
        # Still explicitly move model to device in case there is only one GPU.
        model.to(devices[0])

    # The notebook calls model.eval() before training; switch back to train
    # mode so train-only layers (e.g. dropout) behave as intended.
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    step, timer = 0, Timer()
    visualizer = Visualizer(xlabel="step", ylabel="loss",
                            xlim=[1, num_steps])
    # Accumulates: summed loss, number of tokens processed, number of steps.
    metrics = MetricAccumulator(3)

    print("Training...")
    while step < num_steps:
        # Get batch inputs and targets
        batch_xs, batch_ys = get_gpt_batch(train_data, config.context_length, batch_size)
        batch_xs = batch_xs.to(devices[0], non_blocking=True)
        batch_ys = batch_ys.to(devices[0], non_blocking=True)

        # Reset any previously computed gradients
        optimizer.zero_grad()

        # Do forward pass and fetch loss
        timer.start()
        loss = get_gpt_batch_loss(model, batch_xs, batch_ys)

        # Backward pass (gradient clipping is currently not applied)
        loss.backward()
        optimizer.step()

        # Record a plain float so the accumulator never holds a tensor attached
        # to the autograd graph, and count tokens (all elements of the batch),
        # not sequences, so the "tokens/sec" figure below is accurate.
        metrics.add(loss.item(), batch_xs.numel(), 1)
        timer.stop()
        visualizer.add(step + 1, metrics[0] / metrics[2])

        step += 1

        # Save the model every `checkpoint_every` steps.
        if step % checkpoint_every == 0:
            checkpoint_name = os.path.join(checkpoint_path, f"dagpt-v3-{step:03}.pth")
            # NOTE(review): when wrapped in nn.DataParallel the state_dict keys
            # carry a "module." prefix — confirm GPTModel.from_pretrained copes.
            torch.save(model.state_dict(), checkpoint_name)

            print(f"Loss: {metrics[0]/metrics[2]:.4f}")
            print(f"{metrics[1]/timer.sum():.1f} tokens/sec on {str(devices)}")

            # Copy the checkpoint that was just written to the Google Drive
            # folder. Reusing `checkpoint_name` keeps the copy in sync with the
            # saved file name.
            destination_folder = "/content/drive/My Drive/NativLang/DaGPT"
            shutil.copy(checkpoint_name, destination_folder)


In [None]:
def try_generate(model, seq, max_tokens=10):
    """Generate a continuation of ``seq``, print it, and return it.

    Args:
        model: a model exposing ``generate(token_ids, max_tokens)``, optionally
            wrapped in ``nn.DataParallel``.
        seq: prompt text; encoded with the notebook-level ``tokenizer``.
        max_tokens: passed through to ``generate``.

    Returns:
        The decoded text of the sequence returned by ``generate``.
    """
    # nn.DataParallel does not forward custom methods; call generate on the
    # wrapped module. Errors raised by generate itself now propagate instead of
    # being swallowed and retried.
    generator = model.module if isinstance(model, nn.DataParallel) else model

    # Encode the prompt and add a leading batch dimension of size 1.
    prompt_ids = torch.tensor(tokenizer.encode(seq)).unsqueeze(0).to(devices[0])

    # Generate in eval mode, then restore whatever mode the model was in so
    # this can be called from inside the training loop.
    was_training = generator.training
    generator.eval()
    try:
        with torch.no_grad():
            gen_seq = generator.generate(prompt_ids, max_tokens)
    finally:
        generator.train(was_training)

    # Decode the first (only) sequence of the batch.
    decoded = tokenizer.decode(gen_seq.cpu().numpy().tolist()[0])
    print(decoded)
    return decoded

In [None]:
# Load the model from a checkpoint stored on Google Drive.
# NOTE(review): train_gpt saves checkpoints with a ".pth" suffix, but this path
# has none — confirm what GPTModel.from_pretrained expects.
checkpoint_path = "/content/drive/My Drive/NativLang/DaGPT/dagpt-v3-5000"
#checkpoint_path = "/content/drive/Shared with me/NativLang/DaGPT/dagpt-v2-5000"
model = fetch_model(checkpoint_path=checkpoint_path, config=config)

In [None]:
# Report the model size in millions of parameters, then display the model.
# NOTE(review): eval() leaves the model in evaluation mode; the training cell
# below runs after this one — confirm training is meant to start from eval mode.
print(f"Number of Parameters: {sum(p.numel() for p in model.parameters())/1e6} M")
model.eval()

In [None]:
# Set the learning rate and the number of training steps, then train the model.
lr = 3e-5
num_steps = 5000
train_gpt(config, model, enc_text, lr, num_steps)

In [None]:
# Example generations.
# try_generate prints the decoded text itself, so only a separator and a blank
# line are printed here (printing its return value used to emit a stray "None").
seqs = ["di nyɛla shikuru", "Laɣingu maa yɛltɔɣa kpani daa nyɛla"]
for seq in seqs:
  try_generate(model, seq, max_tokens=40)
  print("="*150)
  print(" ")