# Lightweight Training Script with distilgpt2! 

This script uses the transcripts collected by the whisper-gpt team to train a minimal GPT model.
Specify the dataset path, the desired model checkpoint, the block size for training, and the number of epochs below before running the script.
We tokenize the input data, group the tokens into fixed-size blocks for more efficient processing, and pass the blocks to our model for training.


In [1]:
%%writefile starter_model_training.py

import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, create_optimizer, AdamWeightDecay, TFAutoModelForCausalLM
from transformers import DefaultDataCollator
from transformers import pipeline
import time
#most code is taken from the old huggingface script for language modeling with tensorflow


def tokenize_function(dat, model_checkpoint):
    """Tokenize the ``"text"`` entry of ``dat`` with the checkpoint's tokenizer.

    Args:
        dat: Mapping with a ``"text"`` entry; ``create_dataset`` calls this
            through ``datasets.map(..., batched=True)``.
        model_checkpoint: Identifier handed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer's output for ``dat["text"]``.
    """
    # The tokenizer is instantiated on every call to this function.
    checkpoint_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    texts = dat["text"]
    return checkpoint_tokenizer(texts)

def group_texts(dat, block_size = 64):
    """Concatenate tokenized examples and re-split them into fixed-size blocks.

    Adapted from the Hugging Face language-modeling example script.

    Args:
        dat: Mapping from column name (e.g. ``"input_ids"``) to a list of
            per-example token sequences. Must contain ``"input_ids"``.
        block_size: Number of tokens in each output block.

    Returns:
        A dict with the same keys as ``dat``, each mapped to a list of
        ``block_size``-long lists, plus a ``"labels"`` entry that is a copy
        of the ``"input_ids"`` block list. Tokens left over after the last
        full block are dropped.
    """
    # Flatten every column in a single linear pass. The previous
    # ``sum(dat[k], [])`` rebuilt the accumulated list for each example,
    # which is quadratic in the number of tokens.
    concatenated_examples = {
        k: [token for example in dat[k] for token in example]
        for k in dat.keys()
    }

    # The length of the first column decides how many tokens are kept.
    total_length = len(concatenated_examples[list(dat.keys())[0]])

    # Drop the remainder that does not fill a whole block (padding would be
    # the alternative if the model supports it).
    total_length = (total_length // block_size) * block_size

    # Split every column into consecutive chunks of block_size tokens.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # The labels are a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result


def compile_model(model_checkpoint, lr = 2e-5, weight_decay_rate = 0.01):
    """Load a causal language model and compile it with AdamWeightDecay.

    Args:
        model_checkpoint: Identifier handed to
            ``TFAutoModelForCausalLM.from_pretrained``.
        lr: Learning rate for the optimizer.
        weight_decay_rate: Weight-decay rate for the optimizer.

    Returns:
        The compiled model.
    """
    model = TFAutoModelForCausalLM.from_pretrained(model_checkpoint)

    # Pass the rate as ``learning_rate``: ``lr`` is a deprecated Keras alias
    # that only reaches the base optimizer through **kwargs and is not
    # accepted by newer Keras optimizers.
    optimizer = AdamWeightDecay(
        learning_rate=lr,
        weight_decay_rate=weight_decay_rate,
    )

    # No loss is given here, so the model's internal loss computation is used.
    model.compile(optimizer=optimizer)
    return model


def create_dataset(dataset_path, model_checkpoint, block_size = 64):
    """Load a dataset, tokenize it, and chunk it into fixed-size blocks.

    Args:
        dataset_path: Identifier handed to ``datasets.load_dataset``.
        model_checkpoint: Checkpoint whose tokenizer is used by
            ``tokenize_function``.
        block_size: Number of tokens per training block, forwarded to
            ``group_texts``. Defaults to 64, which is the value
            ``group_texts`` previously always used.

    Returns:
        The mapped dataset whose columns are the tokenizer outputs chunked
        into ``block_size``-long blocks, plus a ``"labels"`` column.
    """
    datasets = load_dataset(dataset_path)

    # Tokenize the text; the raw columns are removed so only tokenizer
    # outputs remain for the chunking step.
    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=4,
        remove_columns=["text", "id", "segments"],
        fn_kwargs={"model_checkpoint": model_checkpoint},
    )

    # Chunk the data. Each batch of 1000 examples is concatenated and
    # re-split independently by group_texts.
    lm_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        batch_size=1000,
        num_proc=4,
        fn_kwargs={"block_size": block_size},
    )

    return lm_datasets


def gen_text(model_checkpoint, model, seed_text, num_return_sequences = 3):
    """Generate continuations of ``seed_text`` and return them as one string.

    Args:
        model_checkpoint: Identifier used to load the tokenizer.
        model: Model handed to the ``"text-generation"`` pipeline.
        seed_text: Prompt to continue.
        num_return_sequences: Number of sequences requested from the pipeline.

    Returns:
        The ``"generated_text"`` of every returned sequence, joined with a
        single space.
    """
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=AutoTokenizer.from_pretrained(model_checkpoint),
    )
    outputs = generator(seed_text, num_return_sequences=num_return_sequences)
    return " ".join(sample["generated_text"] for sample in outputs)

Overwriting starter_model_training.py


In [2]:
# Import only the names the cells below use, instead of a wildcard import
# that silently pulls every name of the module into the notebook namespace.
from transformers import DefaultDataCollator

from starter_model_training import compile_model, create_dataset, gen_text

# Dataset identifier passed to create_dataset (and on to load_dataset).
DATASET_PATH = "Whispering-GPT/whisper-transcripts-the-verge"
# Checkpoint used for both the tokenizer and the model.
MODEL_CHECKPOINT = "distilgpt2"
# NOTE(review): BLOCK_SIZE is not passed to any call in the cells below, so
# the chunking step runs with its default block size of 64.
BLOCK_SIZE = 64
# Number of epochs passed to model.fit.
EPOCHS = 1


2022-11-17 19:36:38.874701: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load, tokenize and chunk the transcripts.
d = create_dataset(dataset_path=DATASET_PATH, model_checkpoint=MODEL_CHECKPOINT)

Using custom data configuration Whispering-GPT--whisper-transcripts-the-verge-423edd370c197473
Found cached dataset json (/Users/ArjunPatel/.cache/huggingface/datasets/Whispering-GPT___json/Whispering-GPT--whisper-transcripts-the-verge-423edd370c197473/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 179.85it/s]
Loading cached processed dataset at /Users/ArjunPatel/.cache/huggingface/datasets/Whispering-GPT___json/Whispering-GPT--whisper-transcripts-the-verge-423edd370c197473/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-3266fba3f757ad7c.arrow
Loading cached processed dataset at /Users/ArjunPatel/.cache/huggingface/datasets/Whispering-GPT___json/Whispering-GPT--whisper-transcripts-the-verge-423edd370c197473/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab/cache-ac382d491e73500d.arrow
Loading cached processed dataset at /Users/ArjunPatel/.cache/huggingface/datasets/Whispering-G

In [50]:
# Load the checkpoint and compile it with the default optimizer settings.
model = compile_model(model_checkpoint=MODEL_CHECKPOINT)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at distilgpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
  super().__init__(name, **kwargs)
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [51]:
# Collator configured to return TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")

# Turn the "train" split into a shuffled tf.data pipeline with batches of 16.
train_set = d["train"].to_tf_dataset(
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
    columns=["attention_mask", "input_ids", "labels"],
)

In [52]:
# Train for EPOCHS epochs on the batched training set; the object returned
# by fit is kept in mod_history.
mod_history = model.fit(train_set, epochs=EPOCHS)



In [53]:
# Save the trained model under ./trained_model.
# NOTE(review): if the model should later be reloaded with from_pretrained,
# save_pretrained may be the better fit — confirm before changing.
model.save("trained_model")



INFO:tensorflow:Assets written to: trained_model/assets


INFO:tensorflow:Assets written to: trained_model/assets


In [54]:
# Earlier version of gen_text, kept for reference. It returned the raw
# pipeline output (a list of dicts), which appears to be the shape of the
# output displayed for this cell.
# def gen_text(model_checkpoint, model, seed_text, num_return_sequences = 3):
#     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
#     p = pipeline("text-generation", model = model, tokenizer = tokenizer)
#     return p(seed_text, num_return_sequences = num_return_sequences)

# Sample three continuations of a Verge-style prompt from the trained model.
gen_text(
    model_checkpoint=MODEL_CHECKPOINT,
    model=model,
    seed_text="Transcript of the newest The Verge YouTube video about the latest new cell phone: ",
    num_return_sequences=3,
)

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


[{'generated_text': 'Transcript of the newest The Verge YouTube video about the latest new cell phone: \xa0 It features four new features, a four-megapixel camera and a built-in camera, as well as 2.0" of HD video recording and 5'},
 {'generated_text': 'Transcript of the newest The Verge YouTube video about the latest new cell phone: -------------------------- T. P. Harnik, who is the president of the New York Public Radio Alliance, has a different perspective on how technology is being developed,'},
 {'generated_text': "Transcript of the newest The Verge YouTube video about the latest new cell phone: _____________. _____________. The Verge's new video about the latest new cell phone: _____________. The Verge's new video about the newest cell phone"}]