In [3]:
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, create_optimizer, AdamWeightDecay, TFAutoModelForCausalLM
from transformers import DefaultDataCollator

# Lightweight Training Script with distilgpt2! 

This script takes the transcripts the whisper-gpt team has collected and trains a minimal GPT model on them.
Before running the script, specify the dataset path, the desired model checkpoint, the block size for training, and the number of epochs below.
We tokenize the input data, group the tokens into fixed-size blocks for easier processing, and pass those blocks to the model for training.


In [4]:
# Most of this code is adapted from the older Hugging Face language-modeling example script.
# Tunable settings for the whole notebook; edit these before running the cells below.
# Dataset identifier passed to load_dataset().
DATASET_PATH = "kpriyanshu256/whisper-transcripts"
# Checkpoint name used as the default for both the tokenizer and the model.
MODEL_CHECKPOINT = "distilgpt2"
# Default number of tokens per training chunk produced by group_texts().
BLOCK_SIZE = 64
# Number of epochs passed to model.fit().
EPOCHS = 1

In [5]:
def tokenize_function(dat, model_checkpoint=MODEL_CHECKPOINT):
    """Tokenize the ``"text"`` entry of ``dat``.

    Args:
        dat: Mapping that exposes the raw text under the ``"text"`` key.
        model_checkpoint: Checkpoint name handed to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        Whatever the loaded tokenizer returns when called on ``dat["text"]``.

    Note:
        The tokenizer is loaded again on every call.
    """
    checkpoint_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    raw_text = dat["text"]
    return checkpoint_tokenizer(raw_text)

def group_texts(dat, block_size = BLOCK_SIZE):
    """Concatenate tokenized examples and re-split them into fixed-size blocks.

    Adapted from the Hugging Face language-modeling example script.

    Args:
        dat: Mapping from column name to a list of per-example lists
            (for instance ``{"input_ids": [[1, 2], [3]], ...}``). It must
            contain an ``"input_ids"`` column, which is used for the labels.
        block_size: Number of items in every output chunk.

    Returns:
        A dict with the same keys as ``dat``, where each value is a list of
        chunks of exactly ``block_size`` items, plus a ``"labels"`` key holding
        a shallow copy of the ``"input_ids"`` chunks. The trailing remainder
        that does not fill a whole block is dropped rather than padded.
    """
    # Function-scope import so this cell does not depend on the import cell.
    from itertools import chain

    # Flatten every column in linear time; ``sum(lists, [])`` re-copies the
    # accumulator on each addition and is quadratic in the number of examples.
    concatenated_examples = {
        k: list(chain.from_iterable(dat[k])) for k in dat.keys()
    }
    total_length = len(concatenated_examples[list(dat.keys())[0]])
    # Round down to a whole number of blocks (drops the small remainder).
    total_length = (total_length // block_size) * block_size
    # Slice with the ``block_size`` argument, not the module-level constant,
    # so that the chunk width always matches the stride of the range below.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels are a copy of the input ids, as in the original HF script.
    result["labels"] = result["input_ids"].copy()
    return result


def compile_model(model_checkpoint = MODEL_CHECKPOINT, lr = 2e-5, weight_decay_rate = 0.01):
    """Load a causal language model and compile it with an AdamWeightDecay optimizer.

    Args:
        model_checkpoint: Checkpoint name handed to
            ``TFAutoModelForCausalLM.from_pretrained``.
        lr: Learning rate for the optimizer.
        weight_decay_rate: Weight-decay rate for the optimizer.

    Returns:
        The compiled model. No loss is passed to ``compile``.
    """
    model = TFAutoModelForCausalLM.from_pretrained(model_checkpoint)

    # Use the documented ``learning_rate`` keyword: the legacy ``lr`` alias is
    # deprecated in Keras and newer optimizer implementations ignore or reject
    # it, which would silently train with the default rate instead of ``lr``.
    optimizer = AdamWeightDecay(learning_rate=lr, weight_decay_rate=weight_decay_rate)

    model.compile(optimizer=optimizer)
    return model

In [6]:
# Load the transcript dataset identified by DATASET_PATH; tokenization and
# chunking are applied to this object in the following cell.
datasets = load_dataset(DATASET_PATH)

Using custom data configuration kpriyanshu256--whisper-transcripts-b310a43c8142e04a


Downloading and preparing dataset json/kpriyanshu256--whisper-transcripts to /Users/ArjunPatel/.cache/huggingface/datasets/kpriyanshu256___json/kpriyanshu256--whisper-transcripts-b310a43c8142e04a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data: 100%|██████████| 64.5M/64.5M [00:01<00:00, 35.9MB/s]
Downloading data files: 100%|██████████| 1/1 [00:03<00:00,  3.26s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 646.37it/s]
                                

Dataset json downloaded and prepared to /Users/ArjunPatel/.cache/huggingface/datasets/kpriyanshu256___json/kpriyanshu256--whisper-transcripts-b310a43c8142e04a/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 104.36it/s]


In [7]:
# Step 1 - tokenize in batches across 4 worker processes and drop the
# original columns so only the tokenizer outputs remain.
tokenized_datasets = datasets.map(
    tokenize_function,
    remove_columns=["text", "id", "segments"],
    num_proc=4,
    batched=True,
)

# Step 2 - regroup the tokens into fixed-size blocks, 1000 examples per batch.
lm_datasets = tokenized_datasets.map(group_texts, batched=True, batch_size=1000, num_proc=4)

# Step 3 - load and compile the model that will be trained on those blocks.
model = compile_model(model_checkpoint=MODEL_CHECKPOINT)

#0:   0%|          | 0/1 [00:00<?, ?ba/s]
[A

[A[AToken indices sequence length is longer than the specified maximum sequence length for this model (2799 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2086 > 1024). Running this sequence through the model will result in indexing errors
#0: 100%|██████████| 1/1 [00:01<00:00,  1.67s/ba]


#3: 100%|██████████| 1/1 [00:01<00:00,  1.64s/ba]
Token indices sequence length is longer than the specified maximum sequence length for this model (1202 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1091 > 1024). Running this sequence through the model will result in indexing errors
#1: 100%|██████████| 1/1 [00:01<00:00,  1.82s/ba]

#2: 100%|██████████| 1/1 [00:01<00:00,  1.85s/ba]
#0:   0%

In [8]:
# Collator that returns TensorFlow tensors for each batch.
data_collator = DefaultDataCollator(return_tensors="tf")

# Convert the chunked training split into a shuffled, batched tf.data dataset
# that exposes the three columns used for training.
train_set = lm_datasets["train"].to_tf_dataset(
    collate_fn=data_collator,
    columns=["attention_mask", "input_ids", "labels"],
    batch_size=16,
    shuffle=True,
)

In [9]:
# Train for EPOCHS epochs on the batched training set; the return value of
# fit() is kept in mod_history for later inspection.
mod_history = model.fit(train_set, epochs=EPOCHS)



In [67]:
# Persist the trained model under the relative path "trained_model".
# NOTE(review): the execution count of this cell (67) is far out of sequence
# with the cells above - confirm the notebook still works with Restart & Run All.
model.save("trained_model")



INFO:tensorflow:Assets written to: trained_model/assets


INFO:tensorflow:Assets written to: trained_model/assets
