<a href="https://colab.research.google.com/github/phinate/demos/blob/main/edi_train_your_own_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#@title Setup cell -- Run me first!

!pip install datasets transformers --quiet
!apt install git-lfs --quiet

import transformers


Reading package lists...
Building dependency tree...
Reading state information...
git-lfs is already the newest version (2.9.2-1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [2]:
#@title Logging in to Hugging Face

from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
#@title Run me when you've uploaded your dataset!
file_name = 'bee-movie-names.txt' #@param {type:"string"}
from random import shuffle

with open(file_name, 'r+') as file:
    text = [x for x in file.read().splitlines() if x != ' :']
    num_lines = len(text)
    # shuffle(text)
    train = text[:2*int(num_lines/3)]
    valid = text[2*int(num_lines/3):]

with open('train.txt', 'w+') as file:
    file.writelines([t + r'\n' + '\n' for t in train])

with open('valid.txt', 'w+') as file:
    file.writelines([t + r'\n' + '\n' for t in valid])

from datasets import load_dataset
datasets = load_dataset("text", data_files={"train": 'train.txt', "validation": 'valid.txt'})

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-448889f4b7a6a190/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-448889f4b7a6a190/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
#@title See some example paragraphs from your training data:

from datasets import ClassLabel
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=10):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

show_random_elements(datasets["train"])

Unnamed: 0,text
0,- I think he knows.\n
1,- Guys!\n
2,cleaner from Ken just before he hits Barry)\n
3,KEN:\n
4,What life? You have no life!\n
5,Beekeeper. I find that\n
6,- A wiper! Triple blade!\n
7,I love it!\n
8,Do you ever get bored\n
9,the bathtub. After getting hit in the head by falling objects 3 times he\n


In [5]:
#@title Select your model! We'll start with a "distilled" version of GPT-2 (more choices can be found [on the huggingface hub](https://huggingface.co/models?pipeline_tag=text-generation))
model_checkpoint = "distilgpt2" #@param {type:"string"}

In [6]:
#@title This cell "tokenises" our dataset ready for training
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_datasets = datasets.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])
# block_size = tokenizer.model_max_length
block_size = 150
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/2432 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1218 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2432 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1218 [00:00<?, ? examples/s]

In [7]:
#@title Set up your hyperparameters for training, then run this cell!
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)
from transformers import Trainer, TrainingArguments
model_name = "my_finetuned_GPT" #@param {type:"string"}
learning_rate = 2e-5 #@param {type:"slider", min:0.000001, max:0.001, step:0.000005}
weight_decay=0.01 #@param {type:"slider", min:0.001, max:0.1, step:0.005}
num_train_epochs=12 #@param {type:"slider", min:1, max:20, step:1}
training_args = TrainingArguments(
    model_name,
    evaluation_strategy = "epoch",
    learning_rate=learning_rate, 
    weight_decay=weight_decay,
    push_to_hub=True,
    num_train_epochs=num_train_epochs,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
)

/content/my_finetuned_GPT is already a clone of https://huggingface.co/phinate/my_finetuned_GPT. Make sure you pull the latest changes with `repo.git_pull()`.


In [8]:
#@title Run the training!

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,3.30393
2,No log,3.070479
3,No log,2.961141
4,No log,2.895365
5,No log,2.85677
6,No log,2.830243
7,No log,2.813784
8,No log,2.797972
9,No log,2.789367
10,No log,2.784049


TrainOutput(global_step=204, training_loss=2.8883298986098347, metrics={'train_runtime': 55.1218, 'train_samples_per_second': 28.736, 'train_steps_per_second': 3.701, 'total_flos': 60629011660800.0, 'train_loss': 2.8883298986098347, 'epoch': 12.0})

In [9]:
#@title You can evaluate the model "perplexity" -- lower should mean it can generate things more like the data you trained on
import math
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 16.11


In [10]:
#@title If you're happy, you can save your model to the huggingface hub here:
trainer.push_to_hub()

Upload file runs/Apr27_14-54-25_a36ca4886a9f/events.out.tfevents.1682607271.a36ca4886a9f.4291.0: 100%|########…

Upload file training_args.bin: 100%|##########| 3.50k/3.50k [00:00<?, ?B/s]

Upload file runs/Apr27_14-54-25_a36ca4886a9f/events.out.tfevents.1682607327.a36ca4886a9f.4291.2: 100%|########…

To https://huggingface.co/phinate/my_finetuned_GPT
   3b7fa72..50d642d  main -> main

   3b7fa72..50d642d  main -> main



Upload file runs/Apr27_14-54-25_a36ca4886a9f/1682607271.7281966/events.out.tfevents.1682607271.a36ca4886a9f.42…

'https://huggingface.co/phinate/my_finetuned_GPT/commit/50d642db7bd15914118eee44fcd6715a40e4f10c'

In [11]:
#@title To run the model, we can grab the saved weights from the hub via your username"

from transformers import pipeline

user = "phinate" #@param {type: "string"}
pipe = pipeline(model="phinate/"+model_name, tokenizer=tokenizer)

In [18]:
#@title Now you can autocomplete any sentence you like!
text_to_complete = "BARRY: ay yo whaddup" #@param {type: "string"}
pipe(text_to_complete)[0]['generated_text'].split('\\n')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['BARRY: ay yo whaddup?"',
 '(Quinnish-eyed and a little jealous but angry?)\n',
 'VANESSA:',
 '(Mock off and flies',
 'to a bus and we see he']