<a href="https://colab.research.google.com/github/player1537/Train-Bloom-560m/blob/main/Train_Bloom_560m_trained_on_Wizard_Vicuna_Uncensored.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title Install Dependencies
%%script bash
# Pick a scratch directory: $TMPDIR, else $TMP, else /tmp.
scratch_dir=${TMPDIR:-${TMP:-/tmp}}
# pip's combined stdout/stderr is captured here and only shown on failure.
pip_log=${scratch_dir:?}/pip.text

packages=(
  transformers
  torch
  datasets
  tqdm
  accelerate
  peft
  huggingface_hub
  guidance
  llama_index
  langchain
  sentence_transformers
  more_itertools
  safetensors
)

# Stay quiet on success; on failure replay the captured log to stderr and abort.
pip install --upgrade "${packages[@]}" &>"${pip_log:?}" || {
  cat "${pip_log:?}" >&2
  exit 1
}

In [None]:
#@title HuggingFace Login
# Show the Hugging Face login widget so this session can authenticate.
# NOTE(review): presumably required for the later push_to_hub calls — confirm.
import huggingface_hub
huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
#@title Import & Utilities
import logging
import sys

# Send INFO-and-above log records to stdout so they appear in the cell output.
# `basicConfig` installs the stdout handler on the root logger by itself; adding
# a second StreamHandler on top made every record print twice. `force=True`
# replaces handlers left over from a previous execution of this cell, so
# re-running the cell does not keep stacking handlers.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from dataclasses import dataclass, field
from pathlib import Path
import functools
import itertools
import inspect

import torch
import transformers
from transformers.utils import cached_property
import datasets
import peft
import guidance
from tqdm.notebook import tqdm
from IPython.display import clear_output
import llama_index
import langchain
import more_itertools

def doctest(func=None, /, verbose=False, sterile=False):
  """Decorator that runs a function's docstring examples when it is defined.

  Works both bare (``@doctest``) and with options (``@doctest(verbose=True)``).

  Args:
    func: Function to check. Supplied implicitly when used as a bare decorator.
    verbose: Passed to the doctest finder and runner.
    sterile: When true, the examples see only the decorated function itself;
      otherwise they see a shallow copy of this module's globals plus the
      function.

  Returns:
    The decorated function, unchanged (or the decorator itself when called
    with options only).

  Raises:
    AssertionError: If at least one docstring example fails.
  """
  def wrapper(func):
    # Thanks https://stackoverflow.com/a/49659927
    import doctest
    import copy

    # I need this to error out on failure; the default one doesn't.
    def run_docstring_examples(f, globs, verbose=False, name="NoName", compileflags=None, optionflags=0):
      finder = doctest.DocTestFinder(verbose=verbose, recurse=False)
      runner = doctest.DocTestRunner(verbose=verbose, optionflags=optionflags)
      # Use the object that was passed in, not whatever the closure captured.
      for test in finder.find(f, name, globs=globs):
        runner.run(test, compileflags=compileflags)
      # Raise explicitly: a bare `assert` is stripped when Python runs with -O.
      if runner.failures != 0:
        raise AssertionError(
          f'{runner.failures} doctest example(s) failed for {name!r}'
        )

    name = func.__name__

    if sterile:
      globs = {}
    else:
      globs = copy.copy(globals())
    # The examples refer to the function by name, so make it resolvable.
    globs[name] = func
    run_docstring_examples(func, globs, verbose=verbose, name=name)
    return func

  if func is not None:
    return wrapper(func)
  else:
    return wrapper

# Registry of named results produced by `@run` steps. It is created only when
# missing, so re-executing this cell keeps the results already computed.
try:
  g
except NameError:
  g = {}

def run(func=None, /, name=None, cond=True, splat=False):
  """Decorator that executes a step immediately and records its result in ``g``.

  Works both bare (``@run``) and with options (``@run(cond=FLAG)``). The
  step's positional-only parameters are looked up by name in ``g`` and passed
  in; parameters of any other kind are not supplied.

  Args:
    func: The step function. Supplied implicitly when used as a bare decorator.
    name: Key to store the result under; defaults to the function's name.
    cond: Boolean, or a zero-argument callable returning one. The step is
      skipped entirely when it is falsy.
    splat: When true, the step's return value must provide ``.items()`` and
      each (key, value) pair is stored in ``g`` instead of a single entry.

  Returns:
    Always ``None`` once applied to a function, so the decorated name is bound
    to ``None`` in the defining scope; results are read back from ``g``.

  Raises:
    KeyError: If a positional-only parameter names a result that is not in
      ``g``; the message identifies both the step and the missing name.
  """
  def wrapper(func, /, *, name=name, cond=cond):
    if callable(cond):
      cond = cond()

    if not cond:
      return None

    if name is None:
      name = func.__name__

    args = []
    for key, parameter in inspect.signature(func).parameters.items():
      if parameter.kind == inspect.Parameter.POSITIONAL_ONLY:
        if key not in g:
          # A bare KeyError('dataset') does not say which step needed it.
          raise KeyError(
            f'{func.__name__}() needs {key!r}, but no earlier @run step has '
            f'stored it (was the producing step skipped or not run yet?)'
          )
        args.append(g[key])

    ret = func(*args)

    if splat:
      results = ret.items()
    else:
      results = [(name, ret)]

    for key, value in results:
      g[key] = value

    return None

  if func is not None:
    return wrapper(func)
  else:
    return wrapper

In [None]:
#@title Configuration
# Stage switches: each stage's cells run only when its flag is truthy.
DO_PREPROCESS = False  # alternative: True
DO_TRAIN = True  # alternative: False

# Output Config
CHECKPOINT_DIRECTORY = 'checkpoints'
REPOSITORY_NAME = 'player1537/Bloom-560m-trained-on-Wizard-Vicuna-Uncensored'

# Input Config
BASE_MODEL_NAME = 'bigscience/bloom-560m'
PEFT_MODEL_NAME = None  # alternative: REPOSITORY_NAME
TOKENIZER_NAME = BASE_MODEL_NAME
DATASET_NAME = 'ehartford/wizard_vicuna_70k_unfiltered'  # alternative: 'player1537/wizard-vicuna-unfiltered-summarized-70k'

# Training Config
REPLICAS = 3
SEED = 1337
LORA_ALPHA = 32
LORA_DROPOUT = 0.1
CONTEXT_SIZE = 1024  # alternatives: 512, 2048
LEARNING_RATE = 2e-5

# Preprocess

In [None]:
@run(cond=DO_PREPROCESS)
def tokenizer():
  """Load the tokenizer identified by TOKENIZER_NAME.

  Executed only when DO_PREPROCESS is truthy; `run` stores the returned
  tokenizer in `g` under this function's name.
  """
  return transformers.AutoTokenizer.from_pretrained(
    TOKENIZER_NAME,
    add_prefix_space=True,
  )

In [None]:
@run(cond=DO_PREPROCESS)
def dataset():
  """Load the 'train' split of the raw dataset named by DATASET_NAME.

  Executed only when DO_PREPROCESS is truthy; `run` stores the result in `g`
  under this function's name.
  """
  return datasets.load_dataset(
    DATASET_NAME,
    split='train',
  )

In [None]:
@run(cond=DO_PREPROCESS)
def dataset(dataset, /):
  """Normalize the incoming dataset

  Every row is rewritten to carry its 'id' plus a 'conversation' list of
  {'role', 'content'} messages: 'human' becomes 'user', 'gpt' becomes
  'assistant', and each message's text is stripped of surrounding whitespace.
  Any other source role raises ValueError.
  """
  role_names = {
    'human': 'user',
    'gpt': 'assistant',
  }

  def normalize(row):
    messages = []
    for message in row['conversations']:
      speaker = message['from']
      if speaker not in role_names:
        raise ValueError(f'Unexpected role: {speaker!r}')

      messages.append({
        'role': role_names[speaker],
        'content': message['value'].strip(),
      })

    return {
      'id': row['id'],
      'conversation': messages,
    }

  return dataset.map(
    normalize,
    remove_columns=['id', 'conversations'],
  )

In [None]:
@run(cond=DO_PREPROCESS)
def dataset(dataset, tokenizer, /):
  """Format conversation of messages into formatted text

  Each row's 'conversation' is rendered twice, in parallel: as a single string
  ('text') and as a flat list of token ids ('tokens'). The whole conversation
  is wrapped in a prefix/suffix, and every message is wrapped in a
  role-specific prefix/suffix. The 'conversation' column is removed.
  """

  def encode(text):
    # Special tokens are placed explicitly below, so the tokenizer must not
    # add any of its own.
    return tokenizer.encode(
      text,
      add_special_tokens=False,
    )

  # lookup[where][who] -> the text and token ids emitted before ('prefix') or
  # after ('suffix') the whole conversation (who=None) or one message of a role.
  lookup = {
    'prefix': {
      None: {
        'text': tokenizer.bos_token + tokenizer.bos_token,
        'tokens': [tokenizer.bos_token_id, tokenizer.bos_token_id],
      },
      'user': {
        'text': 'USER: ',
        'tokens': encode('USER: '),
      },
      'assistant': {
        'text': 'ASSISTANT: ',
        'tokens': encode('ASSISTANT: '),
      },
    },
    'suffix': {
      None: {
        'text': tokenizer.eos_token,
        'tokens': [tokenizer.eos_token_id],
      },
      'user': {
        'text': ' ',
        'tokens': encode(' '),
      },
      'assistant': {
        'text': tokenizer.eos_token,
        'tokens': [tokenizer.eos_token_id],
      },
    },
  }

  @doctest
  def each(inp):
    r"""

    >>> each({ 'conversation': [
    ...   { "role": "user", "content": "how do trees contribute to the" },
    ...   { "role": "assistant", "content": "trees make oxygen" },
    ...   { "role": "user", "content": "can trees communicate with each" },
    ...   { "role": "assistant", "content": "they don't (lol)" },
    ... ] })['text']
    "<s><s>USER: how do trees contribute to the ASSISTANT: trees make oxygen</s>USER: can trees communicate with each ASSISTANT: they don't (lol)</s></s>"

    """

    # `texts` holds string pieces and `tokens` holds lists of token ids; the
    # two are appended in lockstep so they describe the same sequence.
    texts, tokens = [], []

    texts += [
      lookup['prefix'][None]['text'],
    ]

    tokens += [
      lookup['prefix'][None]['tokens'],
    ]

    for message in inp['conversation']:
      role = message['role']
      content = message['content']

      texts += [
        lookup['prefix'][role]['text'],
        content,
        lookup['suffix'][role]['text'],
      ]

      tokens += [
        lookup['prefix'][role]['tokens'],
        encode(content),
        lookup['suffix'][role]['tokens'],
      ]

    texts += [
      lookup['suffix'][None]['text'],
    ]

    tokens += [
      lookup['suffix'][None]['tokens'],
    ]

    return {
      'text': ''.join(texts),
      # Flatten in linear time; `sum(tokens, [])` re-copies the accumulated
      # list for every piece that is appended.
      'tokens': list(itertools.chain.from_iterable(tokens)),
    }

  return dataset.map(
    each,
    remove_columns=['conversation'],
  )

In [None]:
@run(cond=DO_PREPROCESS)
def __print_training_samples(dataset, /):
  """Print the 'text' of the first five rows, JSON-encoded one per line."""
  import json

  # JSON encoding keeps newlines and quotes visible as escapes.
  for index in range(5):
    sample = dataset[index]['text']
    print(json.dumps(sample))

In [None]:
@run(cond=DO_PREPROCESS)
def dataset(dataset, /):
  """Replicate and shuffle the training set

  The dataset is concatenated with itself REPLICAS times and the combined
  rows are shuffled using SEED.
  """
  copies = [dataset for _ in range(REPLICAS)]
  replicated = datasets.concatenate_datasets(dsets=copies)
  return replicated.shuffle(seed=SEED)

In [None]:
@run(cond=DO_PREPROCESS)
def dataset(dataset, tokenizer, /):
  """Concatenate tokens and form into chunks

  Within each mapped batch the rows' token lists are flattened into a single
  stream and cut into pieces of exactly CONTEXT_SIZE tokens; a shorter
  leftover piece is dropped. Each kept piece becomes one output row with its
  'tokens' and the decoded 'text'.
  """

  def rechunk(batch):
    token_stream = more_itertools.collapse(batch['tokens'])
    full_chunks = [
      chunk
      for chunk in more_itertools.chunked(token_stream, n=CONTEXT_SIZE)
      if len(chunk) == CONTEXT_SIZE
    ]

    # A batch that yields no full chunk contributes no columns at all.
    if not full_chunks:
      return {}

    return {
      'text': [
        tokenizer.decode(chunk, skip_special_tokens=False)
        for chunk in full_chunks
      ],
      'tokens': full_chunks,
    }

  return dataset.map(
    rechunk,
    batched=True,
    # batch_size=None,
    remove_columns=['text', 'tokens', 'id'],
  )

In [None]:
@run(cond=DO_PREPROCESS)
def __push_dataset_to_hub(dataset, /):
  """Upload the preprocessed dataset to the Hub repository REPOSITORY_NAME.

  Executed only when DO_PREPROCESS is truthy. The training stage loads its
  dataset from this same repository name.
  """
  dataset.push_to_hub(
    REPOSITORY_NAME
  )

# Train

In [None]:
@run(cond=DO_TRAIN)
def dataset():
  """Load the 'train' split of the dataset stored under REPOSITORY_NAME.

  Executed only when DO_TRAIN is truthy; `run` stores the result in `g`
  under this function's name, replacing any earlier 'dataset' entry.
  """
  return datasets.load_dataset(
    REPOSITORY_NAME,
    split='train',
  )

Downloading readme:   0%|          | 0.00/434 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/player1537___parquet/player1537--Bloom-560m-trained-on-Wizard-Vicuna-Uncensored-4b259eaeffdc520e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/125M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/86379 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/player1537___parquet/player1537--Bloom-560m-trained-on-Wizard-Vicuna-Uncensored-4b259eaeffdc520e/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


In [None]:
# Only run when some stage has stored a 'dataset'; otherwise `run` would fail
# looking it up.
@run(cond=DO_PREPROCESS or DO_TRAIN)
def __print_dataset_statistics(dataset, /):
  """Print a short summary of the dataset.

  Shows the dataset repr, the column names of the first row, the character and
  token lengths of up to the first five rows, and the row count scaled by a
  handful of fractions.
  """
  N = len(dataset)

  print(f'{dataset = !r}')
  if N > 0:
    print(f'dataset[0].keys = {list(dataset[0].keys())!r}')
  # Never index past the end of a dataset with fewer than five rows.
  for i in range(min(5, N)):
    print(f'  len(dataset[{i}]["text"]) = {len(dataset[i]["text"])!r}')
    print(f'  len(dataset[{i}]["tokens"]) = {len(dataset[i]["tokens"])!r}')

  print(f'With {N} items in the dataset:')
  for n, d in [
    (1, 16),
    (1, 10),
    (1, 8),
    (1, 4),
    (1, 3),
    (1, 2),
    (2, 3),
    (3, 4),
  ]:
    print(f'  {N} * {n} / {d} = {N*n/d:0.1f}')

dataset = Dataset({
    features: ['text', 'tokens'],
    num_rows: 86379
})
dataset[0].keys = ['text', 'tokens']
  len(dataset[0]["text"]) = 5021
  len(dataset[0]["tokens"]) = 1024
  len(dataset[1]["text"]) = 4746
  len(dataset[1]["tokens"]) = 1024
  len(dataset[2]["text"]) = 4378
  len(dataset[2]["tokens"]) = 1024
  len(dataset[3]["text"]) = 5246
  len(dataset[3]["tokens"]) = 1024
  len(dataset[4]["text"]) = 5379
  len(dataset[4]["tokens"]) = 1024
With 86379 items in the dataset:
  86379 * 1 / 16 = 5398.7
  86379 * 1 / 10 = 8637.9
  86379 * 1 / 8 = 10797.4
  86379 * 1 / 4 = 21594.8
  86379 * 1 / 3 = 28793.0
  86379 * 1 / 2 = 43189.5
  86379 * 2 / 3 = 57586.0
  86379 * 3 / 4 = 64784.2


In [None]:
@run(cond=DO_TRAIN)
def dataset(dataset, /):
  """Remove extra dataset keys and use expected naming

  Each row keeps only its token ids, exposed under the 'input_ids' key; the
  'text' and 'tokens' columns are removed.
  """
  unwanted = ['text', 'tokens']
  return dataset.map(
    lambda row: {'input_ids': row['tokens']},
    remove_columns=unwanted,
  )

Map:   0%|          | 0/86379 [00:00<?, ? examples/s]

In [None]:
@run(cond=DO_TRAIN)
def model():
  """Load the base causal language model named by BASE_MODEL_NAME.

  Weights are requested as float16 when CUDA is available and as float32
  otherwise.
  """
  if torch.cuda.is_available():
    weights_dtype = torch.float16
  else:
    weights_dtype = torch.float32

  return transformers.AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=weights_dtype,
  )

@run(cond=DO_TRAIN and PEFT_MODEL_NAME is None)
def model(model, /):
  """Wrap the base model with peft using a LoRA configuration.

  Runs only when training is enabled and no PEFT_MODEL_NAME was configured.
  """
  lora_config = peft.LoraConfig(
    task_type='CAUSAL_LM',
    inference_mode=False,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
  )
  return peft.get_peft_model(model, lora_config)

# @run(cond=DO_TRAIN and PEFT_MODEL_NAME is not None)
# def model(model, /):
#   return peft.PeftModel.from_pretrained(
#     model,
#     PEFT_MODEL_NAME,
#     is_trainable=True,
#   )

@run(cond=DO_TRAIN)
def __print_model(model, /):
  """Report the model's trainable parameter count (training runs only)."""
  model.print_trainable_parameters()

Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

trainable params: 786432 || all params: 560001024 || trainable%: 0.14043402892063284


In [None]:
@run(cond=DO_TRAIN)
def trainer(model, dataset, /):
  """Build the Trainer used to train `model` on `dataset`.

  Checkpoints are written to CHECKPOINT_DIRECTORY and pushed to the Hub
  repository REPOSITORY_NAME. Executed only when DO_TRAIN is truthy.
  """
  class Trainer(transformers.Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
      """Causal-LM loss with the inputs doubling as labels.

      Every position is attended to (all-ones attention mask).

      Args:
        model: The model being trained.
        inputs: Batch mapping that provides 'input_ids'.
        return_outputs: When true, return ``(loss, outputs)`` instead of the
          loss alone, as the Trainer's evaluation/prediction code expects.
        **kwargs: Extra keyword arguments passed by the Trainer (for example
          ``num_items_in_batch`` on newer versions); accepted and ignored.
      """
      outputs = model(
        input_ids=inputs["input_ids"],
        attention_mask=torch.ones_like(inputs["input_ids"]).bool(),
        labels=inputs["input_ids"],
      )
      if return_outputs:
        return outputs.loss, outputs
      return outputs.loss

  return Trainer(
    model=model,
    train_dataset=dataset,
    args=transformers.TrainingArguments(
      learning_rate=LEARNING_RATE,
      seed=SEED,
      output_dir=CHECKPOINT_DIRECTORY,

      overwrite_output_dir=True,  #@param {type:"boolean"}
      evaluation_strategy="no",  #@param ["no", "steps", "epochs"] {"type": "string"}
      per_device_train_batch_size=1,  #@param {"type": "integer"}
      per_device_eval_batch_size=1,  #@param {"type": "integer"}
      num_train_epochs=1,  #@param {"type": "integer"}
      logging_strategy="steps",  #@param ["no", "steps", "epochs"] {"type": "string"}
      logging_first_step=True,  #@param {type:"boolean"}
      logging_steps=1000,  #@param {"type": "integer"}
      save_strategy="steps",  #@param ["no", "steps", "epochs"] {"type": "string"}
      save_steps=1002,  #@param {"type": "integer"}
      save_total_limit=3,  #@param {"type": "integer"}
      save_safetensors=False,  #@param {type:"boolean"}
      gradient_accumulation_steps=1,
      fp16=True,  #@param {type:"boolean"}
      fp16_full_eval=True,  #@param {type:"boolean"}
      push_to_hub=True,  #@param {type:"boolean"}
      hub_model_id=REPOSITORY_NAME,
      hub_strategy='checkpoint',  #@param ["end", "every_save", "checkpoint", "all_checkpoints"] {"type": "string"}
      auto_find_batch_size=False,  #@param {type:"boolean"}
    ),
  )

Cloning https://huggingface.co/player1537/Bloom-560m-trained-on-Wizard-Vicuna-Uncensored into local empty directory.


Cloning https://huggingface.co/player1537/Bloom-560m-trained-on-Wizard-Vicuna-Uncensored into local empty directory.


Download file last-checkpoint/pytorch_model.bin:   0%|          | 3.38k/1.04G [00:00<?, ?B/s]

Download file last-checkpoint/rng_state.pth: 100%|##########| 14.3k/14.3k [00:00<?, ?B/s]

Download file runs/Jun04_02-06-39_4367d7224c9f/1685845095.3741372/events.out.tfevents.1685845095.4367d7224c9f.…

Download file runs/Jun03_18-44-11_bb544bc9a418/1685817877.978909/events.out.tfevents.1685817877.bb544bc9a418.1…

Download file runs/Jun03_18-38-43_bb544bc9a418/1685817525.4151258/events.out.tfevents.1685817525.bb544bc9a418.…

Download file runs/Jun04_02-06-39_4367d7224c9f/events.out.tfevents.1685845095.4367d7224c9f.857.0: 100%|#######…

Clean file last-checkpoint/rng_state.pth:   7%|6         | 1.00k/14.3k [00:00<?, ?B/s]

Download file runs/Jun03_18-34-01_bb544bc9a418/1685817254.2674556/events.out.tfevents.1685817254.bb544bc9a418.…

Clean file runs/Jun04_02-06-39_4367d7224c9f/1685845095.3741372/events.out.tfevents.1685845095.4367d7224c9f.857…

Clean file runs/Jun03_18-44-11_bb544bc9a418/1685817877.978909/events.out.tfevents.1685817877.bb544bc9a418.167.…

Clean file runs/Jun03_18-38-43_bb544bc9a418/1685817525.4151258/events.out.tfevents.1685817525.bb544bc9a418.167…

Download file last-checkpoint/optimizer.pt:   0%|          | 15.4k/6.03M [00:00<?, ?B/s]

Clean file runs/Jun04_02-06-39_4367d7224c9f/events.out.tfevents.1685845095.4367d7224c9f.857.0:  16%|#5        …

Clean file runs/Jun03_18-34-01_bb544bc9a418/1685817254.2674556/events.out.tfevents.1685817254.bb544bc9a418.167…

Download file runs/Jun03_18-44-11_bb544bc9a418/events.out.tfevents.1685817877.bb544bc9a418.167.4: 100%|#######…

Clean file runs/Jun03_18-44-11_bb544bc9a418/events.out.tfevents.1685817877.bb544bc9a418.167.4:  18%|#8        …

Download file last-checkpoint/scheduler.pt: 100%|##########| 627/627 [00:00<?, ?B/s]

Download file runs/Jun03_18-38-43_bb544bc9a418/events.out.tfevents.1685817525.bb544bc9a418.167.2: 100%|#######…

Clean file last-checkpoint/scheduler.pt: 100%|##########| 627/627 [00:00<?, ?B/s]

Clean file runs/Jun03_18-38-43_bb544bc9a418/events.out.tfevents.1685817525.bb544bc9a418.167.2:  23%|##2       …

Download file last-checkpoint/training_args.bin: 100%|##########| 3.87k/3.87k [00:00<?, ?B/s]

Clean file last-checkpoint/training_args.bin:  26%|##5       | 1.00k/3.87k [00:00<?, ?B/s]

Download file runs/Jun03_18-34-01_bb544bc9a418/events.out.tfevents.1685817254.bb544bc9a418.167.0: 100%|#######…

Download file last-checkpoint/scaler.pt: 100%|##########| 557/557 [00:00<?, ?B/s]

Clean file training_args.bin:  26%|##5       | 1.00k/3.87k [00:00<?, ?B/s]

Clean file runs/Jun03_18-34-01_bb544bc9a418/events.out.tfevents.1685817254.bb544bc9a418.167.0:  23%|##2       …

Clean file last-checkpoint/scaler.pt: 100%|##########| 557/557 [00:00<?, ?B/s]

Clean file last-checkpoint/optimizer.pt:   0%|          | 1.00k/6.03M [00:00<?, ?B/s]

Clean file last-checkpoint/pytorch_model.bin:   0%|          | 1.00k/1.04G [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/1.04G [00:00<?, ?B/s]

In [None]:
# Step 	Training Loss
# 1000 	1.667800
# 2000 	1.648800
# 3000 	1.609200
# 4000 	1.618900
# 5000 	1.603000
# 6000 	1.587700
# 7000 	1.570200
# 8000 	1.583200
# 9000 	1.564600
# 10000 	1.572700
# 11000 	1.552000
# 12000 	1.538300
# 13000 	1.553400
# 14000 	1.545700
# 15000 	1.549500
# 16000 	1.538600
# 17000 	1.536800
# 18000 	1.535000
# 19000 	1.524700
# 20000 	1.519900
# 21000 	1.534100
# 22000 	1.519000
# 23000 	1.524300
# 24000 	1.516400
# 25000 	1.509500
# 26000 	1.500700
# 27000 	1.512800
# 28000 	1.523100
# 29000 	1.505200
# 30000 	1.492900
# 31000 	1.509600
# 32000 	1.502200
# 33000 	1.500500
# 34000 	1.508900
# 35000 	1.511100
# 36000 	1.491500
# 37000 	1.501600
# 38000 	1.489000
# 39000 	1.492400
# 40000 	1.489400
# 41000 	1.489400
# 42000 	1.494600
# 43000 	1.488100
# 44000 	1.477200
# 45000 	1.495400
# 46000 	1.477400

@run(cond=DO_TRAIN)
def __train(trainer, /):
  """Run training, resuming from the last checkpoint when one is present.

  The checkpoint is looked up as 'last-checkpoint' inside
  CHECKPOINT_DIRECTORY, so changing that setting is enough to relocate it.
  When the directory does not exist, training starts from scratch instead of
  failing on the missing path.
  """
  last_checkpoint = Path(CHECKPOINT_DIRECTORY) / 'last-checkpoint'

  if last_checkpoint.is_dir():
    resume_from_checkpoint = str(last_checkpoint)
  else:
    print(f'No checkpoint found at {last_checkpoint}; starting a fresh training run.')
    resume_from_checkpoint = False

  trainer.train(
    resume_from_checkpoint=resume_from_checkpoint,
  )



Step,Training Loss
24000,1.5164
25000,1.5095
26000,1.5007
27000,1.5128
28000,1.5231
29000,1.5052
30000,1.4929
31000,1.5096
32000,1.5022
33000,1.5005


Adding files tracked by Git LFS: ['runs/Jun04_13-23-35_1c0366a2fd20/1685885434.1947937/events.out.tfevents.1685885434.1c0366a2fd20.848.1', 'runs/Jun04_13-23-35_1c0366a2fd20/events.out.tfevents.1685885434.1c0366a2fd20.848.0']. This may take a bit of time if the files are large.


Adding files tracked by Git LFS: ['runs/Jun04_13-23-35_1c0366a2fd20/1685885434.1947937/events.out.tfevents.1685885434.1c0366a2fd20.848.1', 'runs/Jun04_13-23-35_1c0366a2fd20/events.out.tfevents.1685885434.1c0366a2fd20.848.0']. This may take a bit of time if the files are large.


In [None]:
# Gated on DO_TRAIN like the other model steps: 'model' is only stored when
# training is enabled, and an ungated step would fail looking it up.
@run(cond=DO_TRAIN)
def __push_model_to_hub(model: transformers.AutoModel, /):
  """Upload the model to the Hub repository REPOSITORY_NAME."""
  model.push_to_hub(
    REPOSITORY_NAME,
  )

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

# (end)