In [1]:
# !pip install -qqqU wandb datasets transformers

In [37]:
WANDB_PROJECT = "wandbot_llm"

RAW_TRAIN_DATASET_ARTIFACT = 'capecape/wandbot/run-m6nz6yrl-wandbot_questions:v0'
RAW_EVAL_DATASET_ARTIFACT  = "wandbot/wandbot-eval/run-kinbxic4-responses:v0"

# Preparing Data
How to prepare our dataset for model Finetune

## Formatting the data for the LLM

A big part of training LLMs lives in getting the data formatted correctly!

In [3]:
import wandb
import pandas as pd

let's create a run and monitor our work from there

In [4]:
# !pip install "protobuf<4.24.4"

In [5]:
wandb.init(project=WANDB_PROJECT, job_type="text_formatting")

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# this way we get tracebility
dataset_artifact = wandb.use_artifact(RAW_TRAIN_DATASET_ARTIFACT, type='run_table')
table = dataset_artifact.get("wandbot_questions")

[34m[1mwandb[0m:   1 of 1 files downloaded.  


this is a W&B table, so we can convert it to whatever format we may need

In [7]:
df = pd.DataFrame(table.data, columns=table.columns)
df = df.dropna()
df = df.assign(context_len = lambda df: df.page_content.str.len()/3.6)
df.head()

Unnamed: 0,context,question,answer,page_content,metadata,context_len
0,A user has just started using the Weights & Bi...,What is a 'run' in W&B and what can I use it for?,A 'run' in W&B is the fundamental unit that yo...,import Tabs from ‘@theme/Tabs’; \n\nimport Ta...,"{'file_type': '.md', 'language': 'en', 'source...",1291.111111
2,A user has just started using W&B and they are...,Hi! I'm new to W&B and I'm a bit stuck. Can yo...,"Sure, you can install the W&B library on your ...",import Tabs from ‘@theme/Tabs’; \n\nimport Ta...,"{'file_type': '.md', 'language': 'en', 'source...",1291.111111
4,The user is getting started with Weights and B...,I need to track my experiment's hyperparameter...,Certainly! You can pass your hyperparameters t...,import Tabs from ‘@theme/Tabs’; \n\nimport Ta...,"{'file_type': '.md', 'language': 'en', 'source...",1291.111111
5,A user is trying to do some mathematical opera...,How do I multiply two numbers using W&B functi...,"To multiply two numbers, you can use the `numb...",Value\n\n\nWhether the two values are not equ...,"{'file_type': '.md', 'language': 'en', 'source...",2186.111111
6,A user has been working on making sense of a d...,"Hi, I have some confusing numbers that represe...","Yes, there is a function in W&B that allows yo...",Value\n\n\nWhether the two values are not equ...,"{'file_type': '.md', 'language': 'en', 'source...",2186.111111


In [8]:
len(df)

2091

Let's prepare the training dataset now

If you use CodeLLama we need to format the instructions accordingly!

In [9]:
B_INST, E_INST = "[INST] ", " [/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
EOS = "</s>"

prompt_format = (
    B_INST
    + B_SYS
    + "You are an AI assistant designed to assist developers with everyday tasks related to Weight & Biases "
    + "and provide helpful information. As an expert in the open-source python SDK wandb answer the following "
    # + "question based on the context below. Answer in formatted Markdown.\n"
    + "question below. Answer in formatted Markdown.\n"
    + "{page_content}"
    + E_SYS
    + "{question}"
    + E_INST
    + "\n[W&B]\n"
    + "{answer}"
    + "\n[/W&B]"
    + EOS
)

def format_text(row):
    return prompt_format.format_map(row)

In [10]:
print(prompt_format)

[INST] <<SYS>>
You are an AI assistant designed to assist developers with everyday tasks related to Weight & Biases and provide helpful information. As an expert in the open-source python SDK wandb answer the following question below. Answer in formatted Markdown.
{page_content}
<</SYS>>

{question} [/INST]
[W&B]
{answer}
[/W&B]</s>


In [11]:
one_example = format_text(df.iloc[0])
print(one_example)

[INST] <<SYS>>
You are an AI assistant designed to assist developers with everyday tasks related to Weight & Biases and provide helpful information. As an expert in the open-source python SDK wandb answer the following question below. Answer in formatted Markdown.
import Tabs from ‘@theme/Tabs’;  

import TabItem from ‘@theme/TabItem’;


# Quickstart


Install W&B and start tracking your machine learning experiments in minutes.


## 1. Create an account and install W&B


Before you get started, make sure you create an account and install W&B:


1. Sign up for a free account at <https://wandb.ai/site> and then login to your wandb account.
2. Install the wandb library on your machine in a Python 3 environment using `pip`.



The following code snippets demonstrate how to install and log into W&B using the W&B CLI and Python Library:



Install the CLI and Python library for interacting with the Weights and Biases API:



```
pip install wandb

```


Install the CLI and Python library for

Let's compute the format over all the dataframe

In [12]:
df["text"] = df.apply(format_text, axis=1)

# print(df.iloc[200]["text"])

In [13]:
df.to_json("wandb_questions_ds.jsonl", orient='records', lines=True)

## Saving your work to W&B

We should log this to W&B so we can inspect the dataset interactively using W&B Tables

In [14]:
table = wandb.Table(dataframe=df)
wandb.log({"wandb_questions_ds": table})

# let's also save a the dataset at this stage
at = wandb.Artifact(
    name="wandb_questions_ds", 
    type="dataset",
    description="A wandbot dataset of questions and answers about W&B for training (non tokenized)",
    metadata={"prompt_format": prompt_format,
              "length": len(df),
             }
)
at.add_file("wandb_questions_ds.jsonl")
wandb.log_artifact(at)
wandb.finish()

VBox(children=(Label(value='43.532 MB of 59.734 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.7287…

## Tokenizing and saving the preprocessing
We can save time during training by pre-processing the dataset and loading directly a tokenized dataset!

In [18]:
MODEL_NAME = "codellama/CodeLlama-7b-Instruct-hf"

In [19]:
import wandb
from transformers import AutoTokenizer
from datasets import load_dataset

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)okenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

we can convert the data to a huggingface parquet-based dataset for fast loading

In [20]:
wandb.init(project=WANDB_PROJECT, job_type="tokenizing")
artifact = wandb.use_artifact('capecape/aws_llm_workshop/wandb_questions_ds:v0', type='dataset')
artifact_dir = artifact.download()

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [21]:
train_dataset = load_dataset(
    path=".", 
    data_files=f"{artifact_dir}/wandb_questions_ds.jsonl", 
    split="train")
train_dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['context', 'question', 'answer', 'page_content', 'metadata', 'context_len', 'text'],
    num_rows: 2091
})

one sample looks like this 👇

In [22]:
train_dataset = train_dataset.select_columns(["text"])

### Packing and chunking

We define some helper functions to pack our samples into sequences of a given length and then tokenize them.tokenizer

In [23]:
from random import randint
from itertools import chain
from functools import partial


# template dataset to add prompt to each sample
def template_dataset(sample):
    sample["text"] = f"{sample}{tokenizer.eos_token}"
    return sample


# apply prompt template per sample
train_dataset = train_dataset.map(template_dataset)
# print random sample
print(train_dataset[randint(0, len(train_dataset))]["text"])

# empty list to save remainder from batches to use in next batch
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

Map:   0%|          | 0/2091 [00:00<?, ? examples/s]

{'text': '[INST] <<SYS>>\nYou are an AI assistant designed to assist developers with everyday tasks related to Weight & Biasesand provide helpful information. As an expert in the open-source python SDK wandb answer the followingquestion below. Answer in formatted Markdown.\nimport Tabs from ‘@theme/Tabs’;  \n\nimport TabItem from ‘@theme/TabItem’;\n\n\n# Quickstart\n\n\nInstall W&B and start tracking your machine learning experiments in minutes.\n\n\n## 1. Create an account and install W&B\n\n\nBefore you get started, make sure you create an account and install W&B:\n\n\n1. Sign up for a free account at <https://wandb.ai/site> and then login to your wandb account.\n2. Install the wandb library on your machine in a Python 3 environment using `pip`.\n\n\n\nThe following code snippets demonstrate how to install and log into W&B using the W&B CLI and Python Library:\n\n\n\nInstall the CLI and Python library for interacting with the Weights and Biases API:\n\n\n\n```\npip install wandb\n\n`

In [24]:
def chunk(sample, chunk_length=1024):
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    # Concatenate all texts and add remainder from previous batch
    concatenated_examples = {k: list(chain(*sample[k])) for k in sample.keys()}
    concatenated_examples = {k: remainder[k] + concatenated_examples[k] for k in concatenated_examples.keys()}
    # get total number of tokens for batch
    batch_total_length = len(concatenated_examples[list(sample.keys())[0]])

    # get max number of chunks for batch
    if batch_total_length >= chunk_length:
        batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: concatenated_examples[k][batch_chunk_length:] for k in concatenated_examples.keys()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# tokenize and chunk dataset
lm_dataset = train_dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(train_dataset.features)
).map(
    partial(chunk, chunk_length=1024),
    batched=True,
)

# Print total number of samples
print(f"Total number of samples: {len(lm_dataset)}")

Map:   0%|          | 0/2091 [00:00<?, ? examples/s]

Map:   0%|          | 0/2091 [00:00<?, ? examples/s]

Total number of samples: 4907


## Save to a bucket and W&B

We are now going to use W&B Aritfacts

In [25]:
training_input_path = "./wandbot_train_ds"

lm_dataset.save_to_disk(training_input_path)

print("uploaded data to:")
print(f"training dataset to: {training_input_path}")

Saving the dataset (0/1 shards):   0%|          | 0/4907 [00:00<?, ? examples/s]

uploaded data to:
training dataset to: ./wandbot_train_ds


In [26]:
at = wandb.Artifact(
    name="wandbot_dataset_tokenized", 
    type="dataset",
    description="A wandbot dataset of questions and answers about W&B - CodeLLama tokenized",
    metadata={"model_name": MODEL_NAME, "tokenizer": MODEL_NAME},
)

In [27]:
at.add_dir(training_input_path)
wandb.log_artifact(at)

[34m[1mwandb[0m: Adding directory to artifact (./wandbot_train_ds)... Done. 0.1s


<Artifact wandbot_dataset_tokenized>

Let's finish this run

In [28]:
wandb.finish()

# Eval Dataset
We prepared a set of questions from `wandbot` that were gathered and curated by my colleague Ayush T. God's work here!

In [29]:
import json
import wandb

wandb.init(project=WANDB_PROJECT, job_type="eval_preprocessing")

In [38]:
question_artifacts = wandb.use_artifact(RAW_EVAL_DATASET_ARTIFACT, type='run_table')
question_artifacts.download()

[34m[1mwandb[0m:   1 of 1 files downloaded.  


'./artifacts/run-kinbxic4-responses:v0'

In [39]:
with open(question_artifacts.file()) as f:
    data = json.load(f)

columns = data["columns"]
data = data["data"]
eval_df = pd.DataFrame(data, columns=columns)

In [40]:
eval_df.head()

Unnamed: 0,query,retrieved_context,generated_response
0,Hey I have a question about using wandb with f...,"[`wandb.init()` returns a run object, and you ...",The initialization of `wandb.init()` should be...
1,Hey with wandb is it possible to link from the...,[### How do I best log models from runs in a s...,"Yes, you can link to the best run from a sweep..."
2,I am training a spacy textcat model. This proc...,[### Model Architecture\n\nOur config also def...,To log the best model's metrics instead of the...
3,Explain how I can version datasets with Weight...,[ # A 3-in-1 Intro to Weights & Biases: Comput...,Weights & Biases provides a feature called Art...
4,I'm using env = SubprocVecEnv(\n [mak...,[## WandbCallback 引数\n\n| 引数 | 使用法 | | --- | -...,To record a video for a specific subprocess en...


Let's remove retrieved Japanese text

### Clean up and prepare (pandas workout)

In [41]:
def contains_japanese(text):
    for char in text:
        if '一' <= char <= '龥':
            return True
    return False

In [42]:
s = "## WandbCallback 引数\n\n| 引数 | 使用法 |"

In [43]:
eval_df["retrieved_context_en"] = [[ctx for ctx in ctxs if not contains_japanese(ctx)] for ctxs in eval_df.retrieved_context.values]

In [44]:
eval_df.head()

Unnamed: 0,query,retrieved_context,generated_response,retrieved_context_en
0,Hey I have a question about using wandb with f...,"[`wandb.init()` returns a run object, and you ...",The initialization of `wandb.init()` should be...,"[`wandb.init()` returns a run object, and you ..."
1,Hey with wandb is it possible to link from the...,[### How do I best log models from runs in a s...,"Yes, you can link to the best run from a sweep...",[### How do I best log models from runs in a s...
2,I am training a spacy textcat model. This proc...,[### Model Architecture\n\nOur config also def...,To log the best model's metrics instead of the...,[### Model Architecture\n\nOur config also def...
3,Explain how I can version datasets with Weight...,[ # A 3-in-1 Intro to Weights & Biases: Comput...,Weights & Biases provides a feature called Art...,[ # A 3-in-1 Intro to Weights & Biases: Comput...
4,I'm using env = SubprocVecEnv(\n [mak...,[## WandbCallback 引数\n\n| 引数 | 使用法 | | --- | -...,To record a video for a specific subprocess en...,[## Basic Example\n\nThe W&B SB3 integration u...


In [45]:
eval_df["retrieved_context_stuff"] = ["\n".join(ctxs) for ctxs in eval_df.retrieved_context_en.values]

In [46]:
eval_df.head()

Unnamed: 0,query,retrieved_context,generated_response,retrieved_context_en,retrieved_context_stuff
0,Hey I have a question about using wandb with f...,"[`wandb.init()` returns a run object, and you ...",The initialization of `wandb.init()` should be...,"[`wandb.init()` returns a run object, and you ...","`wandb.init()` returns a run object, and you c..."
1,Hey with wandb is it possible to link from the...,[### How do I best log models from runs in a s...,"Yes, you can link to the best run from a sweep...",[### How do I best log models from runs in a s...,### How do I best log models from runs in a sw...
2,I am training a spacy textcat model. This proc...,[### Model Architecture\n\nOur config also def...,To log the best model's metrics instead of the...,[### Model Architecture\n\nOur config also def...,### Model Architecture\n\nOur config also defi...
3,Explain how I can version datasets with Weight...,[ # A 3-in-1 Intro to Weights & Biases: Comput...,Weights & Biases provides a feature called Art...,[ # A 3-in-1 Intro to Weights & Biases: Comput...,# A 3-in-1 Intro to Weights & Biases: Compute...
4,I'm using env = SubprocVecEnv(\n [mak...,[## WandbCallback 引数\n\n| 引数 | 使用法 | | --- | -...,To record a video for a specific subprocess en...,[## Basic Example\n\nThe W&B SB3 integration u...,## Basic Example\n\nThe W&B SB3 integration us...


In [47]:
eval_df = eval_df.assign(tokens = eval_df['retrieved_context_stuff'].str.len())

In [48]:
eval_df.head()

Unnamed: 0,query,retrieved_context,generated_response,retrieved_context_en,retrieved_context_stuff,tokens
0,Hey I have a question about using wandb with f...,"[`wandb.init()` returns a run object, and you ...",The initialization of `wandb.init()` should be...,"[`wandb.init()` returns a run object, and you ...","`wandb.init()` returns a run object, and you c...",15505
1,Hey with wandb is it possible to link from the...,[### How do I best log models from runs in a s...,"Yes, you can link to the best run from a sweep...",[### How do I best log models from runs in a s...,### How do I best log models from runs in a sw...,13161
2,I am training a spacy textcat model. This proc...,[### Model Architecture\n\nOur config also def...,To log the best model's metrics instead of the...,[### Model Architecture\n\nOur config also def...,### Model Architecture\n\nOur config also defi...,17812
3,Explain how I can version datasets with Weight...,[ # A 3-in-1 Intro to Weights & Biases: Comput...,Weights & Biases provides a feature called Art...,[ # A 3-in-1 Intro to Weights & Biases: Comput...,# A 3-in-1 Intro to Weights & Biases: Compute...,12755
4,I'm using env = SubprocVecEnv(\n [mak...,[## WandbCallback 引数\n\n| 引数 | 使用法 | | --- | -...,To record a video for a specific subprocess en...,[## Basic Example\n\nThe W&B SB3 integration u...,## Basic Example\n\nThe W&B SB3 integration us...,15480


In [49]:
eval_df = eval_df[["query", "generated_response", "retrieved_context_stuff", "tokens"]]

In [50]:
eval_df.columns = ["question", "answer", "retrieved_context", "char_len"]

In [51]:
eval_df = eval_df.rename({"retrieved_context": "page_content"}, axis=1)
eval_df.head()

Unnamed: 0,question,answer,page_content,char_len
0,Hey I have a question about using wandb with f...,The initialization of `wandb.init()` should be...,"`wandb.init()` returns a run object, and you c...",15505
1,Hey with wandb is it possible to link from the...,"Yes, you can link to the best run from a sweep...",### How do I best log models from runs in a sw...,13161
2,I am training a spacy textcat model. This proc...,To log the best model's metrics instead of the...,### Model Architecture\n\nOur config also defi...,17812
3,Explain how I can version datasets with Weight...,Weights & Biases provides a feature called Art...,# A 3-in-1 Intro to Weights & Biases: Compute...,12755
4,I'm using env = SubprocVecEnv(\n [mak...,To record a video for a specific subprocess en...,## Basic Example\n\nThe W&B SB3 integration us...,15480


## Save to W&B

Let's format the dataset in the same way we created the training dataset, we have to be consisten with naming
- We remove the answer, but we are going to keep it on the dataset to evaluate the model

In [52]:
B_INST, E_INST = "[INST] ", " [/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
EOS = "</s>"

eval_prompt_format = (
    B_INST
    + B_SYS
    + "You are an AI assistant designed to assist developers with everyday tasks related to Weight & Biases "
    + "and provide helpful information. As an expert in the open-source python SDK wandb answer the following "
    # + "question based on the context below. Answer in formatted Markdown.\n"
    + "question below. Answer in formatted Markdown.\n"
    + "{page_content}"
    + E_SYS
    + "{question}"
    + E_INST
    + "\n[W&B]\n"
    # + "{answer}"
    # + "\n[/W&B]"
    # + EOS
)

def eval_format_text(row):
    return eval_prompt_format.format_map(row)

In [53]:
eval_df["text"] = eval_df.apply(eval_format_text, axis=1)

Save to disk and create HF dataset

In [54]:
eval_df.to_json("wandbot_eval.jsonl", orient='records', lines=True)
eval_dataset = load_dataset(".", data_files="wandbot_eval.jsonl")
eval_dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'page_content', 'char_len', 'text'],
        num_rows: 132
    })
})

In [57]:
eval_input_path = "./wandbot_eval_ds"

In [58]:
eval_dataset.save_to_disk(eval_input_path)

Saving the dataset (0/1 shards):   0%|          | 0/132 [00:00<?, ? examples/s]

In [59]:
table = wandb.Table(dataframe=eval_df)
wandb.log({"wandbot_eval_dataset": table})



In [60]:
at = wandb.Artifact(
    name="wandbot_eval_dataset", 
    type="dataset",
    description="A wandbot dataset of questions and answers about W&B for evaluation",
    metadata={"prompt_format": eval_prompt_format,
              "length": len(eval_dataset),
             })

In [61]:
at.add_dir(eval_input_path)
wandb.log_artifact(at)

[34m[1mwandb[0m: Adding directory to artifact (./wandbot_eval_ds)... Done. 0.0s


<Artifact wandbot_eval_dataset>

In [62]:
wandb.finish()

VBox(children=(Label(value='12.082 MB of 12.082 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, m…