<a href="https://colab.research.google.com/github/lmassaron/fine-tuning-workshop/blob/main/synthetic_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Environment setup: installs unsloth, vLLM, synthetic-data-kit, wikipedia-api
# and pinned transformers / trl releases. `%%capture` silences the cell output.
import os
!pip install --upgrade -qqq uv
# Colab is detected by searching for "COLAB_" in the concatenated names of all
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install!
    !pip install unsloth vllm synthetic-data-kit==0.0.3
else:
    # Pin numpy to the version that is already installed; fall back to an
    # unpinned "numpy" requirement when the import fails.
    try: import numpy; get_numpy = f"numpy=={numpy.__version__}"
    except: get_numpy = "numpy"
    # A GPU counts as a T4 when "Tesla T4" shows up in the `nvidia-smi` output;
    # any failure to run the command is treated as "not a T4".
    try: import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
    except: is_t4 = False
    # T4 runtimes get vllm 0.9.2 with triton 3.2.0; everything else gets
    # vllm 0.10.2 with an unpinned triton.
    # NOTE(review): presumably a hardware-compatibility constraint — confirm.
    get_vllm, get_triton = ("vllm==0.9.2", "triton==3.2.0") if is_t4 else ("vllm==0.10.2", "triton")
    !uv pip install -qqq --upgrade         unsloth {get_vllm} {get_numpy} torchvision bitsandbytes xformers
    !uv pip install -qqq {get_triton}
    !uv pip install synthetic-data-kit==0.0.3
# Installed in both environments: pinned transformers, trl without its
# dependencies, and the Wikipedia client imported later as `wikipediaapi`.
!uv pip install transformers==4.55.4
!uv pip install --no-deps trl==0.22.2
!uv pip install wikipedia-api

In [2]:
%%capture
# NOTE(review): this cell repeats the install cell directly above it, minus the
# synthetic-data-kit and wikipedia-api packages. It looks redundant — confirm
# whether it can be removed.
import os
!pip install --upgrade -qqq uv
# Colab is detected by searching for "COLAB_" in the concatenated names of all
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install!
    !pip install unsloth vllm
else:
    # Pin numpy to the version that is already installed; fall back to an
    # unpinned "numpy" requirement when the import fails.
    try: import numpy; get_numpy = f"numpy=={numpy.__version__}"
    except: get_numpy = "numpy"
    # A GPU counts as a T4 when "Tesla T4" shows up in the `nvidia-smi` output;
    # any failure to run the command is treated as "not a T4".
    try: import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
    except: is_t4 = False
    # T4 runtimes get vllm 0.9.2 with triton 3.2.0; everything else gets
    # vllm 0.10.2 with an unpinned triton.
    get_vllm, get_triton = ("vllm==0.9.2", "triton==3.2.0") if is_t4 else ("vllm==0.10.2", "triton")
    !uv pip install -qqq --upgrade \
        unsloth {get_vllm} {get_numpy} torchvision bitsandbytes xformers
    !uv pip install -qqq {get_triton}
# Installed in both environments: pinned transformers and trl without its
# dependencies.
!uv pip install transformers==4.55.4
!uv pip install --no-deps trl==0.22.2

In [3]:
import os
import time
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import wikipediaapi
from unsloth.dataprep import SyntheticDataKit
import huggingface_hub
from collections import Counter
import itertools
from datasets import Dataset
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 09-24 17:40:17 [__init__.py:216] Automatically detected platform cuda.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# Pre-compiled once at import time. Matches either a non-greedy `{...}` group
# or a stray closing brace left behind by nested groups such as `{{...}}`.
BRACES_PATTERN = re.compile(r'\{.*?\}|\}')

def remove_braces_and_content(text):
    """Return `text` with every `{...}` group and every stray `}` removed.

    The pattern's `.` does not match newlines, so a group that spans several
    lines is only removed once the line breaks have been collapsed.
    """
    return BRACES_PATTERN.sub('', text)

def clean_string(input_string):
    """Normalise whitespace and strip brace-delimited markup from a string.

    Args:
        input_string: Raw text (for example a Wikipedia summary or section).

    Returns:
        The text with brace groups removed and every run of whitespace
        (spaces, tabs, carriage returns, newlines) collapsed to a single
        space, without leading or trailing whitespace.
    """
    # Collapse whitespace first so that brace groups spanning several lines end
    # up on one line and can be matched by BRACES_PATTERN. `split()` already
    # consumes carriage returns, so no separate `\r` handling is needed.
    single_line = ' '.join(input_string.split())

    # Remove all occurrences of curly braces and their content
    without_braces = remove_braces_and_content(single_line)

    # Deleting a group such as "a {x} b" leaves two adjacent spaces (or a
    # leading/trailing one), so the whitespace is normalised a second time.
    return ' '.join(without_braces.split())

In [5]:
def extract_wikipedia_pages(wiki_wiki, category_name):
    """List the titles of all members of a Wikipedia category.

    `wiki_wiki` is the client used to look up the page named
    "Category:<category_name>". The result is a list with the title of every
    category member, or an empty list when the category page does not exist.
    """
    category_page = wiki_wiki.page("Category:" + category_name)

    # A missing category simply contributes no titles.
    if not category_page.exists():
        return []

    return [member.title for member in category_page.categorymembers.values()]

In [6]:
def get_wikipedia_pages(categories):
    """Collect cleaned text passages from the Wikipedia pages of some categories.

    The crawl has two passes: every category in `categories` is listed, then
    every subcategory discovered during that first pass is listed as well.
    Subcategories nested deeper than that are not followed. For each page
    found, the summary and every section in `page.sections` whose text is
    longer than the page title is cleaned with `clean_string` and stored as
    "<title> : <text>". Pages whose summary contains "Biden" or "Trump" are
    skipped entirely.

    Args:
        categories: Iterable of category names without the "Category:" prefix.
            A name that occurs more than once is only queried once.

    Returns:
        A list of strings, one per retained summary or section, in the order
        the pages were discovered.
    """
    category_prefix = "Category:"

    # Create a Wikipedia object
    wiki_wiki = wikipediaapi.Wikipedia('Gemma AI Assistant (gemma@example.com)', 'en')

    # Categories already queried. Names are compared with underscores replaced
    # by spaces because MediaWiki treats both spellings as the same title.
    explored_categories = set()
    # Dicts are used as insertion-ordered sets: membership tests are O(1) and
    # the resulting page order is reproducible from one run to the next.
    subcategories = {}
    wikipedia_pages = {}

    # First pass: list the requested categories, separating pages from subcategories
    print("- Processing Wikipedia categories:")
    for category_name in categories:
        category_key = category_name.replace("_", " ")
        if category_key in explored_categories:
            continue
        explored_categories.add(category_key)
        print(f"\tExploring {category_name} on Wikipedia")

        for member in extract_wikipedia_pages(wiki_wiki, category_name):
            if member.startswith(category_prefix):
                subcategories[member[len(category_prefix):]] = None
            else:
                wikipedia_pages[member] = None

    # Second pass: list the subcategories found above. Categories nested inside
    # them are ignored, so the crawl stops after one level of subcategories.
    for category_name in subcategories:
        category_key = category_name.replace("_", " ")
        if category_key in explored_categories:
            continue
        explored_categories.add(category_key)
        print(f"\tExploring {category_name} on Wikipedia")

        for member in extract_wikipedia_pages(wiki_wiki, category_name):
            if not member.startswith(category_prefix):
                wikipedia_pages[member] = None

    # Initialize a list to store extracted texts
    extracted_texts = []

    # Iterate through each Wikipedia page
    print("- Processing Wikipedia pages:")
    for page_title in tqdm(list(wikipedia_pages)):
        try:
            page = wiki_wiki.page(page_title)
            summary = page.summary

            # Skip pages whose summary contains the filtered keywords
            if "Biden" in summary or "Trump" in summary:
                continue

            # Keep the summary only when it carries more than the bare title
            if len(summary) > len(page.title):
                extracted_texts.append(page.title + " : " + clean_string(summary))

            # Same rule for the text of each section
            for section in page.sections:
                if len(section.text) > len(page.title):
                    extracted_texts.append(page.title + " : " + clean_string(section.text))

        except Exception as e:
            # Report the title that was requested: `page` may be unbound (or
            # still refer to the previous page) when the lookup itself fails.
            print(f"Error processing page {page_title}: {e}")

    # Return the extracted texts
    return extracted_texts

In [7]:
# Wikipedia category names (without the "Category:" prefix) used as crawl seeds.
# Each name appears once: repeating a category only repeats the same request.
categories = [
    "Sherlock_Holmes",
    "Arthur_Conan_Doyle",
    "A_Scandal_in_Bohemia",
    "The_Adventures_of_Sherlock_Holmes",
    "A_Study_in_Scarlet",
    "The_Sign_of_the_Four",
    "The_Memoirs_of_Sherlock_Holmes",
    "The_Hound_of_the_Baskervilles",
    "The_Return_of_Sherlock_Holmes",
    "The_Valley_of_Fear",
    "His_Last_Bow",
    "The_Case-Book_of_Sherlock_Holmes",
    "Canon_of_Sherlock_Holmes",
    "Dr._Watson",
    "221B_Baker_Street",
    "Mrs._Hudson",
    "Professor_Moriarty",
    "The_Strand_Magazine",
    "Minor_Sherlock_Holmes_characters",
    "Inspector_Lestrade",
    "Mycroft_Holmes",
    "Irene_Adler",
    "Colonel_Moran",
    "Baker_Street_Irregulars",
    "Giant_rat_of_Sumatra",
    "The_Story_of_the_Lost_Special",
    "How_Watson_Learned_the_Trick",
    "Diogenes_Club",
    "The_Dynamics_of_an_Asteroid",
    "Reichenbach_Falls",
    "A_Treatise_on_the_Binomial_Theorem",
    "Sherlockian_game",
    "List_of_Holmesian_studies",
    "The_New_Annotated_Sherlock_Holmes",
    "The_Private_Life_of_Sherlock_Holmes_(book)",
    "The_Great_Detective_(book)",
    "Naked_Is_the_Best_Disguise",
    "Sherlock_Holmes_fandom",
    "Sherlockiana",
    "Sherlock_Holmes_Museum",
    "The_Sherlock_Holmes",
    "The_Baker_Street_Irregulars",
    "The_Baker_Street_Journal",
    "Sidney_Paget",
    "Undershaw",
    "Adaptations_of_Sherlock_Holmes",
    "Sherlock_Holmes_pastiches",
    "Popular_culture_references_to_Sherlock_Holmes",
]
extracted_texts = get_wikipedia_pages(categories)
# `extracted_texts` holds one entry per page summary or section, so its length
# is a number of text passages, not a number of pages.
print("Found", len(extracted_texts), "text passages from Wikipedia pages")

- Processing Wikipedia categories:
	Exploring Sherlock_Holmes on Wikipedia
	Exploring Arthur_Conan_Doyle on Wikipedia
	Exploring A_Scandal_in_Bohemia on Wikipedia
	Exploring The_Adventures_of_Sherlock_Holmes on Wikipedia
	Exploring A_Study_in_Scarlet on Wikipedia
	Exploring The_Sign_of_the_Four on Wikipedia
	Exploring The_Memoirs_of_Sherlock_Holmes on Wikipedia
	Exploring The_Hound_of_the_Baskervilles on Wikipedia
	Exploring The_Return_of_Sherlock_Holmes on Wikipedia
	Exploring The_Valley_of_Fear on Wikipedia
	Exploring His_Last_Bow on Wikipedia
	Exploring The_Case-Book_of_Sherlock_Holmes on Wikipedia
	Exploring Canon_of_Sherlock_Holmes on Wikipedia
	Exploring Dr._Watson on Wikipedia
	Exploring 221B_Baker_Street on Wikipedia
	Exploring Mrs._Hudson on Wikipedia
	Exploring Professor_Moriarty on Wikipedia
	Exploring The_Strand_Magazine on Wikipedia
	Exploring Minor_Sherlock_Holmes_characters on Wikipedia
	Exploring Inspector_Lestrade on Wikipedia
	Exploring Mycroft_Holmes on Wikipedia
	Ex

100%|██████████| 779/779 [01:38<00:00,  7.87it/s]

Found 3591 Wikipedia pages





In [8]:
# Write every extracted passage to its own file: data/output/sherlock_<k>.txt
output_dir = 'data/output'
os.makedirs(output_dir, exist_ok=True)

for k, text in enumerate(extracted_texts):
    file_path = os.path.join(output_dir, f'sherlock_{k}.txt')
    # Wikipedia text contains non-ASCII characters. Without an explicit
    # encoding, `open` uses the platform default, which can fail or produce
    # files that differ from one machine to another.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)

print("All texts have been saved successfully.")

All texts have been saved successfully.


In [9]:
# Input files for QA generation. Only the `.txt` files are kept and the list is
# sorted: `os.listdir` returns entries in arbitrary order and may include stray
# entries (for example checkpoint folders), while later cells assume exactly
# one generated file per entry of `filenames`.
filenames = sorted(
    f"data/output/{file}"
    for file in os.listdir("data/output")
    if file.endswith(".txt")
)

In [10]:
# Create the SyntheticDataKit generator for the chosen model. According to the
# recorded output, this step loads the model through vLLM and starts a vLLM
# API server.
generator = SyntheticDataKit.from_pretrained(
    # Choose any model from https://huggingface.co/unsloth
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = 2048, # Longer sequence lengths will be slower!
)

config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

INFO 09-24 17:42:31 [vllm_utils.py:688] Unsloth: Patching vLLM v1 graph capture
INFO 09-24 17:42:31 [vllm_utils.py:716] Unsloth: Patching vLLM v0 graph capture
Unsloth: Using dtype = torch.bfloat16 for vLLM.
Unsloth: vLLM loading unsloth/Llama-3.2-3B-Instruct with actual GPU utilization = 89.06%
Unsloth: Your GPU has CUDA compute capability 8.0 with VRAM = 39.56 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 320.
Unsloth: vLLM's KV Cache can use up to 29.25 GB. Also swap space = 6 GB.
Unsloth: Not an error, but `device` is not supported in vLLM. Skipping.
vLLM STDOUT: INFO 09-24 17:42:40 [__init__.py:216] Automatically detected platform cuda.
vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO 09-24 17:42:43 [api_server.py:1896] vLLM API server version 0.10.2
vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO 09-24 17:42:43 [utils.py:328] non-default args: {'model_tag': 'unsloth/Llama-3.2-3B-Instruct', 'model': 'unsloth/Llama-3.2-3B-Instruct'

In [11]:
# Configure question/answer generation: output folder, sampling parameters
# and chunking.
# NOTE(review): presumably this writes the synthetic_data_kit_config.yaml used
# by the CLI calls below — confirm.
generator.prepare_qa_generation(
    output_folder = "data", # Output location of synthetic data
    temperature = 0.7, # Higher temp makes more diverse datasets
    top_p = 0.95,
    overlap = 64, # Overlap portion during chunking
    max_generation_tokens = 512, # Can increase for longer QA pairs
)

In [12]:
# Check that the synthetic-data-kit CLI can reach the vLLM server before
# generating any data (the recorded output reports the server at
# http://localhost:8000/v1).
!synthetic-data-kit system-check

vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:     127.0.0.1:47084 - "GET /v1/models HTTP/1.1" 200 OK
[?25l[32m VLLM server is running at [0m[4;94mhttp://localhost:8000/v1[0m
[32m⠋[0m[32m Checking VLLM server at http://localhost:8000/v1...[0m[2KAvailable models: [1m{[0m[32m'object'[0m: [32m'list'[0m, [32m'data'[0m: [1m[[0m[1m{[0m[32m'id'[0m: 
[32m'unsloth/Llama-3.2-3B-Instruct'[0m, [32m'object'[0m: [32m'model'[0m, [32m'created'[0m: [1;36m1758735861[0m, 
[32m'owned_by'[0m: [32m'vllm'[0m, [32m'root'[0m: [32m'unsloth/Llama-3.2-3B-Instruct'[0m, [32m'parent'[0m: [3;35mNone[0m, 
[32m'max_model_len'[0m: [1;36m2048[0m, [32m'permission'[0m: [1m[[0m[1m{[0m[32m'id'[0m: 
[32m'modelperm-0f2af01cfc7444fe9ca48675c5ee7fe3'[0m, [32m'object'[0m: [32m'model_permission'[0m, 
[32m'created'[0m: [1;36m1758735861[0m, [32m'allow_create_engine'[0m: [3;91mFalse[0m, [32m'allow_sampling'[0m: [3;92mTrue[0m, 
[32m'allow_logprobs'[0m

In [13]:
# Generate question/answer pairs for every text file listed in `filenames`.
# Each file is passed to the synthetic-data-kit CLI (`create ... --type "qa"`
# with `--num-pairs 25`) using the settings in synthetic_data_kit_config.yaml.
# The recorded output shows results being saved under data/generated/.
for filename in filenames:
    !synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create {filename} \
        --num-pairs 25 \
        --type "qa"
    time.sleep(2) # Sleep some time to leave some room for processing

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[2KSaving result to data/generated/sherlock_1510_qa_pairs.json
[2KSuccessfully wrote test file to data/generated/test_write.json
[2KSuccessfully wrote result to data/generated/sherlock_1510_qa_pairs.json
[2K[32m⠼[0m Generating qa content from data/output/sherlock_1510.txt...
[1A[2K[32m Content saved to [0m[1;32mdata/generated/sherlock_1510_qa_pairs.json[0m
vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:     127.0.0.1:49390 - "GET /v1/models HTTP/1.1" 200 OK
vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:     127.0.0.1:49402 - "GET /v1/models HTTP/1.1" 200 OK
[2K[32m⠏[0m Generating qa content from data/output/sherlock_2286.txt...vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:     127.0.0.1:49416 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2KProcessing 1 chunks to generate QA pairs...
[2K[32m⠹[0m Generating qa content from data/output/sherlock_2286.txt...vLLM STDOUT: [1;36m(APIServer pi

In [None]:
# Optional curation step; set to False to skip it.
QUALITY_CHECK = True

if QUALITY_CHECK:
    # The paths are rebuilt from indices, which assumes the generated files are
    # named sherlock_0 ... sherlock_<n-1>, one per entry of `filenames`.
    qa_pairs_filenames = [
        f"data/generated/sherlock_{i}_qa_pairs.json"
        for i in range(len(filenames))
    ]
    # Run `curate --threshold 5.0` on every generated file. The recorded output
    # shows pairs being rated and the retained ones saved under
    # data/cleaned/<name>_cleaned.json.
    # NOTE(review): the later `save-as` cell reads from data/generated/, not
    # data/cleaned/, so the curated files do not appear to be used for the
    # final dataset — confirm whether that is intended.
    for filename in qa_pairs_filenames:
        !synthetic-data-kit \
            -c synthetic_data_kit_config.yaml \
            curate --threshold 5.0 \
            {filename}

vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:     127.0.0.1:40438 - "GET /v1/models HTTP/1.1" 200 OK
vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:     127.0.0.1:40442 - "GET /v1/models HTTP/1.1" 200 OK
[?25lProcessing 2 batches of QA pairs...
[2K[32m⠙[0m Cleaning content from data/generated/sherlock_0_qa_pairs.json...vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:     127.0.0.1:40444 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2K[32m⠏[0m Cleaning content from data/generated/sherlock_0_qa_pairs.json...vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:     127.0.0.1:43598 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2KBatch processing complete.
[2KRated 15 QA pairs
[2KRetained 7 pairs (threshold: 5.0)
[2KAverage score: 4.4
[2K[32m⠙[0m Cleaning content from data/generated/sherlock_0_qa_pairs.json...
[1A[2K[32m Cleaned content saved to [0m[1;32mdata/cleaned/sherlock_0_qa_pairs_cleaned.json[0m
vLLM STDOUT: [1;36m(APIServer pid=3853)[0;0m INFO:

In [2]:
# NOTE(review): looks like a leftover exploratory cell (its execution count is
# out of sequence); nothing in the visible cells uses sample_data — confirm it
# can be removed.
ls sample_data

[0m[01;32manscombe.json[0m*                mnist_test.csv
california_housing_test.csv   mnist_train_small.csv
california_housing_train.csv  [01;32mREADME.md[0m*


In [None]:
# Convert every generated QA file with `save-as ... -f ft`. The paths are
# rebuilt from indices, which assumes the files are named
# sherlock_0 ... sherlock_<n-1>, one per entry of `filenames`.
# NOTE(review): the inputs come from data/generated/, so the output of the
# `curate` step (data/cleaned/) is not used here — confirm this is intended.
qa_pairs_filenames = [
    f"data/generated/sherlock_{i}_qa_pairs.json"
    for i in range(len(filenames))
]
for filename in qa_pairs_filenames:
    !synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        save-as {filename} -f ft

In [None]:
# Load every converted file from data/final into one DataFrame.
# The listing is sorted because `os.listdir` returns entries in arbitrary
# order, which would make the row order differ between runs. Non-JSON entries
# (for example checkpoint folders) are skipped so `read_json` never sees them.
final_filenames = sorted(
    name for name in os.listdir("data/final") if name.endswith(".json")
)

conversations = pd.concat(
    [pd.read_json(f"data/final/{name}") for name in final_filenames]
).reset_index(drop=True)

In [None]:
# Flatten the "content" field of every message of every conversation into a
# single list, keeping the original order.
all_contents = [
    message["content"]
    for conversation in conversations["messages"]
    for message in conversation
]

# Count identical contents.
content_counts = Counter(all_contents)

# (content, count) pairs for all contents, most frequent first.
most_common_content = content_counts.most_common()

In [None]:
# Show the 50 most frequent message contents together with their counts
# (presumably a spot check for repeated questions or answers).
print(most_common_content[:50])

In [None]:
# Convert the concatenated pandas DataFrame into a Hugging Face `Dataset`.
dataset = Dataset.from_pandas(conversations)

In [None]:
# Wrap the dataset in a DatasetDict whose only split is "train".
final_dataset = DatasetDict({'train': dataset})

# Heading and object are printed on separate lines.
print("\nFinal Hugging Face Dataset object:", final_dataset, sep="\n")

# Show the first training record as a sanity check.
print("\nExample from the training set:", final_dataset['train'][0], sep="\n")

In [None]:
# Log in to the Hugging Face Hub with a token stored in Colab secrets.
# The login stays best-effort (a failure never aborts the notebook), but the
# reason is now printed instead of being silently discarded.
try:
    from huggingface_hub import login
    from google.colab import userdata

    # Retrieve your Hugging Face token from Colab's secrets manager
    # The name 'HF_TOKEN' should match the name you used in the secrets tab
    hf_token = userdata.get('HF_TOKEN')

    # Check if the token was successfully retrieved
    if hf_token:
        # Log in to Hugging Face using the retrieved token
        # The `add_to_git_credential=True` argument is optional and useful if you plan to push models to the Hub
        login(token=hf_token, add_to_git_credential=True)
        print("Hugging Face login successful using Google Colab secrets!")
    else:
        print("Error: HF_TOKEN not found in Google Colab secrets or is empty.")
        print("Please ensure you have created a secret named 'HF_TOKEN' in the 'Secrets' tab (🔑) on the left sidebar.")
except ImportError as import_error:
    # Raised when `google.colab` (or `huggingface_hub`) cannot be imported,
    # for example when the notebook runs outside Colab.
    print(f"Skipping Hugging Face login via Colab secrets: {import_error}")
except Exception as login_error:
    # Any other failure (secret lookup, login call) is reported. Catching
    # `Exception` rather than using a bare `except` lets KeyboardInterrupt
    # through.
    print(f"Hugging Face login failed: {login_error}")

In [None]:
# Target dataset repository on the Hugging Face Hub, in "<namespace>/<name>"
# form. Change it to a repository you are allowed to write to.
repo_id = "lmassaron/Sherlock_QA"

print(f"\nUploading dataset to the Hub at {repo_id}...")

# This command uploads the dataset. It will create the repo if it doesn't exist.
final_dataset.push_to_hub(repo_id)

print("Upload complete!")

In [None]:
# Archive the whole data/ directory (recursively) into my_data.zip.
!zip -r my_data.zip data

In [None]:
# Offer the archive as a browser download when running in Google Colab.
# Outside Colab the `google.colab` module does not exist, so the cell reports
# where the archive is instead of failing with an ImportError.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab: my_data.zip was left in the current working directory.")
else:
    files.download('my_data.zip')