<a href="https://colab.research.google.com/github/lmassaron/fine-tuning-workshop/blob/main/02_synthetic_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Environment setup: installs uv first, then Unsloth, vLLM, synthetic-data-kit,
# pinned transformers/trl releases and wikipedia-api. %%capture hides the
# installer output.
import os
!pip install --upgrade -qqq uv
# Colab is detected by looking for "COLAB_" in the concatenated names of the
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install!
    !pip install unsloth vllm synthetic-data-kit==0.0.3
else:
    # Pass the already-installed numpy version as an explicit pin; fall back to
    # an unpinned "numpy" requirement if numpy cannot be imported.
    try: import numpy; get_numpy = f"numpy=={numpy.__version__}"
    except: get_numpy = "numpy"
    # A Tesla T4 is detected from the nvidia-smi output; any failure to run
    # nvidia-smi is treated as "not a T4".
    try: import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
    except: is_t4 = False
    # T4 runtimes get vllm 0.9.2 with triton 3.2.0; everything else gets
    # vllm 0.10.2 with an unpinned triton.
    get_vllm, get_triton = ("vllm==0.9.2", "triton==3.2.0") if is_t4 else ("vllm==0.10.2", "triton")
    !uv pip install -qqq --upgrade         unsloth {get_vllm} {get_numpy} torchvision bitsandbytes xformers
    !uv pip install -qqq {get_triton}
    !uv pip install synthetic-data-kit==0.0.3
# Installed in both environments, after the branch above.
!uv pip install transformers==4.55.4
!uv pip install --no-deps trl==0.22.2
!uv pip install wikipedia-api

In [2]:
%%capture
# NOTE(review): this cell repeats the setup cell above, minus the
# synthetic-data-kit and wikipedia-api installs. It looks redundant when the
# notebook is run top to bottom; confirm before removing.
import os
!pip install --upgrade -qqq uv
# Colab is detected by looking for "COLAB_" in the concatenated names of the
# environment variables.
if "COLAB_" not in "".join(os.environ.keys()):
    # If you're not in Colab, just use pip install!
    !pip install unsloth vllm
else:
    # Pass the already-installed numpy version as an explicit pin; fall back to
    # an unpinned "numpy" requirement if numpy cannot be imported.
    try: import numpy; get_numpy = f"numpy=={numpy.__version__}"
    except: get_numpy = "numpy"
    # A Tesla T4 is detected from the nvidia-smi output; any failure to run
    # nvidia-smi is treated as "not a T4".
    try: import subprocess; is_t4 = "Tesla T4" in str(subprocess.check_output(["nvidia-smi"]))
    except: is_t4 = False
    # T4 runtimes get vllm 0.9.2 with triton 3.2.0; everything else gets
    # vllm 0.10.2 with an unpinned triton.
    get_vllm, get_triton = ("vllm==0.9.2", "triton==3.2.0") if is_t4 else ("vllm==0.10.2", "triton")
    !uv pip install -qqq --upgrade \
        unsloth {get_vllm} {get_numpy} torchvision bitsandbytes xformers
    !uv pip install -qqq {get_triton}
# Installed in both environments, after the branch above.
!uv pip install transformers==4.55.4
!uv pip install --no-deps trl==0.22.2

In [3]:
import os
import time
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import wikipediaapi
from unsloth.dataprep import SyntheticDataKit
import huggingface_hub
from collections import Counter
import itertools
from datasets import Dataset
import pandas as pd
from datasets import Dataset, DatasetDict, ClassLabel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
INFO 09-25 08:02:33 [__init__.py:244] Automatically detected platform cuda.
ERROR 09-25 08:02:36 [fa_utils.py:57] Cannot use FA version 2 is not supported due to FA2 is only supported on devices with compute capability >= 8
🦥 Unsloth Zoo will now patch everything to make training faster!


In [4]:
# Demo switch read by later cells: when True, only the "Sherlock_Holmes"
# category is crawled, saving text files stops early, and the final upload to
# the Hugging Face Hub is skipped.
DEMO = True

In [5]:
# Matches either a "{...}" group (shortest match, without crossing a newline)
# or a stray closing brace. Compiled once because it is applied to every text.
BRACES_PATTERN = re.compile(r'\{.*?\}|\}')

def remove_braces_and_content(text):
    """Strip curly-brace groups and leftover closing braces from ``text``.

    Each ``{...}`` group is removed together with its content, using the
    shortest possible match. Any ``}`` that remains afterwards is removed too.
    An opening ``{`` without a matching ``}`` is left untouched.

    Args:
        text: The string to process.

    Returns:
        str: ``text`` with the matched spans deleted. Surrounding whitespace is
        not adjusted.
    """
    return BRACES_PATTERN.sub('', text)

def clean_string(input_string):
    """Normalise whitespace and drop curly-brace markup from ``input_string``.

    Args:
        input_string: Raw text, for example a Wikipedia summary or section.

    Returns:
        str: The text with every run of whitespace collapsed to a single
        space, brace groups removed, and no leading or trailing whitespace.
    """
    # str.split() with no separator splits on any whitespace run (spaces, tabs,
    # \r, \n), so joining with single spaces also puts the text on one line.
    # That is why no separate carriage-return handling is needed, and it lets
    # the brace pattern (whose "." does not cross newlines) see whole groups.
    collapsed = ' '.join(input_string.split())

    without_braces = remove_braces_and_content(collapsed)

    # Deleting a brace group can leave a doubled or trailing space behind
    # ("a {b} c" -> "a  c"), so normalise the whitespace once more.
    return ' '.join(without_braces.split())

In [6]:
def extract_wikipedia_pages(wiki_wiki, category_name):
    """Return the titles of all members of a Wikipedia category.

    Args:
        wiki_wiki: Object whose ``page(title)`` method returns a page exposing
            ``exists()`` and a ``categorymembers`` mapping.
        category_name: Category name without the "Category:" prefix.

    Returns:
        list: The ``title`` of every member, in the order the mapping yields
        them, or an empty list when the category does not exist. Titles are
        returned unchanged.
    """
    category_page = wiki_wiki.page("Category:" + category_name)

    # A category that does not exist contributes no titles.
    if not category_page.exists():
        return []

    return [member.title for member in category_page.categorymembers.values()]

In [7]:
def get_wikipedia_pages(categories):
    """Collect cleaned text snippets from the pages of Wikipedia categories.

    Every category in ``categories`` is listed, and then every sub-category
    found directly inside them is listed as well. Sub-categories discovered
    while listing a sub-category are NOT followed, so the crawl stops one level
    below the requested categories.

    For each page found, the summary and each entry of ``page.sections``
    become one snippet of the form ``"<page title> : <cleaned text>"``. Pages
    whose summary mentions "Biden" or "Trump" are skipped entirely, and any
    text that is not longer than the page title is dropped.

    Args:
        categories: Iterable of category names without the "Category:" prefix.

    Returns:
        list[str]: The extracted snippets, in page-discovery order. A page that
        raises while being processed is reported on stdout and skipped; any
        snippets already taken from it are kept.
    """

    # Create a Wikipedia object
    wiki_wiki = wikipediaapi.Wikipedia('Gemma AI Assistant (gemma@example.com)', 'en')

    # Category names already listed, so a repeated name is not fetched twice
    explored_categories = set()
    # Titles of every member (pages and sub-categories) of the given categories
    category_members = []

    # List each requested category
    print("- Processing Wikipedia categories:")
    for category_name in categories:
        if category_name in explored_categories:
            continue
        explored_categories.add(category_name)
        print(f"\tExploring {category_name} on Wikipedia")
        category_members.extend(extract_wikipedia_pages(wiki_wiki, category_name))

    # Split the members into sub-categories to visit and actual pages.
    categories_to_explore = [
        item.replace("Category:", "") for item in category_members if "Category:" in item
    ]
    # dict.fromkeys removes duplicates while keeping discovery order; a set
    # would make the page order (and hence the saved files) vary between runs.
    wikipedia_pages = list(
        dict.fromkeys(item for item in category_members if "Category:" not in item)
    )
    # Set mirror of wikipedia_pages for constant-time membership checks
    seen_pages = set(wikipedia_pages)

    # Visit the direct sub-categories (one level only, see docstring)
    while categories_to_explore:
        category_name = categories_to_explore.pop()
        if category_name in explored_categories:
            continue
        explored_categories.add(category_name)
        print(f"\tExploring {category_name} on Wikipedia")

        for ref in extract_wikipedia_pages(wiki_wiki, category_name):
            # Nested sub-categories are not queued for exploration.
            if "Category:" in ref:
                continue
            if ref not in seen_pages:
                seen_pages.add(ref)
                wikipedia_pages.append(ref)

    # Initialize a list to store extracted texts
    extracted_texts = []

    # Iterate through each Wikipedia page
    print("- Processing Wikipedia pages:")
    for page_title in tqdm(wikipedia_pages):
        try:
            page = wiki_wiki.page(page_title)

            # Skip pages whose summary contains the filtered keywords
            if "Biden" not in page.summary and "Trump" not in page.summary:
                # Keep the summary only if it is longer than the title itself
                if len(page.summary) > len(page.title):
                    extracted_texts.append(page.title + " : " + clean_string(page.summary))

                # Keep each section whose text is longer than the title
                for section in page.sections:
                    if len(section.text) > len(page.title):
                        extracted_texts.append(page.title + " : " + clean_string(section.text))

        except Exception as e:
            # Report with page_title: `page` may be unbound (or left over from
            # the previous iteration) if the lookup itself failed.
            print(f"Error processing page {page_title}: {e}")

    # Return the extracted texts
    return extracted_texts

In [8]:
# Wikipedia category names (without the "Category:" prefix) used as crawl
# starting points. Each name appears once so that no category is fetched twice.
categories = [
    "Sherlock_Holmes",
    "Arthur_Conan_Doyle",
    "A_Scandal_in_Bohemia",
    "The_Adventures_of_Sherlock_Holmes",
    "A_Study_in_Scarlet",
    "The_Sign_of_the_Four",
    "The_Memoirs_of_Sherlock_Holmes",
    "The_Hound_of_the_Baskervilles",
    "The_Return_of_Sherlock_Holmes",
    "The_Valley_of_Fear",
    "His_Last_Bow",
    "The_Case-Book_of_Sherlock_Holmes",
    "Canon_of_Sherlock_Holmes",
    "Dr._Watson",
    "221B_Baker_Street",
    "Mrs._Hudson",
    "Professor_Moriarty",
    "The_Strand_Magazine",
    "Minor_Sherlock_Holmes_characters",
    "Inspector_Lestrade",
    "Mycroft_Holmes",
    "Irene_Adler",
    "Colonel_Moran",
    "Baker_Street_Irregulars",
    "Giant_rat_of_Sumatra",
    "The_Story_of_the_Lost_Special",
    "How_Watson_Learned_the_Trick",
    "Diogenes_Club",
    "The_Dynamics_of_an_Asteroid",
    "Reichenbach_Falls",
    "A_Treatise_on_the_Binomial_Theorem",
    "Sherlockian_game",
    "List_of_Holmesian_studies",
    "The_New_Annotated_Sherlock_Holmes",
    "The_Private_Life_of_Sherlock_Holmes_(book)",
    "The_Great_Detective_(book)",
    "Naked_Is_the_Best_Disguise",
    "Sherlock_Holmes_fandom",
    "Sherlockiana",
    "Sherlock_Holmes_Museum",
    "The_Sherlock_Holmes",
    "The_Baker_Street_Irregulars",
    "The_Baker_Street_Journal",
    "Sidney_Paget",
    "Undershaw",
    "Adaptations_of_Sherlock_Holmes",
    "Sherlock_Holmes_pastiches",
    "Popular_culture_references_to_Sherlock_Holmes",
]

# In demo mode only the main category is crawled, to keep the run short.
if DEMO:
    categories = ["Sherlock_Holmes"]

extracted_texts = get_wikipedia_pages(categories)
# Note: the number printed is the length of the list returned by
# get_wikipedia_pages, i.e. one entry per extracted summary/section text.
print("Found", len(extracted_texts), "Wikipedia pages")

- Processing Wikipedia categories:
	Exploring Sherlock_Holmes on Wikipedia
	Exploring Writers of Sherlock Holmes pastiches on Wikipedia
	Exploring Works based on Sherlock Holmes on Wikipedia
	Exploring Sherlock Holmes short story collections on Wikipedia
	Exploring Sherlock Holmes short stories on Wikipedia
	Exploring Sherlock Holmes audio adaptations on Wikipedia
	Exploring Sherlock Holmes scholars on Wikipedia
	Exploring Sherlock Holmes novels on Wikipedia
	Exploring Sherlock Holmes navigational boxes on Wikipedia
	Exploring Sherlock Holmes lists on Wikipedia
	Exploring Dartmoor on Wikipedia
	Exploring Sherlock Holmes characters on Wikipedia
	Exploring Baker Street on Wikipedia
- Processing Wikipedia pages:


100%|██████████| 459/459 [02:23<00:00,  3.20it/s]

Found 2042 Wikipedia pages





In [9]:
# Write each extracted text to its own file, data/output/sherlock_<k>.txt.
output_dir = 'data/output'
os.makedirs(output_dir, exist_ok=True)

for k, text in enumerate(extracted_texts):
    file_path = os.path.join(output_dir, f'sherlock_{k}.txt')
    # Explicit UTF-8: the texts contain non-ASCII characters and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(text)
    # In demo mode stop after index 10 (the check runs after the write, so
    # files 0 through 10 are saved).
    if DEMO and k > 9:
        break

print("All texts have been saved successfully.")

All texts have been saved successfully.


In [10]:
# Paths of the saved text files. Only ".txt" entries are kept and the list is
# sorted, so stray files in the folder cannot inflate the count that later
# cells rely on, and the processing order is the same on every run.
filenames = sorted(
    f"data/output/{file}"
    for file in os.listdir("data/output")
    if file.endswith(".txt")
)

In [None]:
# Create the SyntheticDataKit generator for the chosen instruct model.
# The recorded output of this cell shows a vLLM API server being started.
generator = SyntheticDataKit.from_pretrained(
    # Choose any model from https://huggingface.co/unsloth
    model_name = "unsloth/Llama-3.2-3B-Instruct",
    max_seq_length = 2048, # Longer sequence lengths will be slower!
)

config.json:   0%|          | 0.00/890 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

INFO 09-25 08:05:30 [vllm_utils.py:688] Unsloth: Patching vLLM v1 graph capture
INFO 09-25 08:05:30 [vllm_utils.py:716] Unsloth: Patching vLLM v0 graph capture
Unsloth: Using dtype = torch.float16 for vLLM.
Unsloth: vLLM loading unsloth/Llama-3.2-3B-Instruct with actual GPU utilization = 89.39%
Unsloth: Your GPU has CUDA compute capability 7.5 with VRAM = 14.74 GB.
Unsloth: Using conservativeness = 1.0. Chunked prefill tokens = 2048. Num Sequences = 192.
Unsloth: vLLM's KV Cache can use up to 7.19 GB. Also swap space = 0 GB.
vLLM STDOUT: INFO 09-25 08:05:50 [__init__.py:244] Automatically detected platform cuda.
vLLM STDOUT: INFO 09-25 08:06:00 [api_server.py:1395] vLLM API server version 0.9.2
vLLM STDOUT: INFO 09-25 08:06:00 [cli_args.py:325] non-default args: {'model': 'unsloth/Llama-3.2-3B-Instruct', 'dtype': 'float16', 'seed': 0, 'max_model_len': 2048, 'max_logprobs': 0, 'gpu_memory_utilization': 0.8938626454842437, 'swap_space': 0.0, 'enable_prefix_caching': True, 'max_num_batche

In [None]:
# Configure question/answer generation.
# NOTE(review): presumably this also produces the synthetic_data_kit_config.yaml
# file that the CLI cells below pass with -c; confirm against the Unsloth docs.
generator.prepare_qa_generation(
    output_folder = "data", # Output location of synthetic data
    temperature = 0.7, # Higher temp makes more diverse datasets
    top_p = 0.95,
    overlap = 64, # Overlap portion during chunking
    max_generation_tokens = 512, # Can increase for longer QA pairs
)

In [None]:
# Run the synthetic-data-kit CLI's system-check command before generating data.
!synthetic-data-kit system-check

In [None]:
# Process chunks: call the synthetic-data-kit CLI once per saved text file,
# requesting 25 pairs of type "qa" for each one.
for filename in filenames:
    !synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        create {filename} \
        --num-pairs 25 \
        --type "qa"
    time.sleep(2) # Sleep some time to leave some room for processing

In [None]:
# Optional quality pass: run the CLI's curate command (threshold 5.0) on every
# generated QA file. The file names assume the inputs were named
# sherlock_0 ... sherlock_<n-1>, with n = len(filenames).
# NOTE(review): the save-as cell that follows reads the same data/generated
# paths; confirm whether the curated output is meant to be used there instead.
QUALITY_CHECK = True

if QUALITY_CHECK:
    qa_pairs_filenames = [
        f"data/generated/sherlock_{i}_qa_pairs.json"
        for i in range(len(filenames))
    ]
    for filename in qa_pairs_filenames:
        !synthetic-data-kit \
            -c synthetic_data_kit_config.yaml \
            curate --threshold 5.0 \
            {filename}

In [None]:
# Convert every generated QA file with the CLI's save-as command, using the
# "ft" format. The file names assume the inputs were named
# sherlock_0 ... sherlock_<n-1>, with n = len(filenames).
qa_pairs_filenames = [
    f"data/generated/sherlock_{i}_qa_pairs.json"
    for i in range(len(filenames))
]
for filename in qa_pairs_filenames:
    !synthetic-data-kit \
        -c synthetic_data_kit_config.yaml \
        save-as {filename} -f ft

In [None]:
# Load every JSON file found in data/final and stack the rows into a single
# DataFrame with a fresh 0..n-1 index.
final_filenames = os.listdir("data/final")

final_frames = []
for name in final_filenames:
    final_frames.append(pd.read_json(f"data/final/{name}"))

conversations = pd.concat(final_frames).reset_index(drop=True)

In [None]:
# Flatten the "content" field of every message in every conversation into one
# list, then count how often each distinct text occurs.
all_contents = [
    message["content"]
    for conversation in conversations["messages"]
    for message in conversation
]

content_counts = Counter(all_contents)

# (text, count) pairs ordered from most to least frequent
most_common_content = content_counts.most_common()

In [None]:
# Show the 50 most frequent message texts with their counts.
print(most_common_content[:50])

In [None]:
# Convert the combined conversations DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(conversations)

In [None]:
# Wrap the dataset in a DatasetDict whose only split is 'train'.
final_dataset = DatasetDict({'train': dataset})

# Summary of the DatasetDict, printed on the line after the heading
print("\nFinal Hugging Face Dataset object:", final_dataset, sep="\n")

# First training example, printed the same way for inspection
first_training_example = final_dataset['train'][0]
print("\nExample from the training set:", first_training_example, sep="\n")

In [None]:
# Best-effort Hugging Face login using a token stored in Colab's secrets
# manager. Nothing here raises: outside Colab the login is skipped with a
# message, and any failure while reading the secret or logging in is reported
# instead of being silently swallowed.
try:
    from huggingface_hub import login
    from google.colab import userdata
except ImportError:
    print("Skipping Colab-based Hugging Face login: google.colab or huggingface_hub is not available.")
else:
    try:
        # Retrieve your Hugging Face token from Colab's secrets manager
        # The name 'HF_TOKEN' should match the name you used in the secrets tab
        hf_token = userdata.get('HF_TOKEN')

        # Check if the token was successfully retrieved
        if hf_token:
            # Log in to Hugging Face using the retrieved token
            # The `add_to_git_credential=True` argument is optional and useful if you plan to push models to the Hub
            login(token=hf_token, add_to_git_credential=True)
            print("Hugging Face login successful using Google Colab secrets!")
        else:
            print("Error: HF_TOKEN not found in Google Colab secrets or is empty.")
            print("Please ensure you have created a secret named 'HF_TOKEN' in the 'Secrets' tab (🔑) on the left sidebar.")
    except Exception as exc:
        # Covers a missing or inaccessible secret as well as a rejected token.
        print(f"Hugging Face login failed: {exc}")

In [None]:
# Upload the dataset to the Hugging Face Hub; skipped when DEMO is True.
if not DEMO:

    # Target repository for the final_dataset object built in an earlier cell
    repo_id = "lmassaron/Sherlock_QA"
    print(f"\nUploading dataset to the Hub at {repo_id}...")

    # This command uploads the dataset. It will create the repo if it doesn't exist.
    final_dataset.push_to_hub(repo_id)
    print("Upload complete!")