# Dataset Exploration

This notebook demonstrates how to explore and prepare datasets for LLMForge fine-tuning.

In [None]:
# Install dependencies
!pip install datasets pandas matplotlib

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from datasets import load_dataset

# Placeholder path for your own dataset file (a .jsonl path by default).
# Replace with your dataset path
# NOTE(review): none of the cells visible in this notebook read DATASET_PATH;
# they use "sample_dataset.jsonl" directly. Confirm whether it should be wired in.
DATASET_PATH = "path/to/your/dataset.jsonl"

In [None]:
# Build a tiny instruction dataset to experiment with.
# Each record has three string fields: "instruction", "input" and "output".
# The "input" field is left empty for every example here.
qa_pairs = [
    (
        "What is machine learning?",
        "Machine learning is a subset of artificial intelligence that enables computers to learn from data without being explicitly programmed.",
    ),
    (
        "Explain the difference between supervised and unsupervised learning.",
        "Supervised learning uses labeled data to train models, while unsupervised learning finds patterns in unlabeled data.",
    ),
    (
        "Write a Python function to calculate factorial.",
        "def factorial(n):\n    if n <= 1:\n        return 1\n    return n * factorial(n - 1)",
    ),
]
sample_data = [
    {"instruction": question, "input": "", "output": answer}
    for question, answer in qa_pairs
]

# Write the records as JSON Lines: one JSON object per line.
with open("sample_dataset.jsonl", "w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in sample_data)

print(f"Created sample dataset with {len(sample_data)} examples")

In [None]:
# Read the JSON Lines file written above into a dataset ("train" split),
# then report its size, its column names and the first record.
dataset = load_dataset(
    "json",
    data_files="sample_dataset.jsonl",
    split="train",
)
print(
    f"Dataset size: {len(dataset)}",
    f"Columns: {dataset.column_names}",
    f"\nSample:\n{dataset[0]}",
    sep="\n",
)

In [None]:
# Analyze text lengths (in characters) of each field and of the whole example.
df = pd.DataFrame(dataset)
df["instruction_len"] = df["instruction"].str.len()
df["output_len"] = df["output"].str.len()

# The "input" field is part of each example, so it must count toward the
# total. Treat a missing column or null values as length 0 so the total
# stays defined for every row.
if "input" in df.columns:
    df["input_len"] = df["input"].str.len().fillna(0).astype(int)
else:
    df["input_len"] = 0

df["total_len"] = df["instruction_len"] + df["input_len"] + df["output_len"]

print("Text Length Statistics:")
print(df[["instruction_len", "input_len", "output_len", "total_len"]].describe())

In [None]:
# Plot length distributions: one histogram panel per length column,
# laid out side by side.
# Each entry is (DataFrame column, x-axis label, panel title).
panels = [
    ("instruction_len", "Instruction Length", "Instruction Length Distribution"),
    ("output_len", "Output Length", "Output Length Distribution"),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

for ax, (column, xlabel, title) in zip(axes, panels):
    ax.hist(df[column], bins=30, edgecolor="black")
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")
    ax.set_title(title)

plt.tight_layout()
plt.show()