# Data Exploration for E-commerce LLM Training

This notebook explores the datasets used for training our e-commerce LLM system:
- **ECInstruct**: Multi-task e-commerce instruction dataset
- **Alpaca**: General instruction-following data for preventing catastrophic forgetting

We'll analyze:
1. Task distribution across classification, extraction, and Q&A
2. Category distributions
3. Sequence lengths for optimal `max_seq_length` selection
4. Example prompts for each task type

In [None]:
# Install required packages if needed
# (use %pip rather than !pip so the packages are installed into this kernel's environment)
# %pip install datasets transformers pandas matplotlib seaborn tqdm

In [None]:
import json
import random
import re
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer

# Set plotting style
# The matplotlib style is applied first and the seaborn palette second, so the
# palette is set after the style sheet has been loaded.
# NOTE(review): the 'seaborn-v0_8-*' style names are only available in newer
# matplotlib releases — confirm the environment's matplotlib version.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("Libraries loaded successfully!")

## 1. Load ECInstruct Dataset

ECInstruct contains 116K multi-task e-commerce examples covering:
- Product classification
- Attribute extraction
- Question answering
- Other e-commerce tasks (e.g., sentiment analysis and recommendation)

In [None]:
# Load ECInstruct dataset
# Requests the "train" split of "NingLab/ECInstruct" through `datasets.load_dataset`;
# the resulting `ecinstruct` object is reused by every later analysis cell.
print("Loading ECInstruct dataset...")
ecinstruct = load_dataset("NingLab/ECInstruct", split="train")

# Report the number of rows and the column schema of the loaded split
print(f"\nDataset size: {len(ecinstruct):,} examples")
print(f"Features: {ecinstruct.features}")

In [None]:
# Preview the first three ECInstruct records, one field per line
print("Sample Examples from ECInstruct:")
print("=" * 80)

for number in (1, 2, 3):
    record = ecinstruct[number - 1]
    print(f"\n--- Example {number} ---")
    for field, raw_value in record.items():
        text = str(raw_value)
        # Keep the preview readable: cut values longer than 500 characters
        shown = f"{text[:500]}..." if len(text) > 500 else text
        print(f"{field}: {shown}")
    print()

## 2. Analyze Task Distribution

Our e-commerce LLM handles three primary tasks:
- **[CLASSIFY]**: Hierarchical product categorization
- **[EXTRACT]**: Attribute-value extraction as JSON
- **[QA]**: Product question answering

In [None]:
# Ordered (task label, keywords) rules; the first rule that matches wins, so the
# three primary tasks take precedence over the secondary e-commerce tasks.
_TASK_RULES = [
    ('Classification', ['classify', 'category', 'categorize', 'product type']),
    ('Extraction', ['extract', 'attribute', 'specification', 'feature']),
    ('Q&A', ['question', 'answer', 'what', 'how', 'why', 'does', 'is it']),
    ('Sentiment', ['review', 'sentiment', 'rating']),
    ('Recommendation', ['similar', 'recommend', 'substitute']),
]

# One pre-compiled pattern per task. Keywords are anchored at the START of a
# word (leading \b only) so that suffixes still match ("attributes", "reviews")
# while accidental mid-word hits do not ("how" in "show", "is it" in
# "this item", "rating" in "generating").
_TASK_PATTERNS = [
    (task, re.compile(r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')'))
    for task, keywords in _TASK_RULES
]


def categorize_task(instruction):
    """Categorize an instruction into a task type based on keywords.

    Args:
        instruction: The instruction text. ``None``, empty strings and
            non-string values are accepted and categorized as ``'Other'``.

    Returns:
        One of ``'Classification'``, ``'Extraction'``, ``'Q&A'``,
        ``'Sentiment'``, ``'Recommendation'`` or ``'Other'``. Rules are checked
        in that order and the first match is returned.
    """
    # Missing or non-text instructions cannot be keyword-matched
    if not isinstance(instruction, str) or not instruction:
        return 'Other'

    instruction_lower = instruction.lower()
    for task, pattern in _TASK_PATTERNS:
        if pattern.search(instruction_lower):
            return task

    return 'Other'

# Tally the keyword-based task label of every ECInstruct example
print("Analyzing task distribution...")
task_counts = Counter(
    # Fall back to the 'input' field when the instruction is missing or empty
    categorize_task(example.get('instruction', '') or example.get('input', ''))
    for example in tqdm(ecinstruct, desc="Categorizing tasks")
)

total_examples = len(ecinstruct)
print("\nTask Distribution:")
for task, count in task_counts.most_common():
    print(f"  {task}: {count:,} ({count / total_examples * 100:.1f}%)")

In [None]:
# Visualize task distribution as a pie chart (shares) and a bar chart (counts)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

tasks = list(task_counts.keys())
counts = list(task_counts.values())

# Assign one fixed color per task so both panels use the same color for the
# same task, regardless of the order in which the task is drawn.
colors = sns.color_palette('husl', len(tasks))
task_colors = dict(zip(tasks, colors))

# Pie chart
axes[0].pie(counts, labels=tasks, autopct='%1.1f%%', colors=colors, startangle=90)
axes[0].set_title('Task Distribution in ECInstruct', fontsize=14, fontweight='bold')

# Bar chart (sorted by count, smallest at the bottom)
df_tasks = pd.DataFrame({'Task': tasks, 'Count': counts})
df_tasks = df_tasks.sort_values('Count', ascending=True)

# Look the colors up by task name: sorting changed the row order, so passing
# `colors` directly would pair each bar with another task's color.
bar_colors = [task_colors[task_name] for task_name in df_tasks['Task']]
bars = axes[1].barh(df_tasks['Task'], df_tasks['Count'], color=bar_colors)
axes[1].set_xlabel('Number of Examples')
axes[1].set_title('Task Distribution (Counts)', fontsize=14, fontweight='bold')

# Add count labels on bars
for bar, count in zip(bars, df_tasks['Count']):
    axes[1].text(count + 100, bar.get_y() + bar.get_height()/2,
                 f'{count:,}', va='center', fontsize=10)

plt.tight_layout()
plt.savefig('task_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Example Prompts for Each Task Type

Let's examine specific examples of each task type to understand the prompt format.

In [None]:
def find_examples_by_task(dataset, task_type, n=2):
    """Find up to n examples of a specific task type.

    Args:
        dataset: Iterable of dict-like examples; each is categorized from its
            'instruction' field, falling back to 'input' when that is empty.
        task_type: Task label to look for, as returned by `categorize_task`.
        n: Maximum number of examples to return. Values <= 0 yield an empty
            list.

    Returns:
        A list with the first matching examples in dataset order (at most n).
        Iteration stops as soon as n matches have been collected.
    """
    # Without this guard the loop would append the first match before checking
    # the limit and return one example even when none were requested.
    if n <= 0:
        return []

    examples = []
    for example in dataset:
        instruction = example.get('instruction', '') or example.get('input', '')
        if categorize_task(instruction) != task_type:
            continue
        examples.append(example)
        if len(examples) >= n:
            break
    return examples

def display_example(example, task_type):
    """Print one example under a task-type banner.

    The instruction and output are always printed; the input section is
    printed only when it is non-empty and not the 'N/A' placeholder. Each
    section is cut to its first 1000 characters.
    """
    rule = '=' * 80
    print(f"\n{rule}")
    print(f"TASK TYPE: {task_type}")
    print(rule)

    # Missing keys fall back to the 'N/A' placeholder
    instruction, input_text, output = (
        example.get(field, 'N/A') for field in ('instruction', 'input', 'output')
    )

    print(f"\n[INSTRUCTION]\n{instruction[:1000]}")
    has_input = bool(input_text) and input_text != 'N/A'
    if has_input:
        print(f"\n[INPUT]\n{input_text[:1000]}")
    print(f"\n[OUTPUT]\n{output[:1000]}")

In [None]:
# Classification: section banner, then the first two matching examples
print("\n" + "#"*80, "# CLASSIFICATION EXAMPLES", "#"*80, sep="\n")

classification_examples = find_examples_by_task(ecinstruct, 'Classification', n=2)
for classification_example in classification_examples:
    display_example(classification_example, 'Classification')

In [None]:
# Extraction: section banner, then the first two matching examples
print("\n" + "#"*80, "# EXTRACTION EXAMPLES", "#"*80, sep="\n")

extraction_examples = find_examples_by_task(ecinstruct, 'Extraction', n=2)
for extraction_example in extraction_examples:
    display_example(extraction_example, 'Extraction')

In [None]:
# Q&A: section banner, then the first two matching examples
print("\n" + "#"*80, "# QUESTION & ANSWERING EXAMPLES", "#"*80, sep="\n")

qa_examples = find_examples_by_task(ecinstruct, 'Q&A', n=2)
for qa_example in qa_examples:
    display_example(qa_example, 'Q&A')

## 4. Analyze Sequence Lengths

Understanding sequence lengths is critical for:
- Setting `max_seq_length` during training
- Estimating VRAM requirements
- Avoiding truncation of important content

In [None]:
# Load tokenizer for sequence length analysis
# NOTE(review): this checkpoint may require Hugging Face authentication or
# license acceptance before it can be downloaded — confirm access.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")

# Reuse the end-of-sequence token for padding when the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Vocabulary size: {tokenizer.vocab_size:,}")
print(f"Model max length: {tokenizer.model_max_length:,}")

In [None]:
def format_prompt(example):
    """Render an example as a single training prompt string.

    The prompt is made of '### Instruction:', an optional '### Input:' (only
    when the example has a non-empty input) and '### Response:' sections,
    separated by blank lines. Missing fields default to the empty string.
    """
    instruction = example.get('instruction', '')
    input_text = example.get('input', '')
    output = example.get('output', '')

    sections = [f"### Instruction:\n{instruction}"]
    if input_text:
        sections.append(f"### Input:\n{input_text}")
    sections.append(f"### Response:\n{output}")

    return "\n\n".join(sections)

# Calculate sequence lengths for a random sample of the dataset
# Fixed seed so the sampled rows (and the statistics below) are reproducible
SAMPLE_SEED = 42

sample_size = min(10000, len(ecinstruct))
print(f"Calculating sequence lengths (sampling {sample_size:,} examples)...")

# Draw a uniform random sample instead of taking the first `sample_size` rows:
# the head of a dataset is not necessarily representative (for example when
# rows are grouped by task). Indices are sorted to keep access sequential.
sample_indices = sorted(
    random.Random(SAMPLE_SEED).sample(range(len(ecinstruct)), sample_size)
)

sequence_lengths = []  # tokens in the full formatted prompt (incl. response)
input_lengths = []     # tokens in instruction + input only
output_lengths = []    # tokens in the response only

for i in tqdm(sample_indices, desc="Tokenizing"):
    example = ecinstruct[i]

    # Full prompt length
    full_prompt = format_prompt(example)
    tokens = tokenizer.encode(full_prompt, add_special_tokens=True)
    sequence_lengths.append(len(tokens))

    # Input only (instruction + input)
    instruction = example.get('instruction', '')
    input_text = example.get('input', '')
    input_prompt = f"{instruction} {input_text}".strip()
    input_tokens = tokenizer.encode(input_prompt, add_special_tokens=True)
    input_lengths.append(len(input_tokens))

    # Output only; special tokens are excluded so only response text is counted
    output = example.get('output', '')
    output_tokens = tokenizer.encode(output, add_special_tokens=False)
    output_lengths.append(len(output_tokens))

print(f"\nAnalyzed {len(sequence_lengths):,} examples")

In [None]:
# Sequence length statistics
def print_stats(lengths, name):
    """Print summary statistics for a collection of lengths.

    Prints min, max, mean, median, standard deviation and the 90th/95th/99th
    percentiles under a heading.

    Args:
        lengths: Sequence of numeric lengths. May be empty, in which case only
            the heading and a "no data" line are printed.
        name: Heading printed above the statistics.
    """
    arr = np.array(lengths)
    print(f"\n{name}:")

    # min()/max() raise on zero-size arrays, so report the empty case explicitly
    if arr.size == 0:
        print("  (no data)")
        return

    print(f"  Min: {arr.min():,}")
    print(f"  Max: {arr.max():,}")
    print(f"  Mean: {arr.mean():,.1f}")
    print(f"  Median: {np.median(arr):,.1f}")
    print(f"  Std: {arr.std():,.1f}")
    for percentile in (90, 95, 99):
        print(f"  {percentile}th percentile: {np.percentile(arr, percentile):,.0f}")

# Banner, then one statistics block per measured length series
print("="*60, "SEQUENCE LENGTH STATISTICS", "="*60, sep="\n")

for series, heading in (
    (sequence_lengths, "Full Sequence (Prompt + Response)"),
    (input_lengths, "Input Only (Instruction + Input)"),
    (output_lengths, "Output Only (Response)"),
):
    print_stats(series, heading)

In [None]:
# Visualize sequence length distributions: three histograms plus a cumulative curve
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# (axis, data, bar color, x-axis label, title, [(threshold, line color), ...])
histogram_panels = [
    (axes[0, 0], sequence_lengths, 'steelblue', 'Sequence Length (tokens)',
     'Full Sequence Length Distribution', [(2048, 'red'), (4096, 'orange')]),
    (axes[0, 1], input_lengths, 'forestgreen', 'Input Length (tokens)',
     'Input Length Distribution', [(1024, 'red')]),
    (axes[1, 0], output_lengths, 'coral', 'Output Length (tokens)',
     'Output Length Distribution', [(512, 'red')]),
]

for ax, data, bar_color, x_label, panel_title, thresholds in histogram_panels:
    ax.hist(data, bins=50, color=bar_color, edgecolor='black', alpha=0.7)
    # Dashed vertical markers at the candidate length limits
    for limit, line_color in thresholds:
        ax.axvline(x=limit, color=line_color, linestyle='--', linewidth=2, label=f'{limit} tokens')
    ax.set_xlabel(x_label)
    ax.set_ylabel('Frequency')
    ax.set_title(panel_title, fontweight='bold')
    ax.legend()

# Cumulative distribution for max_seq_length selection
cdf_ax = axes[1, 1]
sorted_lengths = np.sort(sequence_lengths)
cumulative = np.arange(1, len(sorted_lengths) + 1) / len(sorted_lengths)

cdf_ax.plot(sorted_lengths, cumulative, color='purple', linewidth=2)
# Dotted guides at the 95% and 99% coverage levels
for level in (0.95, 0.99):
    cdf_ax.axhline(y=level, color='gray', linestyle=':', alpha=0.7)
for limit, line_color in ((2048, 'red'), (4096, 'orange')):
    cdf_ax.axvline(x=limit, color=line_color, linestyle='--', linewidth=2, label=f'{limit} tokens')
cdf_ax.set_xlabel('Sequence Length (tokens)')
cdf_ax.set_ylabel('Cumulative Proportion')
cdf_ax.set_title('Cumulative Distribution (for max_seq_length selection)', fontweight='bold')
cdf_ax.legend()
cdf_ax.set_xlim(0, 6000)

plt.tight_layout()
plt.savefig('sequence_lengths.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Recommendation for max_seq_length
# Candidate limits to evaluate against the measured sequence lengths
max_lengths = [2048, 3072, 4096]

print("\n" + "="*60)
print("MAX_SEQ_LENGTH RECOMMENDATION")
print("="*60)

# Share of measured sequences (in %) that fit within each candidate limit
total_sequences = len(sequence_lengths)
coverage_by_length = {}

for max_len in max_lengths:
    covered = sum(1 for length in sequence_lengths if length <= max_len)
    coverage = covered / total_sequences * 100
    coverage_by_length[max_len] = coverage
    print(f"\nmax_seq_length = {max_len}:")
    print(f"  Coverage: {coverage:.1f}% of examples")
    print(f"  Truncated: {100-coverage:.1f}% of examples")

# Report the measured figures rather than fixed percentages, so the printed
# recommendation always agrees with the table above.
longest, shortest = max(max_lengths), min(max_lengths)

print("\n" + "-"*60)
print(f"RECOMMENDATION: Use max_seq_length={longest} "
      f"for {coverage_by_length[longest]:.1f}% coverage")
print(f"              or max_seq_length={shortest} for faster training "
      f"with {100 - coverage_by_length[shortest]:.1f}% truncation")
print("-"*60)

## 5. Load and Preview Alpaca Dataset

We mix ~10% general instruction data (Alpaca) to prevent catastrophic forgetting during fine-tuning.

In [None]:
# Load Alpaca dataset
# Requests the "train" split of "tatsu-lab/alpaca" through `datasets.load_dataset`;
# the resulting `alpaca` object is reused by the preview, mixing and task cells.
print("Loading Alpaca dataset...")
alpaca = load_dataset("tatsu-lab/alpaca", split="train")

# Report the number of rows and the column schema of the loaded split
print(f"\nAlpaca dataset size: {len(alpaca):,} examples")
print(f"Features: {alpaca.features}")

In [None]:
# Preview the first three Alpaca records with long fields shortened
print("\nSample Alpaca Examples:")
print("="*80)

# (label, record key, max characters shown, how the field's length is measured)
preview_fields = (
    ("Instruction", 'instruction', 200, len),
    ("Input", 'input', 200, lambda value: len(str(value))),
    ("Output", 'output', 300, len),
)

for position in range(3):
    example = alpaca[position]
    print(f"\n--- Example {position+1} ---")
    for label, key, limit, measure in preview_fields:
        value = example[key]
        if measure(value) > limit:
            # Too long: show the leading characters followed by an ellipsis
            print(f"{label}: {value[:limit]}...")
        else:
            print(f"{label}: {value}")

In [None]:
# Calculate mixing ratio
ecommerce_size = len(ecinstruct)
alpaca_size = len(alpaca)

# Target share of general (Alpaca) data in the final training mix
target_ratio = 0.10

# Solve alpaca / (ecommerce + alpaca) = target_ratio for the Alpaca count
alpaca_samples_wanted = int(ecommerce_size * target_ratio / (1 - target_ratio))
# Never plan for more Alpaca rows than the dataset actually contains
alpaca_samples_needed = min(alpaca_samples_wanted, alpaca_size)
total_training_examples = ecommerce_size + alpaca_samples_needed

print("\n" + "="*60)
print("DATASET MIXING STRATEGY")
print("="*60)
print(f"\nECInstruct examples: {ecommerce_size:,}")
print(f"Alpaca examples available: {alpaca_size:,}")
print(f"\nFor {target_ratio:.0%} general data mixing:")
print(f"  Alpaca samples needed: {alpaca_samples_needed:,}")
if alpaca_samples_needed < alpaca_samples_wanted:
    # The target share cannot be reached; make the shortfall visible
    print(f"  WARNING: {alpaca_samples_wanted:,} samples are required for the target ratio, "
          f"but only {alpaca_size:,} are available")
print(f"  Total training examples: {total_training_examples:,}")
print(f"  E-commerce ratio: {ecommerce_size / total_training_examples * 100:.1f}%")
print(f"  General ratio: {alpaca_samples_needed / total_training_examples * 100:.1f}%")

In [None]:
# Label every Alpaca instruction with a coarse task type via keyword rules
# Rules are checked top to bottom; the first rule with a matching keyword wins.
alpaca_task_rules = (
    ('Writing', ('write', 'compose', 'create', 'generate')),
    ('Explanation', ('explain', 'describe', 'what is', 'define')),
    ('Translation', ('translate', 'convert')),
    ('Summarization', ('summarize', 'summary')),
    ('Coding', ('code', 'program', 'function', 'script')),
    ('Math', ('math', 'calculate', 'solve')),
)

alpaca_tasks = []
for example in tqdm(alpaca, desc="Categorizing Alpaca tasks"):
    instruction = example['instruction'].lower()
    matched_label = next(
        (label for label, keywords in alpaca_task_rules
         if any(kw in instruction for kw in keywords)),
        'Other',  # no rule matched
    )
    alpaca_tasks.append(matched_label)

alpaca_task_counts = Counter(alpaca_tasks)

print("\nAlpaca Task Distribution:")
for task, count in alpaca_task_counts.most_common():
    print(f"  {task}: {count:,} ({count/len(alpaca)*100:.1f}%)")

## 6. Summary and Next Steps

### Key Findings:

1. **Task Distribution**: ECInstruct provides good coverage of classification, extraction, and Q&A tasks

2. **Sequence Lengths** (estimated from a sample of up to 10,000 examples):
   - Most examples fit within 2048 tokens
   - Recommend `max_seq_length=4096` for full coverage
   - Or `max_seq_length=2048` for faster training with minimal truncation

3. **Data Mixing**:
   - Mix ~10% Alpaca data to prevent catastrophic forgetting
   - This maintains general instruction-following capabilities

### Next Steps:
1. Proceed to `02_training_demo.ipynb` for QLoRA fine-tuning
2. Configure training with the recommended `max_seq_length`
3. Implement task-specific prompt formatting with [CLASSIFY], [EXTRACT], [QA] prefixes

In [None]:
# Persist the headline numbers so the training notebook can read them back
p95_length, p99_length = (
    int(np.percentile(sequence_lengths, q)) for q in (95, 99)
)

analysis_results = {
    'ecinstruct_size': len(ecinstruct),
    'alpaca_size': len(alpaca),
    'recommended_max_seq_length': 4096,
    'sequence_length_95_percentile': p95_length,
    'sequence_length_99_percentile': p99_length,
    'task_distribution': dict(task_counts),
    'alpaca_mix_ratio': 0.10
}

# Serialize once and reuse the text for both the file and the console echo
serialized_results = json.dumps(analysis_results, indent=2)

with open('data_exploration_results.json', 'w') as f:
    f.write(serialized_results)

print("Analysis results saved to data_exploration_results.json")
print(serialized_results)