# Wolof NLP - Evaluation

This notebook evaluates the tokenizer on a gold standard corpus of 542 annotated Wolof sentences.

In [None]:
import json
import re
import sys
from collections import Counter

sys.path.insert(0, '../src')

from wolof_nlp import WolofTokenizer

print("Loading gold standard...")
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode or reject non-ASCII characters.
with open('../data/gold_standard.json', encoding='utf-8') as f:
    gold = json.load(f)

# Clean gold standard: drop empty / whitespace-only tokens and lowercase the
# rest so comparisons against lowercased tokenizer output are case-insensitive.
for sent in gold:
    sent['tokens'] = [t.lower() for t in sent['tokens'] if t.strip()]

print(f"Loaded {len(gold)} sentences")

## 1. Gold Standard Overview

The gold standard contains sentences from Senegalese YouTube comments, categorized by difficulty.

In [None]:
# Tally how many gold sentences carry each category label
categories = {}
for entry in gold:
    label = entry.get('category', 'unknown')
    if label in categories:
        categories[label] += 1
    else:
        categories[label] = 1

print("Category distribution:")
for label in sorted(categories):
    n_in_cat = categories[label]
    share = n_in_cat / len(gold) * 100
    print(f"  {label:<15} {n_in_cat:>4} sentences ({share:.1f}%)")

# Show the first few sentences next to their gold tokenization
print("\nSample sentences:")
for example in gold[:3]:
    raw_text = example['text']
    gold_tokens = example['tokens']
    print(f"  {raw_text:<30} -> {gold_tokens}")

## 2. Evaluation Function

In [None]:
def evaluate_tokenizer(tokenize_func, name, dataset=None, max_errors=20):
    """Evaluate a tokenizer against a gold standard.

    Token-level precision/recall are computed per sentence as a multiset
    (bag-of-tokens) overlap, so a token predicted several times is only
    credited as often as it occurs in the gold tokens.

    Args:
        tokenize_func: Callable taking a sentence string and returning a
            list of token strings.
        name: Label stored under 'name' in the result.
        dataset: List of dicts with 'text' and 'tokens' keys (and an optional
            'category'). Defaults to the notebook-level ``gold``.
        max_errors: Maximum number of non-matching sentences kept in
            'errors' (default 20).

    Returns:
        dict with keys 'name', 'precision', 'recall', 'f1', 'exact_match',
        'total', 'errors' and 'failures' (number of sentences on which
        ``tokenize_func`` raised).
    """
    if dataset is None:
        dataset = gold

    exact_match = 0
    total_correct = 0
    total_pred = 0
    total_gold = 0
    failures = 0
    errors = []

    for sent in dataset:
        expected = sent['tokens']
        try:
            predicted = tokenize_func(sent['text'])
        except Exception:
            # Best effort: a crashing tokenizer is scored as having produced
            # no tokens, but the crash is counted so it is not invisible.
            failures += 1
            predicted = []

        total_gold += len(expected)
        total_pred += len(predicted)

        # Token-level matching: multiset intersection keeps, for each token,
        # the smaller of its predicted and expected counts.
        overlap = Counter(predicted) & Counter(expected)
        total_correct += sum(overlap.values())

        # Sentence-level exact match (same tokens in the same order)
        if predicted == expected:
            exact_match += 1
        elif len(errors) < max_errors:
            errors.append({
                'text': sent['text'],
                'expected': expected,
                'predicted': predicted,
                'category': sent.get('category', 'unknown')
            })

    precision = total_correct / total_pred if total_pred else 0
    recall = total_correct / total_gold if total_gold else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

    return {
        'name': name,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'exact_match': exact_match,
        'total': len(dataset),
        'errors': errors,
        'failures': failures
    }

## 3. Baseline Comparison

Compare Wolof NLP tokenizer against standard baselines.

In [None]:
# Wolof NLP tokenizer, built with normalize, detect_language and
# segment_attached all switched on
tokenizer = WolofTokenizer(
    segment_attached=True,
    detect_language=True,
    normalize=True,
)

def wolof_tokenize(text):
    """Return the lowercased text of each WORD or PUNCTUATION token in `text`."""
    kept_types = ('WORD', 'PUNCTUATION')
    lowered = []
    for token in tokenizer.tokenize(text):
        if token.type.name in kept_types:
            lowered.append(token.text.lower())
    return lowered

# Regex baseline (similar to NLTK RegexpTokenizer)
def regex_tokenize(text):
    """Split `text` into runs of word characters and single non-space symbols, lowercased."""
    pieces = []
    for match in re.finditer(r'\w+|[^\w\s]', text):
        pieces.append(match.group(0).lower())
    return pieces

# Whitespace baseline
def whitespace_tokenize(text):
    """Lowercase `text`, then split it on runs of whitespace."""
    lowered = text.lower()
    return lowered.split()

# Run every system through the same evaluation, Wolof NLP first
systems = [
    (wolof_tokenize, "Wolof NLP"),
    (regex_tokenize, "Regex (\\w+)"),
    (whitespace_tokenize, "Whitespace"),
]
results = [evaluate_tokenizer(func, label) for func, label in systems]

# Print one table row per system
print(f"{'System':<20} {'Precision':>10} {'Recall':>10} {'F1':>10} {'Exact Match':>15}")
print("-" * 70)
for row in results:
    hits = row['exact_match']
    n_sents = row['total']
    em_str = f"{hits}/{n_sents} ({hits/n_sents*100:.1f}%)"
    print(f"{row['name']:<20} {row['precision']*100:>9.1f}% {row['recall']*100:>9.1f}% {row['f1']*100:>9.1f}% {em_str:>15}")

## 4. Results by Category

In [None]:
# Evaluate by category: sentence-level exact match of the Wolof NLP tokenizer
cat_results = {}
for sent in gold:
    cat = sent.get('category', 'unknown')
    stats = cat_results.setdefault(cat, {'correct': 0, 'total': 0})
    stats['total'] += 1

    expected = sent['tokens']
    try:
        predicted = wolof_tokenize(sent['text'])
    except Exception:
        # Same policy as evaluate_tokenizer: a sentence the tokenizer cannot
        # process is scored as an empty prediction instead of aborting the
        # whole table, so per-category counts stay consistent with the
        # overall exact-match figure.
        predicted = []

    if predicted == expected:
        stats['correct'] += 1

print(f"{'Category':<15} {'Correct':>10} {'Total':>10} {'Accuracy':>12}")
print("-" * 50)
for cat, data in sorted(cat_results.items()):
    # data['total'] is at least 1 for every category that appears here
    acc = data['correct'] / data['total'] * 100
    print(f"{cat:<15} {data['correct']:>10} {data['total']:>10} {acc:>11.1f}%")

## 5. Error Analysis

In [None]:
# Get errors from Wolof NLP (first entry of `results`)
wolof_result = results[0]
errors = wolof_result['errors']

# Only the first 10 collected errors are printed, so the header reports the
# number actually displayed rather than the size of the whole error list.
shown = errors[:10]
print(f"Sample errors ({len(shown)} shown):\n")
for err in shown:
    print(f"Text:     {err['text']}")
    print(f"Expected: {err['expected']}")
    print(f"Got:      {err['predicted']}")
    print(f"Category: {err['category']}")
    print()

In [None]:
# Categorize errors
# NOTE: `errors` is the capped sample collected by evaluate_tokenizer, so the
# percentages below describe that sample, not every mismatching sentence.

# Word-final suffixes treated as "verbal suffix" cases. Matching at the end of
# a word avoids flagging any sentence that merely contains "oo" mid-word.
VERBAL_SUFFIXES = ('oo', 'loo', 'uloo')

error_types = {
    'code_switch': 0,
    'punctuation': 0,
    'verbal_suffix': 0,
    'other': 0
}

for err in errors:
    words = re.findall(r'\w+', err['text'].lower())
    if err['category'] == 'code_switch':
        error_types['code_switch'] += 1
    elif err['category'] == 'punctuation':
        error_types['punctuation'] += 1
    elif any(word.endswith(VERBAL_SUFFIXES) for word in words):
        error_types['verbal_suffix'] += 1
    else:
        error_types['other'] += 1

total_errors = sum(error_types.values())
print("Error type distribution:")
# Most frequent error type first
for etype, count in sorted(error_types.items(), key=lambda x: -x[1]):
    pct = count / total_errors * 100 if total_errors else 0
    print(f"  {etype:<20} {count:>4} ({pct:.0f}%)")

## 6. Summary

The Wolof NLP tokenizer significantly outperforms generic baselines because it handles:

1. **Morpheme segmentation**: Splits TAM markers (e.g., `damay` → `da ma y`)
2. **Orthography normalization**: Handles spelling variations
3. **Language-aware processing**: Doesn't incorrectly segment French words

Main error sources:
- Code-switching boundaries
- Complex verbal suffixes (-oo, -loo)
- Punctuation handling edge cases