# Accents-PT-BR — Dataset Pipeline + HuggingFace Publication

**Projeto:** Controle Explícito de Sotaque Regional em pt-BR  
**Objetivo:** Construir o dataset derivado **Accents-PT-BR** (CORAA-MUPE + Common Voice PT),  
executar toda a pipeline de validação (confounds, splits) e publicar no HuggingFace Hub.  
**Config:** `configs/accent_classifier.yaml` (single source of truth).  

**Seções:**
1. Setup do ambiente
2. CORAA-MUPE manifest
3. Common Voice PT manifest
4. Dataset combinado Accents-PT-BR
5. Análise de confounds (accent × gender, duration, source)
6. Speaker-disjoint splits
7. Construção do HuggingFace Dataset
8. Dataset card + publicação no HuggingFace Hub

Este notebook é a **camada de orquestração**. Toda lógica está em `src/` (testável, auditável).  
O notebook apenas: instala deps → configura ambiente → chama módulos → publica resultado.

## 1. Setup do Ambiente

In [None]:
import os, subprocess, sys

# --- Platform-aware setup: works on Colab, Lightning.ai, and local ---
# Detection order: Lightning.ai -> Google Colab -> Local

# 1. Determine repo directory
_lightning_studio = '/teamspace/studios/this_studio'
if os.path.exists(_lightning_studio):
    REPO_DIR = os.path.join(_lightning_studio, 'TCC')
    _platform = 'lightning'
elif 'google.colab' in sys.modules or os.path.exists('/content'):
    REPO_DIR = '/content/TCC'
    _platform = 'colab'
else:
    # Locally the notebook is expected to run from inside the checkout.
    REPO_DIR = os.getcwd()
    _platform = 'local'

# 2. Clone repo if needed (idempotent)
if not os.path.exists(os.path.join(REPO_DIR, '.git')):
    if _platform == 'local':
        # On a local machine REPO_DIR is the user's current working directory.
        # Never delete it: refuse to continue if it holds unrelated files, and
        # only clone into it when it is empty.
        if os.listdir(REPO_DIR):
            raise RuntimeError(
                f'{REPO_DIR} is not a git checkout and is not empty; refusing '
                'to delete it. Run this notebook from the repository root, or '
                'from an empty directory to clone into.'
            )
    else:
        # Managed platforms use a dedicated path, so a missing or partial
        # clone can be wiped before cloning again.
        subprocess.run(['rm', '-rf', REPO_DIR], check=False)
    subprocess.run(
        ['git', 'clone', 'https://github.com/paulohenriquevn/tcc.git', REPO_DIR],
        check=True,
    )

# Make the checkout both the working directory (relative config paths) and
# importable (the `src` package).
os.chdir(REPO_DIR)
if REPO_DIR not in sys.path:
    sys.path.insert(0, REPO_DIR)

# 3. Install dependencies
subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt', '-q'], check=True)
subprocess.run([sys.executable, '-m', 'pip', 'install', 'huggingface_hub', '-q'], check=True)

# 4. NumPy ABI check — Colab pre-loads numpy 2.x in memory, but
#    requirements.txt pins 1.26.4. After pip downgrades, stale C-extensions
#    cause binary incompatibility. Fix: restart runtime ONCE.
# The installed version is read in a fresh interpreter so it reflects what is
# on disk rather than what this process has already imported.
_installed_np = subprocess.check_output(
    [sys.executable, '-c', 'import numpy; print(numpy.__version__)'],
    text=True,
).strip()

try:
    import numpy as _np
    _loaded_np = _np.__version__
except Exception:
    # Best effort: a broken in-memory numpy is treated as a mismatch below.
    _loaded_np = None

if _loaded_np != _installed_np:
    print(f'\nNumPy ABI mismatch: loaded={_loaded_np}, installed={_installed_np}')
    print('Restarting runtime... After restart, re-run this cell (no second restart).')
    # Signal 9 terminates the current interpreter process immediately.
    os.kill(os.getpid(), 9)
else:
    print(f'\nPlatform: {_platform}')
    print(f'Repo: {REPO_DIR}')
    print(f'Environment OK (numpy=={_installed_np})')

In [None]:
import sys, os, yaml, json, logging, hashlib
from pathlib import Path
from collections import Counter
from datetime import datetime

import torch
import numpy as np
import pandas as pd

# Platform-aware persistent cache setup
# Runs before the remaining project imports and before any dataset download.
from src.utils.platform import detect_platform, setup_environment

platform = detect_platform()
setup_environment(platform)

# Mount Google Drive only on Colab (Lightning.ai has persistent disk built-in)
if platform.needs_drive_mount:
    # Imported lazily: this module is only needed on the Drive-mount path.
    from google.colab import drive
    drive.mount('/content/drive')

# Project modules: every pipeline step used by this notebook lives in `src/`.
from src.utils.seed import set_global_seed
from src.data.manifest import (
    ManifestEntry, read_manifest, write_manifest,
    normalize_cv_accent, compute_file_hash,
)
from src.data.manifest_builder import build_manifest_from_hf_dataset
from src.data.cv_manifest_builder import build_manifest_from_common_voice
from src.data.combined_manifest import combine_manifests, analyze_source_distribution
from src.data.splits import (
    generate_speaker_disjoint_splits,
    save_splits,
    assign_entries_to_splits,
)
from src.analysis.confounds import run_all_confound_checks

# Load config — single source of truth for all experiment parameters
# The path is relative to the current working directory.
with open('configs/accent_classifier.yaml') as f:
    config = yaml.safe_load(f)

# NOTE(review): `generator` is presumably an RNG object returned by the seed
# helper — confirm against src.utils.seed.
SEED = config['seed']['global']
generator = set_global_seed(SEED)

logging.basicConfig(
    level=logging.INFO,
    format='%(name)s - %(levelname)s - %(message)s',
)

# Drive cache base directory — platform-aware
# All manifests and extracted audio below are stored under this directory.
DRIVE_BASE = platform.cache_base
DRIVE_BASE.mkdir(parents=True, exist_ok=True)

# Echo the effective settings so each run records its own configuration.
print(f'Platform: {platform.name}')
print(f'Config: {config["experiment"]["name"]}')
print(f'Seed: {SEED}')
print(f'Cache: {DRIVE_BASE}')

## 2. CORAA-MUPE Manifest

Download CORAA-MUPE-ASR from HuggingFace, apply filters, build manifest JSONL.  
**Filtros:** `speaker_type='R'`, duração 3–15s, `birth_state` → macro-região IBGE.  
Execuções subsequentes reutilizam o manifest em cache (Google Drive no Colab; disco persistente nas demais plataformas).

In [None]:
from datasets import load_dataset, concatenate_datasets

# Cache layout for the CORAA-MUPE source: extracted audio next to its manifest.
_coraa_root = DRIVE_BASE / 'coraa_mupe'
CORAA_AUDIO_DIR = _coraa_root / 'audio'
CORAA_MANIFEST_PATH = _coraa_root / 'manifest.jsonl'

if not CORAA_MANIFEST_PATH.exists():
    # Cold path: no cached manifest yet, so download the corpus and build one.
    print('Downloading CORAA-MUPE-ASR from HuggingFace...')
    print('(~42 GB na primeira vez)')

    ds = load_dataset('nilc-nlp/CORAA-MUPE-ASR')
    print(f'Splits: {list(ds.keys())}')

    # Upstream split boundaries are dropped here: the pipeline later creates
    # its own speaker-disjoint splits over the pooled data.
    all_data = concatenate_datasets([ds[name] for name in ds])
    print(f'Total concatenado: {len(all_data):,} rows')

    # All filter thresholds come from the YAML config.
    _coraa_filters = config['dataset']['filters']
    coraa_entries, coraa_stats = build_manifest_from_hf_dataset(
        dataset=all_data,
        audio_output_dir=CORAA_AUDIO_DIR,
        manifest_output_path=CORAA_MANIFEST_PATH,
        speaker_type_filter=_coraa_filters.get('speaker_type', 'R'),
        min_duration_s=_coraa_filters['min_duration_s'],
        max_duration_s=_coraa_filters['max_duration_s'],
        min_speakers_per_region=_coraa_filters['min_speakers_per_region'],
        min_utterances_per_speaker=_coraa_filters.get('min_utterances_per_speaker', 3),
    )
    coraa_sha256 = coraa_stats['manifest_sha256']
    print(f'CORAA-MUPE: {len(coraa_entries):,} entries, SHA-256: {coraa_sha256}')
else:
    # Warm path: reuse the manifest written by a previous run and hash the
    # file so the run can still report which manifest it used.
    print(f'Loading CORAA-MUPE manifest from cache: {CORAA_MANIFEST_PATH}')
    coraa_entries = read_manifest(CORAA_MANIFEST_PATH)
    coraa_sha256 = compute_file_hash(CORAA_MANIFEST_PATH)
    print(f'Loaded {len(coraa_entries):,} entries (SHA-256: {coraa_sha256[:16]}...)')

# Per-region utterance counts, printed in sorted label order.
region_counts = Counter(entry.accent for entry in coraa_entries)
print(f'\nCORAA-MUPE: {len(coraa_entries):,} entries')
print(f'Regions: {dict(sorted(region_counts.items()))}')

## 3. Common Voice PT Manifest

Common Voice Portuguese (v17.0): campo `accent` user-submitted, normalizado via `normalize_cv_accent()`.  
IDs prefixados com `cv_` para evitar colisões com CORAA-MUPE.

In [None]:
# Cache layout for the Common Voice source: extracted audio next to its manifest.
_cv_root = DRIVE_BASE / 'common_voice_pt'
CV_AUDIO_DIR = _cv_root / 'audio'
CV_MANIFEST_PATH = _cv_root / 'manifest.jsonl'

if not CV_MANIFEST_PATH.exists():
    # Cold path: load the corpus and build the manifest from scratch.
    print('Loading Common Voice PT from HuggingFace...')

    # The Common Voice source is read from the second entry of the config's
    # source list.
    _cv_source = config['dataset']['sources'][1]
    cv_hf_id = _cv_source['hf_id']
    cv_lang = _cv_source['hf_lang']

    cv_dataset = load_dataset(cv_hf_id, cv_lang, split='validated')
    print(f'Common Voice validated: {len(cv_dataset):,} rows')

    # Same duration / speaker thresholds as the CORAA-MUPE manifest.
    _cv_filters = config['dataset']['filters']
    cv_entries, cv_stats = build_manifest_from_common_voice(
        dataset=cv_dataset,
        audio_output_dir=CV_AUDIO_DIR,
        manifest_output_path=CV_MANIFEST_PATH,
        min_duration_s=_cv_filters['min_duration_s'],
        max_duration_s=_cv_filters['max_duration_s'],
        min_speakers_per_region=_cv_filters['min_speakers_per_region'],
        min_utterances_per_speaker=_cv_filters.get('min_utterances_per_speaker', 3),
    )
    cv_sha256 = cv_stats['manifest_sha256']
    print(f'Common Voice PT: {len(cv_entries):,} entries, SHA-256: {cv_sha256}')
else:
    # Warm path: reuse the manifest written by a previous run.
    print(f'Loading Common Voice PT manifest from cache: {CV_MANIFEST_PATH}')
    cv_entries = read_manifest(CV_MANIFEST_PATH)
    cv_sha256 = compute_file_hash(CV_MANIFEST_PATH)
    print(f'Loaded {len(cv_entries):,} entries (SHA-256: {cv_sha256[:16]}...)')

# Per-region utterance counts, printed in sorted label order.
region_counts_cv = Counter(entry.accent for entry in cv_entries)
print(f'\nCommon Voice PT: {len(cv_entries):,} entries')
print(f'Regions: {dict(sorted(region_counts_cv.items()))}')

## 4. Dataset Combinado Accents-PT-BR

Merge CORAA-MUPE + Common Voice: validação de colisões, consistência speaker→accent, filtros de região.

In [None]:
COMBINED_MANIFEST_PATH = DRIVE_BASE / 'accents_pt_br' / 'manifest.jsonl'

if not COMBINED_MANIFEST_PATH.exists():
    # Cold path: merge the two per-source manifests into the combined one.
    _merge_filters = config['dataset']['filters']
    _source_manifests = [
        (CORAA_MANIFEST_PATH, 'CORAA-MUPE'),
        (CV_MANIFEST_PATH, 'CommonVoice-PT'),
    ]
    combined_entries, combined_stats = combine_manifests(
        manifests=_source_manifests,
        output_path=COMBINED_MANIFEST_PATH,
        min_speakers_per_region=_merge_filters['min_speakers_per_region'],
        min_utterances_per_speaker=_merge_filters.get('min_utterances_per_speaker', 3),
    )
    combined_sha256 = combined_stats['manifest_sha256']
    print(f'Combined: {len(combined_entries):,} entries, SHA-256: {combined_sha256}')
else:
    # Warm path: reuse the combined manifest written by a previous run.
    print(f'Loading combined manifest from cache: {COMBINED_MANIFEST_PATH}')
    combined_entries = read_manifest(COMBINED_MANIFEST_PATH)
    combined_sha256 = compute_file_hash(COMBINED_MANIFEST_PATH)
    print(f'Loaded {len(combined_entries):,} entries (SHA-256: {combined_sha256[:16]}...)')

# Source distribution analysis
source_dist = analyze_source_distribution(combined_entries)

print(f'\n=== SOURCE DISTRIBUTION ===')
for source_name, accent_counts in source_dist['source_x_accent'].items():
    print(f'  {source_name}: {dict(sorted(accent_counts.items()))}')

# Warnings reported by the analysis are only printed when there are any.
_source_warnings = source_dist['warnings']
if _source_warnings:
    print(f'\nWARNINGS:')
    for warning in _source_warnings:
        print(f'  {warning}')

# Overall totals across both sources.
region_counts_all = Counter(entry.accent for entry in combined_entries)
total_speakers = len(set(entry.speaker_id for entry in combined_entries))
print(f'\nTotal: {len(combined_entries):,} entries, {total_speakers} speakers')
print(f'Regions: {dict(sorted(region_counts_all.items()))}')

## 5. Análise de Confounds

Checks obrigatórios: accent × gender (chi² + V de Cramér), accent × duration (Kruskal-Wallis), accent × source.

In [None]:
# Thresholds for every check come from the `confounds` section of the config.
_confound_cfg = config['confounds']
confound_results = run_all_confound_checks(
    combined_entries,
    gender_blocking_threshold=_confound_cfg['accent_x_gender']['threshold_blocker'],
    duration_practical_diff_s=_confound_cfg['accent_x_duration']['practical_diff_s'],
    check_snr=False,
    source_blocking_threshold=_confound_cfg['accent_x_source']['threshold_blocker'],
)

print('=== CONFOUND ANALYSIS ===')
blocking_found = False
confound_summary = []
for result in confound_results:
    # Severity ladder: blocking outranks significant, which outranks OK.
    if result.is_blocking:
        status = 'BLOCKING'
        blocking_found = True
    elif result.is_significant:
        status = 'SIGNIFICANT'
    else:
        status = 'OK'

    print(f'\n{result.variable_a} x {result.variable_b}: {status}')
    print(f'  {result.test_name}: stat={result.statistic:.4f}, p={result.p_value:.6f}')
    print(f'  {result.effect_size_name}={result.effect_size:.4f}')

    # Plain-dict copy of the result, reused later when rendering the card.
    summary_row = {
        'test': result.test_name,
        'variables': f'{result.variable_a} x {result.variable_b}',
        'statistic': result.statistic,
        'p_value': result.p_value,
        'effect_size': result.effect_size,
        'effect_size_name': result.effect_size_name,
        'is_blocking': result.is_blocking,
    }
    confound_summary.append(summary_row)

# Cross-tabulations
_accent_column = [entry.accent for entry in combined_entries]

gender_table = pd.crosstab(
    _accent_column,
    [entry.gender for entry in combined_entries],
    margins=True,
)
print('\n=== ACCENT x GENDER ===')
print(gender_table)

source_table = pd.crosstab(
    _accent_column,
    [entry.source for entry in combined_entries],
    margins=True,
)
print('\n=== ACCENT x SOURCE ===')
print(source_table)

# A blocking confound is reported but does not stop the notebook.
if not blocking_found:
    print('\nNo blocking confounds. Proceeding.')
else:
    print('\n*** BLOCKING CONFOUND DETECTED. Review before proceeding. ***')

## 6. Speaker-Disjoint Splits

**Obrigatório:** nenhum speaker aparece em mais de um split.  
A divisão é estratificada por sotaque, para que cada macro-região esteja representada em todos os splits.

In [None]:
# Ratios, seed and output location all come from the `splits` config section.
_split_cfg = config['splits']
_split_ratios = _split_cfg['ratios']
split_info = generate_speaker_disjoint_splits(
    combined_entries,
    train_ratio=_split_ratios['train'],
    val_ratio=_split_ratios['val'],
    test_ratio=_split_ratios['test'],
    seed=_split_cfg['seed'],
)

# Persist splits
split_output_dir = Path(_split_cfg['output_dir'])
split_path = save_splits(split_info, split_output_dir)
print(f'Splits saved: {split_path}')
print(f'Train: {len(split_info.train_speakers)} speakers, {split_info.utterances_per_split["train"]:,} utts')
print(f'Val:   {len(split_info.val_speakers)} speakers, {split_info.utterances_per_split["val"]:,} utts')
print(f'Test:  {len(split_info.test_speakers)} speakers, {split_info.utterances_per_split["test"]:,} utts')

# Assign entries to splits
split_entries = assign_entries_to_splits(combined_entries, split_info)
train_entries, val_entries, test_entries = (
    split_entries['train'],
    split_entries['val'],
    split_entries['test'],
)


def _speaker_ids(entries):
    """Return the set of distinct speaker ids found in `entries`."""
    return set(entry.speaker_id for entry in entries)


# Speaker-disjoint verification (HARD FAIL if violated)
train_spk = _speaker_ids(train_entries)
val_spk = _speaker_ids(val_entries)
test_spk = _speaker_ids(test_entries)

assert train_spk.isdisjoint(val_spk), 'Speaker leakage train -> val'
assert train_spk.isdisjoint(test_spk), 'Speaker leakage train -> test'
assert val_spk.isdisjoint(test_spk), 'Speaker leakage val -> test'
print('\nSpeaker-disjoint verification: PASSED')

# Per-split accent counts, printed in sorted label order.
for split_name, entries_list in split_entries.items():
    accent_dist = Counter(entry.accent for entry in entries_list)
    print(f'  {split_name}: {dict(sorted(accent_dist.items()))}')

## 7. Construção do HuggingFace Dataset

Converte as entries do manifest em um `datasets.DatasetDict` com:
- `Audio()` feature (decode automático, 16kHz)
- Metadados: `utt_id`, `speaker_id`, `accent`, `gender`, `duration_s`, `source`, `birth_state`, `text_id`
- Splits: `train`, `validation`, `test` (speaker-disjoint)

In [None]:
from datasets import Dataset, DatasetDict, Audio, Features, Value, ClassLabel

# Sorted, de-duplicated label vocabularies. Sorting keeps the label order —
# and therefore the ClassLabel integer ids built from it — stable for a given
# set of labels.
accent_labels = sorted(set(entry.accent for entry in combined_entries))
gender_labels = sorted(set(entry.gender for entry in combined_entries))
source_labels = sorted(set(entry.source for entry in combined_entries))

print(f'Accent classes: {accent_labels}')
print(f'Gender classes: {gender_labels}')
print(f'Source classes: {source_labels}')


def entries_to_hf_dict(entries: list) -> dict:
    """Pivot manifest entries into the column-oriented dict HF `Dataset` expects.

    Args:
        entries: Objects exposing `audio_path`, `utt_id`, `speaker_id`,
            `accent`, `gender`, `duration_s`, `source`, `birth_state` and
            `text_id` attributes.

    Returns:
        A dict mapping each column name to a list with one value per entry,
        in input order. The `audio` column holds each entry's `audio_path`,
        and a falsy `text_id` is replaced by an empty string.
    """
    # Output column -> entry attribute, in the order the columns are emitted.
    attribute_for_column = {
        'audio': 'audio_path',
        'utt_id': 'utt_id',
        'speaker_id': 'speaker_id',
        'accent': 'accent',
        'gender': 'gender',
        'duration_s': 'duration_s',
        'source': 'source',
        'birth_state': 'birth_state',
    }
    columns = {
        column: [getattr(item, attribute) for item in entries]
        for column, attribute in attribute_for_column.items()
    }
    # `text_id` is optional on an entry; the column itself is always a string.
    columns['text_id'] = [item.text_id or '' for item in entries]
    return columns


# Column schema shared by every split. Audio is declared at 16 kHz and the
# three categorical columns use the label vocabularies computed above.
_feature_spec = {
    'audio': Audio(sampling_rate=16_000),
    'utt_id': Value('string'),
    'speaker_id': Value('string'),
    'accent': ClassLabel(names=accent_labels),
    'gender': ClassLabel(names=gender_labels),
    'duration_s': Value('float32'),
    'source': ClassLabel(names=source_labels),
    'birth_state': Value('string'),
    'text_id': Value('string'),
}
features = Features(_feature_spec)

# Published split name -> manifest entries. The internal 'val' split is
# published as 'validation'.
_entries_by_hf_split = (
    ('train', train_entries),
    ('validation', val_entries),
    ('test', test_entries),
)
dataset_dict = DatasetDict({
    hf_split: Dataset.from_dict(entries_to_hf_dict(rows), features=features)
    for hf_split, rows in _entries_by_hf_split
})

print(f'\nDatasetDict criado:')
print(dataset_dict)
for split_name, ds in dataset_dict.items():
    print(f'  {split_name}: {len(ds)} rows, columns={ds.column_names}')

# Smoke test: fetch one training row and print its metadata and audio.
sample = dataset_dict['train'][0]
_sample_audio = sample["audio"]
print(f'\nSample (train[0]):')
print(f'  utt_id: {sample["utt_id"]}')
print(f'  speaker_id: {sample["speaker_id"]}')
print(f'  accent: {sample["accent"]}')
print(f'  gender: {sample["gender"]}')
print(f'  duration_s: {sample["duration_s"]:.2f}')
print(f'  source: {sample["source"]}')
print(f'  audio sample_rate: {_sample_audio["sampling_rate"]}')
print(f'  audio array shape: {np.array(_sample_audio["array"]).shape}')

## 8. Publicação no HuggingFace Hub

Autentica no HuggingFace, gera dataset card com estatísticas e publica.  

**IMPORTANTE:** O token precisa de permissão `write` no HuggingFace Hub.  
Gere um token em https://huggingface.co/settings/tokens.

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session against the HuggingFace Hub. The
# publishing cells below need a token with write permission.
# Login — will prompt for token if not already cached
notebook_login()

In [None]:
# --- Build dataset card content ---
# Computes every number and table fragment that the dataset card interpolates.

# Compute statistics for the card
total_entries = len(combined_entries)
total_speakers_count = len({e.speaker_id for e in combined_entries})
total_duration_h = sum(e.duration_s for e in combined_entries) / 3600

accent_stats = Counter(e.accent for e in combined_entries)
gender_stats = Counter(e.gender for e in combined_entries)
source_stats = Counter(e.source for e in combined_entries)

# Per-split stats
# `split_entries` uses the pipeline's internal split names ('train', 'val',
# 'test'), while the published DatasetDict and the dataset card use
# 'validation'. Key the stats by the published name so the card lookup
# `split_stats['validation']` resolves instead of raising KeyError.
_HF_SPLIT_NAME = {'val': 'validation'}
split_stats = {}
for name, entries in split_entries.items():
    split_stats[_HF_SPLIT_NAME.get(name, name)] = {
        'utterances': len(entries),
        'speakers': len({e.speaker_id for e in entries}),
        'duration_h': sum(e.duration_s for e in entries) / 3600,
        'accents': dict(sorted(Counter(e.accent for e in entries).items())),
    }

# Confound summary text: one markdown table row per confound check.
confound_lines = []
for cs in confound_summary:
    status = 'BLOCKING' if cs['is_blocking'] else 'OK'
    confound_lines.append(
        f"| {cs['variables']} | {cs['test']} | {cs['statistic']:.4f} | "
        f"{cs['p_value']:.6f} | {cs['effect_size_name']}={cs['effect_size']:.4f} | {status} |"
    )
confound_table = '\n'.join(confound_lines)

# Accent distribution table: utterances, share of total and distinct speakers.
accent_lines = []
for acc in sorted(accent_stats):
    n = accent_stats[acc]
    pct = n / total_entries * 100
    spk_count = len({e.speaker_id for e in combined_entries if e.accent == acc})
    accent_lines.append(f'| {acc} | {n:,} | {pct:.1f}% | {spk_count} |')
accent_table = '\n'.join(accent_lines)

# Get commit hash
# Best effort: provenance falls back to 'unknown' when git is missing or the
# working directory is not a checkout; other errors are not swallowed.
try:
    commit_hash = subprocess.check_output(['git', 'rev-parse', 'HEAD'], text=True).strip()
except (subprocess.CalledProcessError, OSError):
    commit_hash = 'unknown'

# Get manifest hash
manifest_sha = compute_file_hash(COMBINED_MANIFEST_PATH) if COMBINED_MANIFEST_PATH.exists() else 'N/A'

print(f'Total: {total_entries:,} utterances, {total_speakers_count} speakers, {total_duration_h:.1f}h')
print(f'Manifest SHA-256: {manifest_sha[:16]}...')
print(f'Commit: {commit_hash[:8]}...')

In [None]:
# --- Dataset Card (README.md) ---
# Renders the README published with the dataset: YAML front matter followed by
# markdown, with the statistics computed earlier interpolated in.


def _hf_size_category(n_rows: int) -> str:
    """Return the Hub `size_categories` bucket that contains `n_rows`."""
    upper_bounds = (
        (1_000, 'n<1K'),
        (10_000, '1K<n<10K'),
        (100_000, '10K<n<100K'),
        (1_000_000, '100K<n<1M'),
        (10_000_000, '1M<n<10M'),
    )
    for bound, label in upper_bounds:
        if n_rows < bound:
            return label
    return '10M<n<100M'


# The pipeline names its validation split 'val' while the published dataset
# calls it 'validation'; accept either key so the card always renders.
_validation_stats = (
    split_stats['validation'] if 'validation' in split_stats else split_stats['val']
)

DATASET_CARD = f"""---
language:
  - pt
license: cc-by-4.0
task_categories:
  - audio-classification
tags:
  - accent-classification
  - brazilian-portuguese
  - speech
  - regional-accent
  - ibge-macro-regions
  - tts-evaluation
size_categories:
  - {_hf_size_category(total_entries)}
---

# Accents-PT-BR

A curated, multi-source dataset of Brazilian Portuguese speech annotated with IBGE macro-region
accent labels. Designed for training and evaluating accent classifiers used as external evaluators
in accent-controllable TTS research.

## Dataset Description

**Accents-PT-BR** combines two complementary sources of Brazilian Portuguese speech:

| Source | Type | Accent label origin |
|--------|------|--------------------|
| [CORAA-MUPE-ASR](https://huggingface.co/datasets/nilc-nlp/CORAA-MUPE-ASR) | Professional interviews | `birth_state` field (verified) |
| [Common Voice PT](https://commonvoice.mozilla.org/pt) (v17.0) | Crowd-sourced read speech | User-submitted `accent` field (noisy) |

Accent labels are normalized to **IBGE macro-regions**: N (Norte), NE (Nordeste), CO (Centro-Oeste),
SE (Sudeste), S (Sul).

### Key Properties

- **Speaker-disjoint splits**: No speaker appears in more than one split (train/validation/test).
  This is critical for fair evaluation of accent classifiers.
- **Source-prefixed IDs**: CORAA-MUPE and Common Voice entries use distinct ID namespaces
  (`cv_` prefix for Common Voice) to prevent collisions.
- **Multi-source**: Enables cross-source evaluation to detect source confounds
  (classifier learning recording conditions instead of accent).
- **All audio at 16 kHz mono WAV**.

## Dataset Statistics

| Metric | Value |
|--------|-------|
| Total utterances | {total_entries:,} |
| Total speakers | {total_speakers_count} |
| Total duration | {total_duration_h:.1f} hours |
| Accent classes | {len(accent_stats)} (IBGE macro-regions) |
| Audio format | 16 kHz mono WAV |

### Accent Distribution

| Region | Utterances | % | Speakers |
|--------|-----------|---|----------|
{accent_table}

### Source Distribution

| Source | Utterances |
|--------|------------|
| CORAA-MUPE | {source_stats.get('CORAA-MUPE', 0):,} |
| CommonVoice-PT | {source_stats.get('CommonVoice-PT', 0):,} |

### Splits (Speaker-Disjoint)

| Split | Utterances | Speakers | Duration |
|-------|-----------|----------|----------|
| train | {split_stats['train']['utterances']:,} | {split_stats['train']['speakers']} | {split_stats['train']['duration_h']:.1f}h |
| validation | {_validation_stats['utterances']:,} | {_validation_stats['speakers']} | {_validation_stats['duration_h']:.1f}h |
| test | {split_stats['test']['utterances']:,} | {split_stats['test']['speakers']} | {split_stats['test']['duration_h']:.1f}h |

## Confound Analysis

Mandatory confound checks were run before publication:

| Variables | Test | Statistic | p-value | Effect size | Status |
|-----------|------|-----------|---------|-------------|--------|
{confound_table}

## Dataset Fields

| Field | Type | Description |
|-------|------|-------------|
| `audio` | Audio (16kHz) | Audio waveform |
| `utt_id` | string | Unique utterance identifier |
| `speaker_id` | string | Speaker identifier (unique per person, `cv_` prefix for Common Voice) |
| `accent` | ClassLabel | IBGE macro-region: {accent_labels} |
| `gender` | ClassLabel | Speaker gender: {gender_labels} |
| `duration_s` | float32 | Duration in seconds |
| `source` | ClassLabel | Source dataset: {source_labels} |
| `birth_state` | string | Original birth state / accent label from source |
| `text_id` | string | Transcription ID (if available) |

## Usage

```python
from datasets import load_dataset

ds = load_dataset("paulohenriquevn/accents-pt-br")

# `accent`, `gender` and `source` are ClassLabel features: rows store integer
# ids, so convert with int2str / str2int when working with label names.
accent = ds['train'].features['accent']

# Access a sample
sample = ds['train'][0]
print(accent.int2str(sample['accent']))  # e.g., 'SE'
print(sample['speaker_id'])  # e.g., 'coraa_spk123'
print(sample['audio'])       # {{'array': array([...]), 'sampling_rate': 16000}}

# Filter by accent
ne_id = accent.str2int('NE')
nordeste = ds['train'].filter(lambda x: x['accent'] == ne_id)
```

## Intended Use

This dataset is designed for:
1. **Training accent classifiers** for evaluating accent-controllable TTS systems.
2. **Cross-source generalization studies** (train on one source, test on another).
3. **Research on Brazilian Portuguese regional accent variation.**

It is NOT intended for:
- Speaker identification or re-identification.
- Commercial voice profiling.

## Limitations

- **Accent as proxy**: IBGE macro-regions are coarse. Intra-regional variation exists.
- **Common Voice labels are noisy**: User-submitted, not verified.
- **Source confound risk**: Different recording conditions between sources.
  Cross-source evaluation is recommended.
- **Class imbalance**: Some regions (N, CO) may have fewer speakers.

## Citation

If you use this dataset, please cite the underlying sources:

- **CORAA-MUPE**: Candido Jr. et al. (2023). CORAA: a large corpus of spontaneous and prepared speech.
- **Common Voice**: Ardila et al. (2020). Common Voice: A Massively-Multilingual Speech Corpus.

## Provenance

- **Manifest SHA-256**: `{manifest_sha}`
- **Pipeline commit**: `{commit_hash}`
- **Build date**: {datetime.now().strftime('%Y-%m-%d')}
- **Seed**: {SEED}
- **Config**: `configs/accent_classifier.yaml`
"""

print('Dataset card generated.')
print(f'Card length: {len(DATASET_CARD)} chars')

In [None]:
# --- Publish to HuggingFace Hub ---
# Pushes every split of `dataset_dict` to a public dataset repository.

HF_REPO_ID = 'paulohenriquevn/accents-pt-br'

# Row total across all splits, reported before the upload starts.
_total_rows = sum(map(len, dataset_dict.values()))

print(f'Publishing to: https://huggingface.co/datasets/{HF_REPO_ID}')
print(f'Splits: {list(dataset_dict.keys())}')
print(f'Total rows: {_total_rows:,}')
print()

dataset_dict.push_to_hub(HF_REPO_ID, private=False)

print(f'\nDataset uploaded successfully!')
print(f'URL: https://huggingface.co/datasets/{HF_REPO_ID}')

In [None]:
# --- Upload dataset card ---

from huggingface_hub import HfApi

api = HfApi()

# Stage the rendered card in a temporary file, then upload it as the
# repository README.
card_path = Path('/tmp/accents_pt_br_README.md')
with card_path.open('w', encoding='utf-8') as card_file:
    card_file.write(DATASET_CARD)

_upload_kwargs = {
    'path_or_fileobj': str(card_path),
    'path_in_repo': 'README.md',
    'repo_id': HF_REPO_ID,
    'repo_type': 'dataset',
}
api.upload_file(**_upload_kwargs)

# Final summary of what was published.
print(f'Dataset card uploaded to {HF_REPO_ID}')
print(f'\n=== PUBLICATION COMPLETE ===')
print(f'Dataset: https://huggingface.co/datasets/{HF_REPO_ID}')
print(f'Utterances: {total_entries:,}')
print(f'Speakers: {total_speakers_count}')
print(f'Duration: {total_duration_h:.1f}h')
print(f'Manifest SHA-256: {manifest_sha}')

In [None]:
# --- Verification: load back from Hub ---
# Reloads the published dataset and hard-fails if it does not match the local
# splits or if a sample cannot be decoded.

print(f'Verifying: loading {HF_REPO_ID} from Hub...')
ds_verify = load_dataset(HF_REPO_ID)

print(f'\nLoaded successfully:')
print(ds_verify)

for split_name, split_ds in ds_verify.items():
    print(f'  {split_name}: {len(split_ds)} rows')

# Verify per-split row counts match the local splits
mismatched_splits = []
for split_name in ['train', 'validation', 'test']:
    # Local splits use 'val' where the published dataset uses 'validation'.
    local_count = len(split_entries[split_name.replace('validation', 'val')])
    remote_count = len(ds_verify[split_name])
    match = 'OK' if local_count == remote_count else 'MISMATCH'
    if local_count != remote_count:
        mismatched_splits.append(split_name)
    print(f'  {split_name}: local={local_count}, remote={remote_count} [{match}]')

# Fail after printing every split so the full picture is visible, and before
# the success banner below can be reached.
assert not mismatched_splits, (
    f'Row count mismatch between local splits and Hub copy: {mismatched_splits}'
)

# Verify a sample decodes correctly
sample = ds_verify['train'][0]
assert sample['audio']['sampling_rate'] == 16000, 'Sampling rate mismatch'
assert len(sample['audio']['array']) > 0, 'Empty audio'
print(f'\nSample verification PASSED (sr={sample["audio"]["sampling_rate"]}, '
      f'len={len(sample["audio"]["array"])})')

print(f'\n=== ALL VERIFICATIONS PASSED ===')