# English-Gujarati NMT: Data Preparation

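This notebook prepares the data for training the NMT models: it downloads the FLORES English–Gujarati parallel corpus, preprocesses it, creates train/validation/test splits, and trains the source and target tokenizers.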

In [None]:
# Install dependencies
# The PyTorch packages are pulled from the PyTorch wheel index for CUDA 11.8 (cu118).
# NOTE(review): neither command pins versions, so a fresh install may resolve to
# different releases over time; consider pinning for reproducibility.
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Remaining third-party packages, installed from the default package index.
%pip install transformers tokenizers sentencepiece datasets sacrebleu pyyaml tqdm wandb requests

In [None]:
# Make the project sources available to this notebook.
# Option 1: Clone from GitHub (uncomment both lines below)
# !git clone https://github.com/ns-1456/NMT.git
# %cd NMT

# Option 2: Upload files manually
# Upload the entire project folder to Colab
import sys
from pathlib import Path

# Put the current working directory (expected to be the project root) at the
# front of sys.path so that the `src` package can be imported.
# Any existing copies of the entry are dropped first, which keeps the cell
# idempotent: re-running it does not pile up duplicate sys.path entries.
project_root = str(Path.cwd())
sys.path[:] = [project_root] + [entry for entry in sys.path if entry != project_root]

In [None]:
# Fetch the raw parallel corpus into the local data directory.
from pathlib import Path

from src.data.download import prepare_dataset
from src.data.preprocess import preprocess_parallel_files
from src.data.dataset import create_data_splits

# Root folder for all data artifacts produced by this notebook.
data_dir = Path("data")
data_dir.mkdir(exist_ok=True)

print("Downloading FLORES dataset...")
# prepare_dataset hands back the pair of (English, Gujarati) file paths.
downloaded_files = prepare_dataset(data_dir, dataset_name="flores")
en_file, gu_file = downloaded_files
print(f"Downloaded: {en_file}, {gu_file}")

In [None]:
# Preprocess data
# Create the output directory up front so that writing the cleaned files does
# not depend on preprocess_parallel_files creating "data/processed" itself.
processed_dir = data_dir / "processed"
processed_dir.mkdir(parents=True, exist_ok=True)

print("Preprocessing data...")
preprocess_parallel_files(
    source_file=en_file,
    target_file=gu_file,
    output_source=processed_dir / "en.txt",
    output_target=processed_dir / "gu.txt",
    # Length bounds and de-duplication flag are passed straight through;
    # their exact semantics are defined by preprocess_parallel_files.
    min_length=3,
    max_length=128,
    remove_dup=True
)
print("Preprocessing complete!")

In [None]:
# Split the preprocessed corpus into train/validation/test sets.
processed_dir = data_dir / "processed"

# 80/10/10 split with a fixed seed.
split_options = dict(
    train_ratio=0.8,
    val_ratio=0.1,
    test_ratio=0.1,
    seed=42,
)

print("Creating data splits...")
create_data_splits(
    source_file=processed_dir / "en.txt",
    target_file=processed_dir / "gu.txt",
    output_dir=data_dir / "splits",
    **split_options,
)
print("Data splits created!")

In [None]:
# Train tokenizers
from src.utils.config import load_config
from src.tokenization.bpe import BPETokenizer
from src.tokenization.unigram import UnigramTokenizer

# Tokenizer settings and the splits location both come from the project config.
config = load_config("config.yaml")
tokenizer_type = config['tokenization']['type']
vocab_size = config['tokenization']['vocab_size']
splits_dir = Path(config['paths']['splits_dir'])

print(f"Training {tokenizer_type.upper()} tokenizers with vocab_size={vocab_size}...")

# "bpe" selects BPETokenizer (saved as .json); any other configured value falls
# back to UnigramTokenizer (saved as .model).
if tokenizer_type == "bpe":
    tokenizer_cls, tokenizer_ext = BPETokenizer, "json"
else:
    tokenizer_cls, tokenizer_ext = UnigramTokenizer, "model"

source_tokenizer = tokenizer_cls(vocab_size=vocab_size)
target_tokenizer = tokenizer_cls(vocab_size=vocab_size)
source_path = splits_dir / f"source_tokenizer_{tokenizer_type}.{tokenizer_ext}"
target_path = splits_dir / f"target_tokenizer_{tokenizer_type}.{tokenizer_ext}"

# Fail fast with a clear message if the training files are missing, e.g. when
# the configured splits_dir is not the directory the splits were written to.
for side in ("source", "target"):
    train_file = splits_dir / f"train.{side}"
    if not train_file.is_file():
        raise FileNotFoundError(
            f"Training file not found: {train_file}. Check that "
            f"config['paths']['splits_dir'] ({splits_dir}) matches the directory "
            f"the data splits were written to."
        )

# Train and save the source tokenizer first, then the target tokenizer.
for side, tokenizer, save_path in (
    ("source", source_tokenizer, source_path),
    ("target", target_tokenizer, target_path),
):
    tokenizer.train([str(splits_dir / f"train.{side}")], vocab_size=vocab_size)
    tokenizer.save(save_path)
    print(f"{side.capitalize()} tokenizer saved to {save_path}")

print(f"\nSource vocab size: {source_tokenizer.get_vocab_size()}")
print(f"Target vocab size: {target_tokenizer.get_vocab_size()}")

## Data Preparation Complete!

Next steps:
1. Download the prepared data splits and trained tokenizers from this runtime so they are available to the training notebooks
2. Use notebook `02_train_teacher.ipynb` to train the teacher model
3. Use notebook `03_train_student.ipynb` to train the student model with distillation