## 1. Setup and Configuration

In [1]:
import sys
import pickle
import textwrap
import re
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from torch.nn import BatchNorm1d
from torch.optim.lr_scheduler import StepLR
import json
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import warnings
import os
import numpy as np
import pandas as pd

# Fix the NumPy and PyTorch random number generators so runs are repeatable
np.random.seed(42)
torch.manual_seed(42)

# Report the environment in one write: library status, PyTorch build, and
# whichever device PyTorch would select right now
print(
    "Libraries loaded successfully!",
    f"PyTorch Version: {torch.__version__}",
    f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}",
    sep="\n",
)

Libraries loaded successfully!
PyTorch Version: 2.9.1+cu128
Device: cuda


## 2. Data Preprocessing and Cleaning

In [2]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {DEVICE}")

# Input vocabulary: every supported character receives a 1-based index that
# follows its position in this list (index 0 is not assigned to any character)
CHAR_TO_INDEX = {
    char: index
    for index, char in enumerate(
        [
            'د', '؟', 'آ', 'إ', 'ؤ', 'ط', 'م', '،', 'ة', 'ت',
            'ر', 'ئ', 'ا', 'ض', '!', ' ', 'ك', 'غ', 'س', 'ص',
            'أ', 'ل', 'ف', 'ظ', 'ج', '؛', 'ن', 'ع', 'ب', 'ث',
            'ه', 'خ', 'ى', 'ء', 'ز', 'ق', 'ي', 'ش', 'ح', ':',
            'ذ', 'و', '.',
        ],
        start=1,
    )
}

# Reverse lookup: index -> character
INDEX_TO_CHAR = dict(zip(CHAR_TO_INDEX.values(), CHAR_TO_INDEX.keys()))

# Output classes. Keys are the Unicode code points of the diacritics (written
# in hex here); a tuple key is a shadda combined with a second mark. The two
# trailing integer keys are sentinels rather than code points.
LABELS = {
    0x064E: 0,             # fatha
    0x064B: 1,             # fathatan (tanween with fatha)
    0x064F: 2,             # damma
    0x064C: 3,             # dammatan (tanween with damma)
    0x0650: 4,             # kasra
    0x064D: 5,             # kasratan (tanween with kasra)
    0x0652: 6,             # sukun
    0x0651: 7,             # shadda
    (0x0651, 0x064E): 8,   # shadda + fatha
    (0x0651, 0x064B): 9,   # shadda + fathatan
    (0x0651, 0x064F): 10,  # shadda + damma
    (0x0651, 0x064C): 11,  # shadda + dammatan
    (0x0651, 0x0650): 12,  # shadda + kasra
    (0x0651, 0x064D): 13,  # shadda + kasratan
    0: 14,                 # sentinel: character carries no diacritic
    15: 15,                # sentinel: padding position
}

# Reverse lookup: class index -> code point(s) / sentinel
INDEX_TO_LABEL = dict(zip(LABELS.values(), LABELS.keys()))

# Hyperparameters
MAX_LENGTH = 600         # sequence length used for padding/truncation
TRAIN_BATCH_SIZE = 32
VAL_BATCH_SIZE = 256
NUM_EPOCHS = 15
LEARNING_RATE = 1e-3
TFIDF_FEATURES = 100     # width of the per-character feature vector fed to the RNN
HIDDEN_SIZE = 256
NUM_LAYERS = 2
DROPOUT_RATE = 0.2

# Paths (relative to the notebook's working directory)
DATA_PATH = 'data/'
TEST_PATH = 'test/'
OUTPUT_PATH = 'RNN_Output/'

# Make sure all three directories exist before anything reads or writes them
os.makedirs(OUTPUT_PATH, exist_ok=True)
os.makedirs(DATA_PATH, exist_ok=True)
os.makedirs(TEST_PATH, exist_ok=True)

# Echo the effective configuration
print(
    f"Data path: {DATA_PATH}",
    f"Test path: {TEST_PATH}",
    f"Output path: {OUTPUT_PATH}",
    f"Max sequence length: {MAX_LENGTH}",
    f"Device: {DEVICE}",
    sep="\n",
)

Using device: cuda
Data path: data/
Test path: test/
Output path: RNN_Output/
Max sequence length: 600
Device: cuda


In [3]:
# Make the project's local modules (cleaner, preprocessor, dataset_builder)
# importable. The checkout location can be overridden with the
# ARABIC_DIACRITIZATION_ROOT environment variable so the notebook is not tied
# to one machine; the original path remains the default.
PROJECT_ROOT = os.environ.get(
    'ARABIC_DIACRITIZATION_ROOT',
    '/home/ahmed/Desktop/NLP/Arabic-Diacritization'
)
# Keep the project root first on sys.path without stacking duplicate entries
# each time this cell is re-run.
if sys.path[:1] != [PROJECT_ROOT]:
    sys.path.insert(0, PROJECT_ROOT)

# These imports must come after the sys.path change above
from cleaner import TextCleaner
from preprocessor import TextPreprocessor
from dataset_builder import DatasetBuilder

# Initialize preprocessing components
cleaner = TextCleaner()
preprocessor = TextPreprocessor(
    cleaner=cleaner,
    input_path=DATA_PATH,
    output_path=OUTPUT_PATH,
    max_length=MAX_LENGTH,
    with_labels=True
)

# Build the dataset
dataset_builder = DatasetBuilder(
    preprocessor=preprocessor,
    char_to_index=CHAR_TO_INDEX,
    label_map=LABELS,
    max_length=MAX_LENGTH,
    device=DEVICE
)

print("Preprocessing components initialized!")

Preprocessing components initialized!


In [11]:
# Data preprocessing and cleaning
print("=" * 50)
print("PREPROCESSING DATASET")
print("=" * 50)


def _try_create_dataloader(description, data_type, batch_size, with_labels):
    """
    Build one dataloader through `dataset_builder`, reporting success or failure.

    Args:
        description: capitalised split name used in the status messages
            (e.g. "Training")
        data_type: split identifier forwarded to `create_dataloader`
        batch_size: batch size forwarded to `create_dataloader`
        with_labels: forwarded to `create_dataloader`

    Returns:
        The dataloader, or None when creation failed. Failures are reported
        rather than raised so the remaining splits can still be built; later
        cells check for None before using a loader.
    """
    try:
        loader = dataset_builder.create_dataloader(
            data_type=data_type, batch_size=batch_size, with_labels=with_labels
        )
        print(f"✓ {description} dataloader created with {len(loader)} batches")
        return loader
    except Exception as e:
        # Include the exception type: a bare message such as '(' (a KeyError
        # rendered without its type) gives no hint about what went wrong.
        print(f"✗ Error creating {description.lower()} dataloader: {type(e).__name__}: {e}")
        return None


# Create dataloaders using the dataset builder
train_loader = _try_create_dataloader('Training', 'train', TRAIN_BATCH_SIZE, with_labels=True)
val_loader = _try_create_dataloader('Validation', 'val', VAL_BATCH_SIZE, with_labels=True)
test_loader = _try_create_dataloader('Test', 'test', VAL_BATCH_SIZE, with_labels=False)

PREPROCESSING DATASET
✓ Training dataloader created with 3355 batches
✓ Validation dataloader created with 21 batches
✗ Error creating test dataloader: '('


## 3. Feature Extraction with TF-IDF

In [None]:
def get_tfidf_features(sentences_list, max_features=100, ngram_range=(1, 3)):
    """
    Fit a character-level TF-IDF vectorizer on the given sentences.

    Args:
        sentences_list: list of sentences (strings)
        max_features: maximum number of TF-IDF features to keep
        ngram_range: (min_n, max_n) bounds of the character n-gram sizes.
            Must be a 2-tuple; the default covers unigrams through trigrams.

    Returns:
        tfidf_vectorizer: fitted TfidfVectorizer object
        tfidf_matrix: sparse float32 matrix of TF-IDF features, one row per
            sentence
    """
    tfidf_vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        analyzer='char',   # character-level analysis
        lowercase=False,   # do not case-fold the input
        dtype=np.float32
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences_list)
    return tfidf_vectorizer, tfidf_matrix


def convert_sequences_to_tfidf_features(sequences, char_to_index, max_len, tfidf_dim, device):
    """
    Turn character-index sequences into a dense feature tensor for the RNN.

    Despite the name, no TF-IDF statistics are computed here: each character
    index is divided by the vocabulary size and that single scalar is
    repeated across all `tfidf_dim` feature slots.

    Args:
        sequences: tensor of character indices; its first dimension is the
            batch and its second dimension must equal `max_len`
        char_to_index: character to index mapping (only its size is used)
        max_len: sequence length of the output
        tfidf_dim: size of the last (feature) dimension of the output
        device: torch device the result is moved to

    Returns:
        float tensor of shape (batch_size, max_len, tfidf_dim)
    """
    vocabulary_size = float(len(char_to_index))

    # (batch, seq) -> (batch, seq, 1), scaled by the vocabulary size
    scaled = sequences.float().unsqueeze(-1) / vocabulary_size

    # Broadcast the scalar along the feature axis (a view, no data copy)
    features = scaled.expand(sequences.shape[0], max_len, tfidf_dim)

    return features.to(device)


print("TF-IDF feature extraction functions defined!")

TF-IDF feature extraction functions defined!


## 4. RNN Model Architecture

In [6]:
class RNNWithTfidf(nn.Module):
    """
    Bidirectional RNN tagger that predicts one diacritic class per character.

    The network has no embedding layer: it consumes a ready-made feature
    vector of width `tfidf_dim` for every sequence position and emits
    `output_size` class scores for that position.
    """

    def __init__(self, tfidf_dim, hidden_size, output_size, num_layers=2, dropout_rate=0.2):
        """
        Args:
            tfidf_dim: width of the per-position input feature vector
            hidden_size: hidden units per RNN direction
            output_size: number of output classes
            num_layers: number of stacked RNN layers
            dropout_rate: dropout probability, used between RNN layers (only
                when there is more than one layer) and on the RNN output
        """
        super().__init__()

        # Remember the configuration the model was built with
        self.tfidf_dim = tfidf_dim
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        # Forward and backward hidden states are concatenated by the RNN
        bidirectional_width = hidden_size * 2

        # nn.RNN applies dropout only between stacked layers, so it is
        # switched off for a single-layer network
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0

        self.rnn = nn.RNN(
            input_size=tfidf_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=inter_layer_dropout,
        )

        # Normalises each of the concatenated hidden features
        self.batchnorm = nn.BatchNorm1d(bidirectional_width)

        # Regularisation applied to the RNN output
        self.dropout = nn.Dropout(dropout_rate)

        # Per-position classifier over the diacritic classes
        self.output_layer = nn.Linear(bidirectional_width, output_size)

    def forward(self, x):
        """
        Compute class scores for every position of every sequence.

        Args:
            x: tensor of shape (batch_size, seq_length, tfidf_dim)

        Returns:
            tensor of shape (batch_size, seq_length, output_size)
        """
        # (batch, seq, 2 * hidden); the final hidden state is not needed
        hidden_states, _ = self.rnn(x)
        hidden_states = self.dropout(hidden_states)

        # BatchNorm1d expects the feature axis second, so swap the sequence
        # and feature axes around the call and swap them back afterwards
        channels_first = hidden_states.permute(0, 2, 1)
        normalized = self.batchnorm(channels_first).permute(0, 2, 1)

        return self.output_layer(normalized)


# Build the network from the notebook-level hyperparameters (one output class
# per entry of LABELS), then move it to the configured device
model = RNNWithTfidf(
    TFIDF_FEATURES,
    HIDDEN_SIZE,
    len(LABELS),
    num_layers=NUM_LAYERS,
    dropout_rate=DROPOUT_RATE,
)
model = model.to(DEVICE)

# Show the layer layout followed by the total number of parameters
print("RNN Model Architecture:")
print(model)
print("\nTotal parameters: {:,}".format(sum(param.numel() for param in model.parameters())))

RNN Model Architecture:
RNNWithTfidf(
  (rnn): RNN(100, 256, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (batchnorm): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (output_layer): Linear(in_features=512, out_features=16, bias=True)
)

Total parameters: 586,768


## 5. Model Training

In [7]:
def train_model(model, train_loader, val_loader, num_epochs=NUM_EPOCHS, 
                learning_rate=LEARNING_RATE, device=DEVICE, output_path=OUTPUT_PATH):
    """
    Train the model, restore the weights of the best validation epoch and
    save them together with a JSON metadata file.

    Args:
        model: RNNWithTfidf model
        train_loader: training dataloader yielding (sequences, labels) batches
        val_loader: validation dataloader yielding (sequences, labels) batches
        num_epochs: number of training epochs
        learning_rate: initial Adam learning rate (multiplied by 0.1 every
            5 epochs by the scheduler)
        device: torch device
        output_path: directory receiving 'rnn_tfidf_model.pth' and
            'rnn_tfidf_model_meta.json'

    Returns:
        (model, history): the model carrying the best-epoch weights, and a
        dict of per-epoch lists keyed 'train_loss', 'train_acc', 'train_f1',
        'val_loss', 'val_acc', 'val_f1'.
    """

    # Setup optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=15)  # Ignore padding token
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    # Training history
    history = {
        'train_loss': [], 'train_acc': [], 'train_f1': [],
        'val_loss': [], 'val_acc': [], 'val_f1': []
    }

    best_f1 = -1
    best_model_state = None

    print("\n" + "=" * 70)
    print("TRAINING MODEL")
    print("=" * 70)

    for epoch in range(num_epochs):
        # ---------------- Training phase ----------------
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0
        train_preds, train_trues = [], []

        for batch_sequences, batch_labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]", leave=False):
            optimizer.zero_grad()

            # Labels must live on the same device as the model outputs
            # (a no-op when the dataloader already placed them there)
            batch_labels = batch_labels.to(device)

            # Convert sequences to the model's input features
            tfidf_features = convert_sequences_to_tfidf_features(
                batch_sequences, CHAR_TO_INDEX, MAX_LENGTH, TFIDF_FEATURES, device
            )

            # Forward pass
            outputs = model(tfidf_features)

            # Flatten (batch, seq, classes) -> (batch * seq, classes)
            flat_outputs = outputs.view(-1, outputs.shape[-1])
            flat_labels = batch_labels.view(-1)

            # Mask selecting the real (non-padding) positions
            mask = (flat_labels != 15)

            # Loss over real positions only
            loss = criterion(flat_outputs[mask], flat_labels[mask])

            # Backward pass with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_loss += loss.item()

            # Accuracy / F1 bookkeeping on real positions
            pred = flat_outputs.argmax(dim=1)
            train_correct += (pred[mask] == flat_labels[mask]).sum().item()
            train_total += mask.sum().item()

            train_preds.extend(pred[mask].cpu().tolist())
            train_trues.extend(flat_labels[mask].cpu().tolist())

        train_loss /= len(train_loader)
        train_accuracy = train_correct / train_total if train_total > 0 else 0
        train_f1 = f1_score(train_trues, train_preds, average='macro', zero_division=0)

        # ---------------- Validation phase ----------------
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        val_preds, val_trues = [], []

        with torch.inference_mode():
            for val_seq, val_label in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]", leave=False):
                val_label = val_label.to(device)

                tfidf_features = convert_sequences_to_tfidf_features(
                    val_seq, CHAR_TO_INDEX, MAX_LENGTH, TFIDF_FEATURES, device
                )

                outputs = model(tfidf_features)

                flat_outputs = outputs.view(-1, outputs.shape[-1])
                flat_labels = val_label.view(-1)

                # Padding positions are skipped through the criterion's ignore_index
                loss = criterion(flat_outputs, flat_labels)
                val_loss += loss.item()

                pred = flat_outputs.argmax(dim=1)
                mask = (flat_labels != 15)

                val_correct += (pred[mask] == flat_labels[mask]).sum().item()
                val_total += mask.sum().item()

                val_preds.extend(pred[mask].cpu().tolist())
                val_trues.extend(flat_labels[mask].cpu().tolist())

        val_loss /= len(val_loader)
        val_accuracy = val_correct / val_total if val_total > 0 else 0
        val_f1 = f1_score(val_trues, val_preds, average='macro', zero_division=0)

        # Store history
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_accuracy)
        history['train_f1'].append(train_f1)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_accuracy)
        history['val_f1'].append(val_f1)

        # Update learning rate
        scheduler.step()

        # Print epoch results
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train: Loss={train_loss:.4f}, Acc={train_accuracy*100:.2f}%, F1={train_f1:.3f}")
        print(f"  Val:   Loss={val_loss:.4f}, Acc={val_accuracy*100:.2f}%, F1={val_f1:.3f}")

        # Remember the best model (kept in memory; written to disk after the loop)
        if val_f1 > best_f1:
            best_f1 = val_f1
            # state_dict() tensors share storage with the live parameters, so a
            # shallow dict.copy() would keep changing as training continues.
            # Clone every tensor to freeze a real snapshot of this epoch.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"  ★ New best model saved (F1={best_f1:.3f})")

    # Load best model
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Save model and metadata
    model_path = os.path.join(output_path, 'rnn_tfidf_model.pth')
    meta_path = os.path.join(output_path, 'rnn_tfidf_model_meta.json')

    torch.save(model.state_dict(), model_path)

    metadata = {
        'model_type': 'RNNWithTfidf',
        'best_val_f1': float(best_f1),
        'tfidf_dim': TFIDF_FEATURES,
        'hidden_size': HIDDEN_SIZE,
        'num_layers': NUM_LAYERS,
        'dropout_rate': DROPOUT_RATE,
        # Record the rate this run actually used, not the notebook default
        'learning_rate': learning_rate,
        'num_epochs': num_epochs,
        'max_sequence_length': MAX_LENGTH,
        'train_batch_size': TRAIN_BATCH_SIZE,
        'vocab_size': len(CHAR_TO_INDEX) + 1,
        'output_classes': len(LABELS)
    }

    with open(meta_path, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, ensure_ascii=False, indent=4)

    print("\n" + "=" * 70)
    print(f"✓ Training completed!")
    print(f"✓ Model saved to: {model_path}")
    print(f"✓ Metadata saved to: {meta_path}")
    print("=" * 70)

    return model, history


# Training needs both loaders; either one may be None if its creation failed
if train_loader is None or val_loader is None:
    print("⚠ Warning: Could not create train/val loaders. Skipping training.")
else:
    model, training_history = train_model(
        model,
        train_loader,
        val_loader,
        num_epochs=NUM_EPOCHS,
        learning_rate=LEARNING_RATE,
        device=DEVICE,
        output_path=OUTPUT_PATH,
    )


TRAINING MODEL


                                                                       

Epoch 1/15
  Train: Loss=0.7168, Acc=75.21%, F1=0.328
  Val:   Loss=0.4314, Acc=85.11%, F1=0.506
  ★ New best model saved (F1=0.506)


                                                                       

Epoch 2/15
  Train: Loss=0.4307, Acc=85.39%, F1=0.499
  Val:   Loss=0.3209, Acc=89.17%, F1=0.622
  ★ New best model saved (F1=0.622)


                                                                       

Epoch 3/15
  Train: Loss=0.3577, Acc=88.08%, F1=0.598
  Val:   Loss=0.2719, Acc=91.01%, F1=0.676
  ★ New best model saved (F1=0.676)


                                                                       

Epoch 4/15
  Train: Loss=0.3179, Acc=89.55%, F1=0.631
  Val:   Loss=0.2428, Acc=92.02%, F1=0.694
  ★ New best model saved (F1=0.694)


                                                                       

Epoch 5/15
  Train: Loss=0.2925, Acc=90.47%, F1=0.653
  Val:   Loss=0.2243, Acc=92.77%, F1=0.710
  ★ New best model saved (F1=0.710)


                                                                       

Epoch 6/15
  Train: Loss=0.2532, Acc=91.84%, F1=0.687
  Val:   Loss=0.1993, Acc=93.55%, F1=0.738
  ★ New best model saved (F1=0.738)


                                                                       

Epoch 7/15
  Train: Loss=0.2478, Acc=92.02%, F1=0.692
  Val:   Loss=0.1958, Acc=93.68%, F1=0.743
  ★ New best model saved (F1=0.743)


                                                                       

Epoch 8/15
  Train: Loss=0.2443, Acc=92.14%, F1=0.696
  Val:   Loss=0.1925, Acc=93.78%, F1=0.745
  ★ New best model saved (F1=0.745)


                                                                       

Epoch 9/15
  Train: Loss=0.2414, Acc=92.23%, F1=0.697
  Val:   Loss=0.1901, Acc=93.86%, F1=0.745


                                                                        

Epoch 10/15
  Train: Loss=0.2386, Acc=92.32%, F1=0.701
  Val:   Loss=0.1878, Acc=93.93%, F1=0.749
  ★ New best model saved (F1=0.749)


                                                                        

Epoch 11/15
  Train: Loss=0.2340, Acc=92.47%, F1=0.705
  Val:   Loss=0.1854, Acc=94.01%, F1=0.752
  ★ New best model saved (F1=0.752)


                                                                        

Epoch 12/15
  Train: Loss=0.2332, Acc=92.51%, F1=0.706
  Val:   Loss=0.1849, Acc=94.03%, F1=0.751


                                                                        

Epoch 13/15
  Train: Loss=0.2329, Acc=92.52%, F1=0.706
  Val:   Loss=0.1847, Acc=94.04%, F1=0.752


                                                                        

Epoch 14/15
  Train: Loss=0.2327, Acc=92.52%, F1=0.706
  Val:   Loss=0.1843, Acc=94.05%, F1=0.751


                                                                        

Epoch 15/15
  Train: Loss=0.2324, Acc=92.53%, F1=0.707
  Val:   Loss=0.1841, Acc=94.05%, F1=0.752

✓ Training completed!
✓ Model saved to: RNN_Output/rnn_tfidf_model.pth
✓ Metadata saved to: RNN_Output/rnn_tfidf_model_meta.json


## 6. Model Evaluation

In [None]:

def evaluate_model(model, test_loader, device=DEVICE):
    """
    Score the model on a labelled dataloader.

    Args:
        model: trained RNNWithTfidf model
        test_loader: dataloader yielding (sequences, labels) batches; each
            batch is unpacked into exactly two items, so labels are required
        device: torch device

    Returns:
        dict with keys 'accuracy', 'macro_f1', 'weighted_f1', 'predictions'
        and 'ground_truth'. The last two are flat lists covering only the
        positions whose true label is neither 15 nor 16.
    """
    model.eval()

    kept_predictions = []
    kept_targets = []
    n_correct = 0
    n_scored = 0

    print("\n" + "=" * 70)
    print("EVALUATING MODEL ON TEST SET")
    print("=" * 70)

    with torch.inference_mode():
        for test_seq, test_label in tqdm(test_loader, desc="Evaluating"):
            # Build the model input from the raw index sequences
            features = convert_sequences_to_tfidf_features(
                test_seq, CHAR_TO_INDEX, MAX_LENGTH, TFIDF_FEATURES, device
            )

            # Most likely class per position, flattened alongside the targets
            flat_preds = model(features).argmax(dim=2).view(-1)
            flat_trues = test_label.view(-1)

            # Keep every position whose target is neither special value
            # (15 = padding, 16 = unknown)
            keep = ~((flat_trues == 15) | (flat_trues == 16))
            scored_preds = flat_preds[keep]
            scored_trues = flat_trues[keep]

            n_correct += (scored_preds == scored_trues).sum().item()
            n_scored += keep.sum().item()

            kept_predictions.extend(scored_preds.cpu().numpy())
            kept_targets.extend(scored_trues.cpu().numpy())

    # Avoid dividing by zero when nothing was scored
    accuracy = n_correct / n_scored if n_scored > 0 else 0
    macro_f1 = f1_score(kept_targets, kept_predictions, average='macro', zero_division=0)
    weighted_f1 = f1_score(kept_targets, kept_predictions, average='weighted', zero_division=0)

    print(f"\nTest Set Results:")
    print(f"  Accuracy:  {accuracy*100:.2f}%")
    print(f"  Macro F1:  {macro_f1:.4f}")
    print(f"  Weighted F1: {weighted_f1:.4f}")
    print("=" * 70)

    results = {
        'accuracy': accuracy,
        'macro_f1': macro_f1,
        'weighted_f1': weighted_f1,
        'predictions': kept_predictions,
        'ground_truth': kept_targets,
    }
    return results


# Evaluation is skipped when the test dataloader could not be created
if test_loader is None:
    print("⚠ Warning: Test loader not available. Skipping evaluation.")
else:
    test_results = evaluate_model(model, test_loader, device=DEVICE)



## 7. Model Loading and Prediction

In [10]:
def load_trained_model(model_path, device=DEVICE):
    """
    Rebuild the network and load previously saved weights into it.

    The architecture is recreated from the notebook-level hyperparameters
    (TFIDF_FEATURES, HIDDEN_SIZE, NUM_LAYERS, DROPOUT_RATE, len(LABELS)), so
    these must match the values used when the checkpoint was written.

    Args:
        model_path: path to a state dict saved with torch.save
        device: torch device the weights are mapped to and the model moved to

    Returns:
        the loaded model, switched to evaluation mode
    """
    # Keyword names must match RNNWithTfidf.__init__ (tfidf_dim / dropout_rate)
    model = RNNWithTfidf(
        tfidf_dim=TFIDF_FEATURES,
        hidden_size=HIDDEN_SIZE,
        output_size=len(LABELS),
        num_layers=NUM_LAYERS,
        dropout_rate=DROPOUT_RATE
    )

    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine
    state_dict = torch.load(model_path, map_location=device)
    model.load_state_dict(state_dict)
    model.to(device)
    model.eval()

    print(f"✓ Model loaded from: {model_path}")
    return model


def _diacritic_for_label(label_index):
    """Map a predicted class index to its diacritic string ('' when there is none)."""
    code = INDEX_TO_LABEL.get(int(label_index))
    # Unknown index, "no diacritic" (0) and "padding" (15) add nothing to the text
    if code is None or code in (0, 15):
        return ''
    # Tuple entries are a shadda combined with a second mark
    if isinstance(code, tuple):
        return ''.join(chr(point) for point in code)
    return chr(code)


def predict_on_test_set(model, test_data_path, output_csv_path=None):
    """
    Diacritize every line of a text file and save the results to CSV.

    Args:
        model: trained model
        test_data_path: path to a UTF-8 text file with one sentence per line
        output_csv_path: where to write the CSV; defaults to
            'test_predictions.csv' inside OUTPUT_PATH

    Returns:
        DataFrame with columns 'original', 'cleaned' and 'diacritized'
    """
    model.eval()

    # Load test sentences
    with open(test_data_path, 'r', encoding='utf-8') as f:
        test_sentences = [line.strip() for line in f]

    predictions_list = []

    print("\n" + "=" * 70)
    print("GENERATING PREDICTIONS ON TEST SET")
    print("=" * 70)

    with torch.inference_mode():
        for sentence in tqdm(test_sentences, desc="Predicting"):
            # Clean and preprocess with the notebook's `cleaner` / `preprocessor`
            # instances.
            # NOTE(review): assumes TextCleaner.clean_text and
            # TextPreprocessor.preprocess_text exist and that preprocess_text
            # returns one character index per character of `cleaned` — confirm
            # against the project modules.
            cleaned = cleaner.clean_text(sentence)
            sequences = preprocessor.preprocess_text(cleaned)

            # Pad (or truncate) to exactly MAX_LENGTH entries.
            # NOTE(review): 15 is the *label* padding value and is also the
            # index of '!' in CHAR_TO_INDEX — confirm which padding index the
            # dataset builder uses for input sequences.
            padded_seq = sequences + [15] * (MAX_LENGTH - len(sequences))
            padded_seq = padded_seq[:MAX_LENGTH]

            # Single-sentence batch
            seq_tensor = torch.tensor([padded_seq], dtype=torch.long, device=DEVICE)

            # Build the model input
            tfidf_features = convert_sequences_to_tfidf_features(
                seq_tensor, CHAR_TO_INDEX, MAX_LENGTH, TFIDF_FEATURES, DEVICE
            )

            # Predicted class index for each position
            outputs = model(tfidf_features)
            pred_indices = outputs.argmax(dim=2)[0].cpu().tolist()

            # Class indices -> diacritic strings
            diacritics = [_diacritic_for_label(idx) for idx in pred_indices]

            # Attach each diacritic to its character; characters beyond the
            # model's MAX_LENGTH window are kept without a diacritic
            diacritized = ''.join(
                char + (diacritics[i] if i < len(diacritics) else '')
                for i, char in enumerate(cleaned)
            )

            predictions_list.append({
                'original': sentence,
                'cleaned': cleaned,
                'diacritized': diacritized
            })

    # Save to CSV
    df = pd.DataFrame(predictions_list)

    if output_csv_path is None:
        output_csv_path = os.path.join(OUTPUT_PATH, 'test_predictions.csv')

    df.to_csv(output_csv_path, index=False, encoding='utf-8')

    print(f"\n✓ Predictions saved to: {output_csv_path}")
    print(f"✓ Total predictions: {len(df)}")
    print("=" * 70)

    return df

TEST_DATA_PATH = "data/sample_test_no_diacritics.txt"

# Only predict when the undiacritized sample file is actually present
if not os.path.exists(TEST_DATA_PATH):
    print(f"⚠ Warning: Test data not found at {TEST_DATA_PATH}")
else:
    test_predictions = predict_on_test_set(model, TEST_DATA_PATH)
    print("\nFirst few predictions:")
    print(test_predictions.head())

