In [2]:
import re


class TextCleaner:
    """Stateless regex helpers that strip noise from Arabic text lines.

    All methods are static. ``clean_lines`` and ``remove_diacritics`` rewrite
    the list they receive in place and also return that same list.
    """

    # Ordered cleaning passes used by ``clean_lines``: (compiled pattern, replacement).
    # They are compiled once here instead of once per line inside the loop.
    _CLEANING_STEPS = (
        # parenthesised numbers such as "(12)" or "(3/4)", then every remaining digit run
        (re.compile(r'\(\s*(\d+)\s*\)|\(\s*(\d+)\s*\/\s*(\d+)\s*\)|\d+'), ''),
        # every kind of bracket is removed outright (not replaced by another bracket)
        (re.compile(r'[\[\{\(\]\}\)]'), ''),
        # slashes, backslashes and hyphens
        (re.compile(r'[/\/\\\-]'), ''),
        # assorted punctuation, quotes and zero-width / directional marks
        (re.compile(r'[,»–\';«*\u200f\u200d\u200b\u200c\u200e"\\~`%…_]'), ''),
        # ASCII letters
        (re.compile(r'[a-zA-Z]'), ''),
        # vulgar-fraction characters (U+00BC-U+00BE, U+2150-U+215E)
        (re.compile(r'[\u00BC-\u00BE\u2150-\u215E]'), ''),
        # code points U+1F000-U+1FFFF (emoji and related pictographs)
        (re.compile(r'[\U0001F000-\U0001FFFF]'), ''),
        # miscellaneous symbols and dingbats (U+2600-U+26FF, U+2700-U+27BF)
        (re.compile(r'[\u2600-\u26FF\u2700-\u27BF]'), ''),
        # finally collapse every whitespace run into a single space (no stripping)
        (re.compile(r'\s+'), ' '),
    )

    # Characters treated as diacritics by ``remove_diacritics``.
    _DIACRITICS = re.compile(r'[\u064B-\u065F\u0670\uFE70-\uFE7F]')

    # A single whitespace character, used by ``count_spaces``.
    _WHITESPACE = re.compile(r'\s')

    @staticmethod
    def replace_regex(text, pattern, replacement=''):
        """Return ``text`` with every match of ``pattern`` replaced.

        Args:
            text: String to process.
            pattern: Compiled pattern, or a pattern string (compiled on the fly).
            replacement: Replacement text; defaults to deleting the match.
        """
        # re.compile returns an already-compiled pattern unchanged.
        return re.compile(pattern).sub(replacement, text)

    @staticmethod
    def clean_lines(lines):
        """Apply every cleaning pass, in order, to each entry of ``lines``.

        The list is modified in place and returned. Diacritics are left
        untouched; whitespace runs are collapsed to one space but leading or
        trailing spaces are not stripped.
        """
        for i, text in enumerate(lines):
            for pattern, replacement in TextCleaner._CLEANING_STEPS:
                text = pattern.sub(replacement, text)
            lines[i] = text
        return lines

    @staticmethod
    def remove_diacritics(lines):
        """Delete diacritic characters from each entry of ``lines``.

        The list is modified in place and returned, so callers that still need
        the diacritized text must copy or persist it before calling this.
        """
        for i, text in enumerate(lines):
            lines[i] = TextCleaner._DIACRITICS.sub('', text)
        return lines

    @staticmethod
    def count_spaces(text):
        """Return the number of whitespace characters in ``text``."""
        return len(TextCleaner._WHITESPACE.findall(text))

In [3]:
import re
import textwrap


class TextPreprocessor:
    """Cleans raw text files and splits them into chunks of bounded length.

    For a split named ``data_type`` the raw text is read from
    ``<input_path>/<data_type>.txt`` and two cleaned files are written to
    ``output_path``: ``cleaned_<data_type>_with_diacritics.txt`` (only filled
    when ``with_labels`` is true) and ``cleaned_<data_type>_without_diacritics.txt``.

    Args:
        cleaner: Object providing ``clean_lines``, ``remove_diacritics`` and
            ``count_spaces``.
        input_path: Directory holding the raw ``<data_type>.txt`` files.
        output_path: Directory receiving the cleaned files.
        max_length: Maximum number of characters per chunk (diacritics excluded).
        with_labels: Whether the diacritized text is kept as well.
    """

    def __init__(self, cleaner, input_path='.', output_path='.', max_length=600, with_labels=True):
        self.cleaner = cleaner
        self.input_path = input_path
        self.output_path = output_path
        self.max_length = max_length
        self.with_labels = with_labels

    @staticmethod
    def _join(directory, filename):
        """Return ``filename`` inside ``directory``, with or without a trailing separator."""
        import os  # local import: the surrounding cell only imports re and textwrap

        return os.path.join(directory, filename)

    def _cleaned_file(self, data_type, variant):
        """Path of the cleaned file for ``data_type``; ``variant`` is 'with' or 'without'."""
        return self._join(self.output_path, f'cleaned_{data_type}_{variant}_diacritics.txt')

    @staticmethod
    def _normalize(line):
        """Drop newlines/tabs, collapse whitespace runs to one space and strip the ends."""
        line = re.sub(r'[\n\r\t]', '', line)
        return re.sub(r'\s+', ' ', line).strip()

    def clean_lines(self, lines, data_type):
        """Clean ``lines``, append them to the cleaned files and return them without diacritics.

        The cleaner works in place, so the diacritized text is written to disk
        before the diacritics are removed from the same list.
        """
        lines = self.cleaner.clean_lines(lines)

        if not lines:
            return lines

        # Save lines with diacritics (must happen before they are stripped below)
        if self.with_labels:
            with open(self._cleaned_file(data_type, 'with'), 'a+', encoding='utf-8') as f:
                f.write('\n'.join(lines) + '\n')

        lines_no_diacritics = self.cleaner.remove_diacritics(lines)

        with open(self._cleaned_file(data_type, 'without'), 'a+', encoding='utf-8') as f:
            f.write('\n'.join(lines_no_diacritics) + '\n')

        return lines_no_diacritics

    def preprocess_file(self, data_type, limit=None):
        """Read ``<data_type>.txt``, clean it and (re)write both cleaned files.

        Args:
            data_type: Name of the split, e.g. ``'train'``.
            limit: Optional maximum number of raw lines to process.

        Returns:
            The cleaned lines without diacritics.
        """
        import os  # local import: the surrounding cell only imports re and textwrap

        # open(..., 'w') below fails if the output directory does not exist yet
        if self.output_path:
            os.makedirs(self.output_path, exist_ok=True)

        # Clear previous output
        for variant in ('with', 'without'):
            with open(self._cleaned_file(data_type, variant), 'w', encoding='utf-8'):
                pass

        # Read raw file
        with open(self._join(self.input_path, f'{data_type}.txt'), 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f]

        if limit is not None:
            lines = lines[:limit]

        return self.clean_lines(lines, data_type)

    def tokenize_file(self, data_type):
        """Split the cleaned files into chunks of at most ``max_length`` characters.

        The text without diacritics is wrapped with ``textwrap.wrap``. The
        diacritized text is then cut at the same word boundaries by giving each
        chunk as many words as its undiacritized counterpart (spaces + 1), so
        both lists line up chunk by chunk.

        Returns:
            ``(chunks_without_diacritics, chunks_with_diacritics)``; the second
            list is empty when ``with_labels`` is false.

        Raises:
            IndexError: If the diacritized file needs more chunks than the
                undiacritized one produced (the two files are out of sync).
        """
        tokenized_no_diacritics = []
        space_counts = []

        with open(self._cleaned_file(data_type, 'without'), 'r', encoding='utf-8') as f:
            for line in f:
                for sentence in textwrap.wrap(self._normalize(line), self.max_length):
                    tokenized_no_diacritics.append(sentence)
                    space_counts.append(self.cleaner.count_spaces(sentence))

        tokenized_with_diacritics = []

        if self.with_labels:
            chunk_index = 0
            with open(self._cleaned_file(data_type, 'with'), 'r', encoding='utf-8') as f:
                for line in f:
                    # Split once and walk through the words instead of re-splitting
                    # the remaining text for every chunk.
                    words = self._normalize(line).split()
                    position = 0
                    while position < len(words):
                        if chunk_index >= len(space_counts):
                            raise IndexError(
                                'diacritized text has more chunks than the text without diacritics'
                            )
                        words_in_chunk = space_counts[chunk_index] + 1
                        chunk_index += 1
                        tokenized_with_diacritics.append(
                            ' '.join(words[position:position + words_in_chunk])
                        )
                        position += words_in_chunk

        return tokenized_no_diacritics, tokenized_with_diacritics


In [4]:
import torch
from torch.utils.data import DataLoader, TensorDataset


class DatasetBuilder:
    """Turns preprocessed text into padded index tensors and DataLoaders.

    Args:
        preprocessor: Object providing ``preprocess_file`` and ``tokenize_file``.
        char_to_index: Mapping from character to input index (0 is used for padding).
        label_map: Mapping from a diacritic code point, or a
            ``(shadda, diacritic)`` tuple, to its class index.
        max_length: Length every sequence is padded to.
        device: Device the tensors are created on.
    """

    SHADDA = 1617            # code point of the shadda mark (U+0651)
    NO_DIACRITIC_LABEL = 14  # class given to a character that carries no diacritic
    PAD_LABEL = 15           # class used to pad label rows up to max_length

    def __init__(self, preprocessor, char_to_index, label_map, max_length=600, device='cpu'):
        self.preprocessor = preprocessor
        self.char_to_index = char_to_index
        self.label_map = label_map
        self.max_length = max_length
        self.device = device

    def encode_chars(self, sequences):
        """Map each string to character indices, right-padded with 0 to ``max_length``.

        Raises:
            KeyError: If a character is missing from ``char_to_index``.
        """
        indexed = [[self.char_to_index[char] for char in seq] for seq in sequences]
        padded = [seq + [0] * (self.max_length - len(seq)) for seq in indexed]
        return torch.tensor(padded).to(self.device)

    def encode_labels(self, sequences_with_diacritics):
        """Produce one diacritic class per non-diacritic character of each sentence.

        Every character whose code point is not a key of ``label_map`` counts as
        a base character (letters, spaces and punctuation alike) and receives
        the class of the mark(s) that follow it:

        * shadda followed by a mark that has a combined class -> that class;
        * shadda alone, or followed by a mark with no combined class -> shadda;
        * any other single mark from ``label_map`` -> its class;
        * nothing -> ``NO_DIACRITIC_LABEL``.

        Marks that are not attached this way are skipped. Rows are right-padded
        with ``PAD_LABEL`` up to ``max_length``.
        """
        all_labels = []

        for sentence in sequences_with_diacritics:
            labels = []
            i = 0
            length = len(sentence)

            while i < length:
                if ord(sentence[i]) in self.label_map:
                    # stray diacritic that was not consumed by a preceding character
                    i += 1
                    continue

                if i + 1 < length:
                    d1 = ord(sentence[i + 1])

                    if d1 == self.SHADDA:
                        if i + 2 < length:
                            combined = (self.SHADDA, ord(sentence[i + 2]))
                            # Look up the pair itself: a mark can be known on its own
                            # (e.g. sukun, or a second shadda) without having a
                            # combined class, which used to raise KeyError here.
                            if combined in self.label_map:
                                labels.append(self.label_map[combined])
                                i += 3
                                continue

                        # only shadda
                        labels.append(self.label_map[self.SHADDA])
                        i += 2
                        continue

                    # one diacritic
                    if d1 in self.label_map:
                        labels.append(self.label_map[d1])
                        i += 2
                        continue

                # no diacritic
                labels.append(self.NO_DIACRITIC_LABEL)
                i += 1

            # pad
            if len(labels) < self.max_length:
                labels.extend([self.PAD_LABEL] * (self.max_length - len(labels)))

            all_labels.append(labels)

        return torch.tensor(all_labels, device=self.device)

    def create_dataloader(self, data_type, batch_size=256, with_labels=True):
        """Preprocess and tokenize the ``data_type`` split and wrap it in a DataLoader.

        Batches are ``(inputs, labels)`` when ``with_labels`` is true and
        ``(inputs,)`` otherwise. Preprocessing is re-run on every call.
        """
        self.preprocessor.preprocess_file(data_type)
        sequences, sequences_with_diacritics = self.preprocessor.tokenize_file(data_type)

        data_tensor = self.encode_chars(sequences)

        if with_labels:
            labels_tensor = self.encode_labels(sequences_with_diacritics)
            dataset = TensorDataset(data_tensor, labels_tensor)
        else:
            dataset = TensorDataset(data_tensor)

        return DataLoader(dataset, batch_size=batch_size)

In [11]:
import torch
import torch.nn as nn
from torch.nn import BatchNorm1d
import pickle
import json


class CharBiLSTM(nn.Module):
    """Character-level bidirectional LSTM that scores one class per position.

    Pipeline: embedding -> (stacked) bidirectional LSTM -> BatchNorm1d -> linear.

    ``BatchNorm1d(max_length)`` is applied directly to the
    ``(batch, sequence, 2 * hidden_dim)`` LSTM output, so the sequence axis acts
    as the channel axis: inputs must contain exactly ``max_length`` positions.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each character embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: Number of output classes.
        dropout_rate: Dropout between stacked LSTM layers.
        num_layers: Number of stacked LSTM layers.
        max_length: Fixed sequence length expected by the batch-norm layer.
    """

    def __init__(
        self,
        vocab_size=44,
        embedding_dim=300,
        hidden_dim=256,
        output_dim=16,
        dropout_rate=0.2,
        num_layers=1,
        max_length=600,
    ):
        super(CharBiLSTM, self).__init__()

        # Keep the hyper-parameters on the instance so that code which persists
        # them (e.g. a trainer writing a metadata file) can read them back.
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_dim
        self.output_size = output_dim
        self.dropout_rate = dropout_rate
        self.num_layers = num_layers
        self.max_length = max_length

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            # Inter-layer dropout does nothing with a single layer and only
            # triggers a warning there, so it is disabled in that case.
            dropout=dropout_rate if num_layers > 1 else 0.0,
        )
        self.batchnorm = nn.BatchNorm1d(max_length)
        self.output = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        """Return class scores of shape ``(batch, max_length, output_dim)`` for index tensor ``x``."""
        x_embed = self.embedding(x)
        lstm_out, _ = self.lstm(x_embed)
        lstm_out = self.batchnorm(lstm_out)
        out = self.output(lstm_out)
        return out

    @staticmethod
    def _resolve_device(device):
        """Return ``device``, or CUDA when available and CPU otherwise if it is None."""
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        return device

    @staticmethod
    def _load_metadata(base_path, meta_file):
        """Read the JSON metadata file located in ``base_path``."""
        import os  # local import: the surrounding cell does not import os

        with open(os.path.join(base_path, meta_file), "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _from_metadata(metadata, device):
        """Build an untrained model from the hyper-parameters stored in ``metadata``."""
        return CharBiLSTM(
            vocab_size=metadata["vocab_size"],
            embedding_dim=metadata["embedding_size"],
            hidden_dim=metadata["hidden_size"],
            output_dim=metadata["output_classes"],
            dropout_rate=metadata["dropout_rate"],
            num_layers=metadata["num_layers"],
            max_length=metadata["max_sequence_length"],
        ).to(device)

    @staticmethod
    def load_model_pkl(
        model_file="best_model.pkl",
        meta_file="best_model_meta.json",
        device=None,
        base_path="/kaggle/working/",
    ):
        """Load a model whose ``state_dict`` was stored with ``pickle``.

        Args:
            model_file: Pickle file containing the state dict.
            meta_file: JSON file with the model hyper-parameters.
            device: Target device; defaults to CUDA when available.
            base_path: Directory containing both files.

        Returns:
            ``(model, metadata)`` with the model in evaluation mode.
        """
        import os  # local import: the surrounding cell does not import os

        device = CharBiLSTM._resolve_device(device)
        metadata = CharBiLSTM._load_metadata(base_path, meta_file)
        model = CharBiLSTM._from_metadata(metadata, device)

        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only use this loader on files from a trusted source.
        with open(os.path.join(base_path, model_file), "rb") as f:
            state_dict = pickle.load(f)

        model.load_state_dict(state_dict)
        model.to(device)

        model.eval()
        return model, metadata

    @staticmethod
    def load_model_pth(
        model_file="best_model.pth",
        meta_file="best_model_meta.json",
        device=None,
        base_path="./",
    ):
        """Load a model whose ``state_dict`` was stored with ``torch.save``.

        Args:
            model_file: File containing the state dict. It used to be ignored
                in favour of a hard-coded ``"model_weights.pth"``; pass that
                name explicitly to load such a file.
            meta_file: JSON file with the model hyper-parameters.
            device: Target device; defaults to CUDA when available.
            base_path: Directory containing both files.

        Returns:
            ``(model, metadata)`` with the model in evaluation mode.
        """
        import os  # local import: the surrounding cell does not import os

        device = CharBiLSTM._resolve_device(device)
        metadata = CharBiLSTM._load_metadata(base_path, meta_file)
        model = CharBiLSTM._from_metadata(metadata, device)

        # map_location lets weights saved on a GPU load on a CPU-only machine.
        state_dict = torch.load(os.path.join(base_path, model_file), map_location=device)
        model.load_state_dict(state_dict)
        model.to(device)

        model.eval()
        return model, metadata

In [6]:
import torch
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
import json
from sklearn.metrics import f1_score
from models import CharBiLSTM


class Trainer:
    """Training / validation loop with best-model and last-checkpoint saving.

    Constructing a trainer immediately (re)writes ``meta_file`` with the model
    hyper-parameters.

    Args:
        model: Network exposing ``embedding``, ``lstm`` and ``output`` sub-modules.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Optional LR scheduler stepped once per epoch.
        criterion: Loss called as ``criterion(logits, targets)`` on non-padding positions.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        pad_idx: Label value marking padding; excluded from loss and metrics.
        max_length: Sequence length recorded in the metadata file.
        device: Device to run on; defaults to CUDA when available.
        checkpoint_file: Where the latest checkpoint is written after every epoch.
        best_model_file: Where the best ``state_dict`` (by validation accuracy) is written.
        meta_file: Where the hyper-parameter JSON is written.
    """

    def __init__(
        self, model, optimizer=None, scheduler=None, criterion=None,
        train_loader=None, val_loader=None,
        pad_idx=15, max_length=600, device=None,
        checkpoint_file="checkpoint.pth", best_model_file="best_model.pth",
        meta_file="best_model_meta.json"
    ):
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(self.device)

        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.train_loader = train_loader
        self.val_loader = val_loader

        self.pad_idx = pad_idx
        self.max_length = max_length

        self.checkpoint_file = checkpoint_file
        self.best_model_file = best_model_file
        self.meta_file = meta_file

        self.best_val_acc = -1

        # Save metadata once
        self._save_metadata()

    def _save_metadata(self):
        """Write the hyper-parameters needed to rebuild the model to ``meta_file``.

        Values are taken from attributes on the model when present and
        otherwise from its ``embedding`` / ``lstm`` / ``output`` layers, so the
        model does not have to store them itself. ``learning_rate`` is ``None``
        when no optimizer was supplied.
        """
        model = self.model
        embedding, lstm, head = model.embedding, model.lstm, model.output

        learning_rate = None
        if self.optimizer is not None:
            learning_rate = self.optimizer.param_groups[0]["lr"]

        meta = {
            "vocab_size": getattr(model, "vocab_size", embedding.num_embeddings),
            "embedding_size": embedding.embedding_dim,
            "hidden_size": getattr(model, "hidden_size", lstm.hidden_size),
            "output_classes": getattr(model, "output_size", head.out_features),
            "dropout_rate": getattr(model, "dropout_rate", lstm.dropout),
            "num_layers": getattr(model, "num_layers", lstm.num_layers),
            "max_sequence_length": self.max_length,
            "learning_rate": learning_rate,
            "pad_idx": self.pad_idx,
        }

        with open(self.meta_file, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=4)

    @staticmethod
    def load_checkpoint(
        checkpoint_file="checkpoint.pth",
        meta_file="best_model_meta.json",
        model=None,
        device=None,
    ):
        """Restore model, optimizer and scheduler state from a checkpoint.

        Args:
            checkpoint_file: File written by ``train`` after each epoch.
            meta_file: Hyper-parameter JSON written by the trainer.
            model: Model to load into; a ``CharBiLSTM`` is built from the
                metadata when omitted.
            device: Device to map the checkpoint to; defaults to CUDA when available.

        Returns:
            ``(model, optimizer, scheduler, start_epoch, best_val_acc)`` where
            ``start_epoch`` is the epoch following the one stored in the
            checkpoint. The optimizer is Adam and the scheduler
            ``StepLR(step_size=5, gamma=0.1)``.
        """
        device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")

        checkpoint = torch.load(checkpoint_file, map_location=device)

        with open(meta_file, "r", encoding="utf-8") as f:
            meta = json.load(f)

        if model is None:
            model = CharBiLSTM(
                vocab_size=meta["vocab_size"],
                embedding_dim=meta["embedding_size"],
                hidden_dim=meta["hidden_size"],
                output_dim=meta["output_classes"],
                dropout_rate=meta["dropout_rate"],
                num_layers=meta["num_layers"],
                max_length=meta["max_sequence_length"]
            ).to(device)

        # The metadata may hold no learning rate (trainer built without an
        # optimizer); the value is overwritten by the checkpointed optimizer
        # state below in any case.
        learning_rate = meta.get("learning_rate")
        if learning_rate is None:
            optimizer = torch.optim.Adam(model.parameters())
        else:
            optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

        model.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])

        if checkpoint.get("scheduler_state") is not None:
            scheduler.load_state_dict(checkpoint["scheduler_state"])

        start_epoch = checkpoint["epoch"] + 1
        best_val_acc = checkpoint["best_val_acc"]

        return model, optimizer, scheduler, start_epoch, best_val_acc

    def train_epoch(self):
        """Run one optimisation pass over ``train_loader``.

        Returns:
            ``(accuracy, macro_f1, loss)`` over non-padding positions, where
            ``loss`` is the loss of the last processed batch. All three are
            ``0.0`` when no non-padding position was seen.
        """
        self.model.train()
        correct, total = 0, 0
        preds, trues = [], []
        last_loss = 0.0

        for batch_seq, batch_labels in self.train_loader:
            batch_seq, batch_labels = batch_seq.to(self.device), batch_labels.to(self.device)

            flat_labels = batch_labels.view(-1)
            mask = (flat_labels != self.pad_idx)
            if not mask.any():
                # Nothing to learn from; a loss over zero targets would be NaN.
                continue

            self.optimizer.zero_grad()

            outputs = self.model(batch_seq)
            flat_outputs = outputs.view(-1, outputs.shape[-1])

            loss = self.criterion(flat_outputs[mask], flat_labels[mask])
            loss.backward()
            self.optimizer.step()
            last_loss = loss.item()

            pred = flat_outputs.argmax(dim=1)
            correct += (pred[mask] == flat_labels[mask]).sum().item()
            total += mask.sum().item()

            preds.extend(pred[mask].cpu().tolist())
            trues.extend(flat_labels[mask].cpu().tolist())

        acc = correct / total if total > 0 else 0.0
        f1 = f1_score(trues, preds, average='macro') if trues else 0.0
        return acc, f1, last_loss

    def validate(self):
        """Evaluate on ``val_loader`` without gradients.

        Returns:
            ``(accuracy, macro_f1)`` over non-padding positions; both ``0.0``
            when there is nothing to score.
        """
        self.model.eval()
        correct, total = 0, 0
        preds, trues = [], []

        with torch.inference_mode():
            for batch_seq, batch_labels in self.val_loader:
                batch_seq, batch_labels = batch_seq.to(self.device), batch_labels.to(self.device)
                outputs = self.model(batch_seq)

                flat_pred = outputs.argmax(dim=2).view(-1)
                flat_labels = batch_labels.view(-1)
                mask = (flat_labels != self.pad_idx)

                correct += (flat_pred[mask] == flat_labels[mask]).sum().item()
                total += mask.sum().item()

                preds.extend(flat_pred[mask].cpu().tolist())
                trues.extend(flat_labels[mask].cpu().tolist())

        acc = correct / total if total > 0 else 0.0
        f1 = f1_score(trues, preds, average='macro') if trues else 0.0
        return acc, f1

    def train(self, num_epochs=20, start_epoch=0):
        """Train up to epoch ``num_epochs``, saving the best model and a checkpoint.

        Args:
            num_epochs: Total number of epochs (exclusive upper bound of the epoch index).
            start_epoch: First epoch index to run; pass the value returned by
                ``load_checkpoint`` to resume an interrupted run.

        Returns:
            The best validation accuracy seen so far.
        """
        print("Training starts")

        for epoch in range(start_epoch, num_epochs):
            train_acc, train_f1, loss_val = self.train_epoch()
            val_acc, val_f1 = self.validate()

            if self.scheduler:
                self.scheduler.step()

            print(
                f"Epoch {epoch+1}/{num_epochs} | Loss: {loss_val:.5f} | "
                f"Train Acc: {train_acc*100:.2f}% | Train F1: {train_f1:.3f} | "
                f"Val Acc: {val_acc*100:.2f}% | Val F1: {val_f1:.3f}"
            )

            # SAVE BEST MODEL (based on val_acc)
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                torch.save(self.model.state_dict(), self.best_model_file)
                print(f"New best model saved! (Val Acc = {val_acc*100:.2f}%)")

            # SAVE LAST CHECKPOINT
            torch.save({
                "model_state": self.model.state_dict(),
                "optimizer_state": self.optimizer.state_dict(),
                "scheduler_state": self.scheduler.state_dict() if self.scheduler else None,
                "best_val_acc": self.best_val_acc,
                "epoch": epoch
            }, self.checkpoint_file)

        return self.best_val_acc

    def test(self, test_loader):
        """Print and return accuracy over the non-padding positions of ``test_loader``."""
        self.model.eval()
        correct, total = 0, 0

        with torch.inference_mode():
            for batch_seq, batch_labels in tqdm(test_loader, desc="Testing"):
                batch_seq, batch_labels = batch_seq.to(self.device), batch_labels.to(self.device)
                outputs = self.model(batch_seq)

                pred_labels = outputs.argmax(dim=2)
                mask = (batch_labels != self.pad_idx)

                correct += ((pred_labels == batch_labels) & mask).sum().item()
                total += mask.sum().item()

        acc = correct / total if total > 0 else 0
        print(f"Test Accuracy: {acc*100:.3f}%")
        return acc

In [7]:
import torch
import re
import textwrap
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from cleaner import TextCleaner


def convert2idx(data, char_to_index, max_len=200, device='cpu'):
    """Encode strings as a ``(len(data), max_len)`` tensor of character indices.

    Each string is mapped through ``char_to_index`` and right-padded with 0.
    Characters missing from ``char_to_index`` are encoded as 0 (the padding
    index) instead of raising ``KeyError``, so free-form user input can be
    encoded. Strings longer than ``max_len`` are neither padded nor truncated.

    Args:
        data: Iterable of strings.
        char_to_index: Mapping from character to index.
        max_len: Length every row is padded to.
        device: Device the tensor is created on.

    Returns:
        An integer (``torch.long``) tensor; shape ``(0, max_len)`` for empty ``data``.
    """
    rows = []
    for seq in data:
        ids = [char_to_index.get(ch, 0) for ch in seq]
        rows.append(ids + [0] * (max_len - len(ids)))

    if not rows:
        # torch.tensor([]) would be a 1-D float tensor; keep dtype and rank stable.
        return torch.zeros((0, max_len), dtype=torch.long, device=device)

    return torch.tensor(rows, dtype=torch.long, device=device)


class Predictor:
    """Runs a trained diacritization model on sentences or whole datasets.

    Args:
        model: Network returning class scores of shape ``(batch, sequence, classes)``.
        char_to_index: Mapping from character to input index.
        index_to_label: Mapping from class index to a diacritic code point, a
            tuple of two code points, or ``0`` for "no diacritic".
        device: Device to run on; defaults to CUDA when available.
    """

    PAD_LABEL = 15  # class index used to pad label rows; never rendered or scored

    def __init__(self, model, char_to_index, index_to_label, device=None):
        self.model = model
        self.char_to_index = char_to_index
        self.index_to_label = index_to_label
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        # Input indices that never receive a diacritic: padding (0), the space
        # and the punctuation entries of the character table.
        self.IGNORE_IDX = {0, 2, 8, 15, 16, 26, 40, 43}

    def _ignore_tensor(self):
        """Return ``IGNORE_IDX`` as a tensor on the predictor's device."""
        return torch.tensor(sorted(self.IGNORE_IDX), device=self.device)

    def predict_sentence(self, original_sentence, max_length=200, batch_size=256):
        """Return ``original_sentence`` cleaned and re-diacritized by the model.

        The text is cleaned, stripped of its existing diacritics, split on
        ``'.'`` and wrapped into chunks of at most ``max_length`` characters.
        Predictions for non-ignored characters are then written back after the
        matching characters of the cleaned text.

        Args:
            original_sentence: Raw input text.
            max_length: Chunk length; every chunk is padded to this length.
                NOTE(review): the model presumably requires this to equal the
                sequence length it was built with — confirm against the caller.
            batch_size: Number of chunks per forward pass.
        """
        # Clean & normalize
        clean = TextCleaner.clean_lines([original_sentence.strip()])[0]
        clean = re.sub(r'[\n\r\t]', '', clean)
        clean = re.sub(r'\s+', ' ', clean).strip()
        plain_text = TextCleaner.remove_diacritics([clean])[0]

        # Split by dot, then wrap long strings without cutting words. Only dots
        # and spaces are lost here; both are ignored characters, so predictions
        # stay aligned with ``plain_text``.
        chunks = []
        for part in plain_text.split('.'):
            part = part.strip()
            if part:
                chunks.extend(textwrap.wrap(part, max_length))

        predicted_labels = []

        if chunks:
            seq_tensor = convert2idx(chunks, self.char_to_index, max_length, self.device)
            loader = DataLoader(TensorDataset(seq_tensor), batch_size=batch_size)
            ignored = self._ignore_tensor()

            # Run model (no autograd graph is needed for prediction)
            self.model.eval()
            with torch.inference_mode():
                for (batch_seq,) in loader:
                    batch_pred = self.model(batch_seq).argmax(dim=2)
                    keep = ~torch.isin(batch_seq, ignored)
                    predicted_labels.extend(batch_pred[keep].tolist())

        pieces = []
        idx = 0

        for ch in plain_text:
            pieces.append(ch)

            code = self.char_to_index.get(ch)
            if code is None or code in self.IGNORE_IDX:
                continue

            label_idx = predicted_labels[idx]
            idx += 1

            # The padding class carries no diacritic; rendering it would insert
            # a control character into the output.
            if label_idx == self.PAD_LABEL:
                continue

            pred_class = self.index_to_label[label_idx]

            if isinstance(pred_class, tuple):
                pieces.append(chr(pred_class[0]) + chr(pred_class[1]))
            elif pred_class != 0:
                pieces.append(chr(pred_class))

        return ''.join(pieces)

    def predict_dataset(self, dataloader, output_file='submission.csv'):
        """Predict a class for every non-ignored input position and write a CSV.

        Args:
            dataloader: Yields either ``(inputs, labels)`` or ``(inputs,)``
                batches; only the inputs are used.
            output_file: Destination CSV with columns ``ID,label``.

        Returns:
            The flat list of predicted class indices, in dataset order.
        """
        predicted_labels = []
        ignored = self._ignore_tensor()

        self.model.eval()
        with torch.inference_mode():
            for batch in dataloader:
                # Loaders built without labels yield a single-element batch.
                batch_seq = batch[0] if isinstance(batch, (list, tuple)) else batch
                batch_seq = batch_seq.to(self.device)
                outputs = self.model(batch_seq)
                batch_pred = outputs.argmax(dim=2)

                mask = ~torch.isin(batch_seq, ignored)

                predicted_labels.extend(batch_pred[mask].tolist())

        with open(output_file, 'w', encoding='utf-8') as f:
            f.write('ID,label\n')
            for i, label in enumerate(predicted_labels):
                f.write(f"{i},{label}\n")

        return predicted_labels

    def evaluate(self, test_loader):
        """Return accuracy over the positions of ``test_loader`` whose label is not padding."""
        self.model.eval()
        correct, total = 0, 0

        with torch.inference_mode():
            for seq, labels in tqdm(test_loader, desc="Evaluating"):
                seq = seq.to(self.device)
                labels = labels.to(self.device)

                outputs = self.model(seq)
                pred = outputs.argmax(dim=2)

                flat_pred = pred.view(-1)
                flat_labels = labels.view(-1)

                # mask out PAD
                mask = (flat_labels != self.PAD_LABEL)

                correct += (flat_pred[mask] == flat_labels[mask]).sum().item()
                total += mask.sum().item()

        return correct / total if total > 0 else 0


In [8]:
import torch
from torch.optim.lr_scheduler import StepLR
from cleaner import TextCleaner
from preprocessor import TextPreprocessor
from dataset_builder import DatasetBuilder
from models import CharBiLSTM
from trainer import Trainer
from predictor import Predictor
from tqdm import tqdm
import gradio as gr

# Device shared by the dataset builder, the model and the predictor below.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Input vocabulary: character -> embedding index.
# Index 0 is not assigned here; the encoders use it for padding, which is why
# the model's vocab_size is len(CHAR_TO_INDEX) + 1.
CHAR_TO_INDEX = {
    "د": 1,
    "؟": 2,
    "آ": 3,
    "إ": 4,
    "ؤ": 5,
    "ط": 6,
    "م": 7,
    "،": 8,
    "ة": 9,
    "ت": 10,
    "ر": 11,
    "ئ": 12,
    "ا": 13,
    "ض": 14,
    "!": 15,
    " ": 16,
    "ك": 17,
    "غ": 18,
    "س": 19,
    "ص": 20,
    "أ": 21,
    "ل": 22,
    "ف": 23,
    "ظ": 24,
    "ج": 25,
    "؛": 26,
    "ن": 27,
    "ع": 28,
    "ب": 29,
    "ث": 30,
    "ه": 31,
    "خ": 32,
    "ى": 33,
    "ء": 34,
    "ز": 35,
    "ق": 36,
    "ي": 37,
    "ش": 38,
    "ح": 39,
    ":": 40,
    "ذ": 41,
    "و": 42,
    ".": 43,
}
# Reverse lookup: embedding index -> character.
INDEX_TO_CHAR = {v: k for k, v in CHAR_TO_INDEX.items()}

# Output classes: diacritic code point (or (shadda, diacritic) pair) -> class index.
# Keys are Unicode code points as returned by ord().
LABELS = {
    1614: 0,  # U+064E fatha
    1611: 1,  # U+064B fathatan
    1615: 2,  # U+064F damma
    1612: 3,  # U+064C dammatan
    1616: 4,  # U+0650 kasra
    1613: 5,  # U+064D kasratan
    1618: 6,  # U+0652 sukun
    1617: 7,  # U+0651 shadda
    (1617, 1614): 8,  # shadda + fatha
    (1617, 1611): 9,  # shadda + fathatan
    (1617, 1615): 10,  # shadda + damma
    (1617, 1612): 11,  # shadda + dammatan
    (1617, 1616): 12,  # shadda + kasra
    (1617, 1613): 13,  # shadda + kasratan
    0: 14,  # no diacritic
    15: 15,  # padding class
}
# Reverse lookup: class index -> code point / pair (0 means "no diacritic").
INDEX_TO_LABEL = {v: k for k, v in LABELS.items()}

MAX_LENGTH = 600  # fixed sequence length used for padding and by the model
TRAIN_BATCH_SIZE = 32
VAL_BATCH_SIZE = 256
READ_PATH = "data/"  # directory of the raw <split>.txt files (trailing slash required by the preprocessor)
WRITE_PATH = "cleaned_outputs/"  # directory receiving the cleaned files (trailing slash required)


def test_last_char_text(
    model,
    data_loader,
    max_len=600,
    batch_size=256,
    char_to_index=CHAR_TO_INDEX,
    index_to_label=INDEX_TO_LABEL,
    labels=LABELS,
    index_to_char=INDEX_TO_CHAR,
):
    """Measure how often the diacritic of each word's last character is predicted correctly.

    A word is a maximal run of input indices that are not padding, space or
    punctuation. For every word, the true and predicted classes at its last
    position are compared.

    Args:
        model: Network returning scores of shape ``(batch, sequence, classes)``.
        data_loader: Yields ``(inputs, labels)`` batches.
        max_len, batch_size, char_to_index, labels, index_to_char: Unused; kept
            so existing callers keep working.
        index_to_label: Mapping from class index to diacritic value.

    Returns:
        The fraction of words whose last-character class matches (0 when there
        are no words). The value is also printed as a percentage.
    """
    # Run where the model's weights live; fall back to the default device for
    # a model without parameters.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        device = first_param.device
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    model.eval()

    # padding, space and punctuation indices: they delimit words
    separators = {0, 2, 8, 15, 16, 26, 40, 43}

    total_last_char = 0
    correct_last_char = 0

    with torch.inference_mode():
        for batch_sequences, batch_labels in tqdm(data_loader, desc="Last Char Test"):
            batch_sequences = batch_sequences.to(device)
            outputs = model(batch_sequences)  # batch_size * seq_length * output_size
            predicted_labels = outputs.argmax(dim=2)

            # Plain Python lists avoid one tensor operation per character below.
            rows = zip(
                batch_sequences.tolist(),
                batch_labels.tolist(),
                predicted_labels.tolist(),
            )

            for seq, true_labels, pred_labels in rows:
                in_word = False
                last_char_true = None
                last_char_pred = None

                for idx, code in enumerate(seq):
                    if code in separators:
                        if in_word:
                            # a word just ended: score its last character
                            correct_last_char += last_char_true == last_char_pred
                            total_last_char += 1
                            in_word = False
                        continue

                    in_word = True
                    last_char_true = index_to_label[true_labels[idx]]
                    last_char_pred = index_to_label[pred_labels[idx]]

                # catch a word that runs to the end of the sequence
                if in_word:
                    correct_last_char += last_char_true == last_char_pred
                    total_last_char += 1

    accuracy = correct_last_char / total_last_char if total_last_char > 0 else 0
    print(f"Last Character Accuracy: {accuracy*100:.3f}%")

    return accuracy


# Script entry point: wires the preprocessing objects together, loads the saved
# model and serves it through a Gradio text interface. The training, dataset
# prediction and evaluation paths are kept below as commented-out alternatives.
if __name__ == "__main__":
    # Initialize preprocessing classes
    cleaner = TextCleaner()
    preprocessor = TextPreprocessor(
        cleaner, input_path=READ_PATH, output_path=WRITE_PATH
    )
    dataset_builder = DatasetBuilder(
        preprocessor,
        char_to_index=CHAR_TO_INDEX,
        label_map=LABELS,
        max_length=MAX_LENGTH,
        device=DEVICE,
    )

    # Prepare dataloaders
    # train_loader = dataset_builder.create_dataloader(
    #     data_type="train", batch_size=TRAIN_BATCH_SIZE
    # )
    # val_loader = dataset_builder.create_dataloader(
    #     data_type="val", batch_size=VAL_BATCH_SIZE
    # )
    # test_loader = dataset_builder.create_dataloader(
    #     data_type="test", batch_size=VAL_BATCH_SIZE, with_labels=False
    # )

    # Initialize model
    # +1 because index 0 (padding) is not part of CHAR_TO_INDEX
    vocab_size = len(CHAR_TO_INDEX) + 1
    embedding_dim = 300
    hidden_dim = 256
    output_dim = len(LABELS)
    dropout_rate = 0.2
    num_layers = 5

    # NOTE: this freshly initialised model (and its optimizer/scheduler) is only
    # used by the commented-out training path; `model` is rebound to the loaded
    # weights further down.
    model = CharBiLSTM(
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        dropout_rate,
        num_layers,
        max_length=MAX_LENGTH,
    ).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = StepLR(optimizer, step_size=5, gamma=0.1)

    # Train the model
    # criterion = torch.nn.CrossEntropyLoss()
    # trainer = Trainer(
    #     model=model,
    #     optimizer=optimizer,
    #     scheduler=scheduler,
    #     criterion=criterion,
    #     train_loader=train_loader,
    #     val_loader=val_loader,
    #     device=DEVICE,
    #     checkpoint_file="checkpoint.pth",
    # )

    # trainer.train(num_epochs=20)

    # Load best model for prediction (replaces the model built above)
    model, meta = CharBiLSTM.load_model_pth()
    predictor = Predictor(model, CHAR_TO_INDEX, INDEX_TO_LABEL, device=DEVICE)

    def diacritize_text(text):
        """Gradio callback: return the diacritized text, '' for blank input, or an error message."""
        if not text or not text.strip():
            return ""

        try:
            # MAX_LENGTH is passed explicitly; predict_sentence's own default is 200.
            predicted_sentence = predictor.predict_sentence(text, max_length=MAX_LENGTH)
            return predicted_sentence
        except Exception as e:
            # Shown in the output box instead of crashing the web UI.
            return f"Error during prediction: {str(e)}"

    iface = gr.Interface(
        fn=diacritize_text,
        inputs=gr.Textbox(
            lines=2,
            placeholder="Enter Arabic text here...",
            label="Input Sentence",
            rtl=True,
        ),
        outputs=gr.Textbox(label="Diacritized Sentence", rtl=True),
        title="Arabic Diacritization",
        description="Enter an Arabic sentence to predict its diacritics.",
    )
    iface.launch()
    # Predict on test dataset
    # predictor.predict_dataset(test_loader)

    # Predict a single sentence
    # test_sentence = ''
    # predicted_sentence = predictor.predict_sentence(test_sentence, max_length=MAX_LENGTH, batch_size=VAL_BATCH_SIZE)
    # print("Original sentence:", test_sentence)
    # print("Predicted sentence:", predicted_sentence)

    # Evaluate on validation set
    # predictor.evaluate(val_loader)


In [12]:
# CharBiLSTM defines load_model_pkl / load_model_pth but no load_model; the
# metadata shown below names best_model.pkl, so the pickle loader is the one meant.
model, meta = CharBiLSTM.load_model_pkl()

In [13]:
# Display the metadata dictionary returned together with the model.
meta

{'model_type': 'CharLSTM',
 'best_f1': 0.934231058215857,
 'embedding_size': 300,
 'hidden_size': 256,
 'num_layers': 5,
 'dropout_rate': 0.2,
 'learning_rate': 0.001,
 'num_epochs': 19,
 'max_sequence_length': 600,
 'batch_size': 32,
 'vocab_size': 44,
 'output_classes': 16,
 'model_file': 'best_model.pkl'}

In [14]:
# Wrap the loaded model with the shared character and label tables for inference.
predictor = Predictor(model, CHAR_TO_INDEX, INDEX_TO_LABEL, device=DEVICE)

In [13]:
# Build the training DataLoader; create_dataloader re-runs preprocessing of the 'train' split first.
train_loader = dataset_builder.create_dataloader(data_type='train', batch_size=TRAIN_BATCH_SIZE)

In [14]:
# Accuracy over the non-padding label positions of the training split.
predictor.evaluate(train_loader)

Evaluating: 100%|██████████| 1678/1678 [03:31<00:00,  7.93it/s]

8029990
Test Accuracy: 96.151%





0.9615052569138062

In [17]:
# Build the validation DataLoader; create_dataloader re-runs preprocessing of the 'val' split first.
val_loader = dataset_builder.create_dataloader(data_type='val', batch_size=VAL_BATCH_SIZE)

In [18]:
# Accuracy over the non-padding label positions of the validation split.
predictor.evaluate(val_loader)

Evaluating: 100%|██████████| 11/11 [00:08<00:00,  1.36it/s]


0.9691978862307833

In [31]:
# with_labels=False: the dataset holds only the input tensor, so each batch has a single element.
test_loader = dataset_builder.create_dataloader(data_type='test', batch_size=VAL_BATCH_SIZE, with_labels=False)

In [32]:
# Writes the predictions to submission.csv; the trailing ';' suppresses the echo of the returned list.
predictor.predict_dataset(test_loader);

In [33]:
import pandas as pd
# Read the submission file back for inspection.
# NOTE(review): presumably the file written by predict_dataset — assumes the
# working directory is /kaggle/working; confirm.
test = pd.read_csv('/kaggle/working/submission.csv')
test

Unnamed: 0,ID,label
0,0,4
1,1,14
2,2,14
3,3,6
4,4,4
...,...,...
237235,237235,6
237236,237236,4
237237,237237,0
237238,237238,6


In [35]:
# Load the undiacritized test set from the contest input directory.
# NOTE: this rebinds `test`, which an earlier cell used for the submission frame.
# The recorded output shows one row per letter, with a boolean `case_ending` column.
import pandas as pd
test = pd.read_csv('/kaggle/input/contest-data/cufe-cmp-mainstream-nlp-fall-2025/test_no_diacritics.csv')
test

Unnamed: 0,id,line_number,letter,case_ending
0,0,0,ف,False
1,1,0,ي,True
2,2,0,ا,False
3,3,0,ل,False
4,4,0,م,False
...,...,...,...,...
237235,237235,2468,ب,False
237236,237236,2468,ن,True
237237,237237,2468,ش,False
237238,237238,2468,ي,False


In [36]:
# Read submission.csv relative to the current working directory into `gold`.
# NOTE(review): presumably the same file as /kaggle/working/submission.csv — confirm.
gold = pd.read_csv('submission.csv')
gold

Unnamed: 0,ID,label
0,0,4
1,1,14
2,2,14
3,3,6
4,4,4
...,...,...
237235,237235,6
237236,237236,4
237237,237237,0
237238,237238,6


In [37]:
# Boolean-mask selection: keep the rows of `gold` whose row in `test` has
# case_ending == True. This relies on both frames sharing the same row index.
gold[test.case_ending]

Unnamed: 0,ID,label
1,1,14
7,7,4
12,12,12
18,18,4
20,20,6
...,...,...
237224,237224,14
237228,237228,2
237231,237231,14
237236,237236,4


In [38]:
# Write the case-ending rows of `gold` to saved_data.csv (UTF-8, without the index column).
gold[test.case_ending].to_csv("saved_data.csv", index=False, encoding="utf-8")

In [2]:
# NOTE: this whole cell is commented out, so it is a no-op on re-run. The output
# recorded below it comes from an earlier execution in which the code was active.
# When enabled, it downloads the model and metadata files from the Hugging Face
# Hub and copies them into /kaggle/working.
# from huggingface_hub import hf_hub_download
# from huggingface_hub import login
# repo_id = "OmarHashem80/LSTM98"
# model_file = "best_model.pkl"
# meta_file = "best_model_meta.json"

# # download from HF Hub
# local_model_path = hf_hub_download(repo_id=repo_id, filename=model_file)
# local_meta_path = hf_hub_download(repo_id=repo_id, filename=meta_file)

# print("Downloaded model:", local_model_path)
# import shutil

# destination_path = '/kaggle/working/best_model.pkl'

# shutil.copy(local_model_path, destination_path)
# print(f'Model saved to {destination_path}')

# destination_path = '/kaggle/working/best_model_meta.json'

# shutil.copy(local_meta_path, destination_path)
# print(f'Model saved to {destination_path}')

best_model_meta.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

Downloaded model: /root/.cache/huggingface/hub/models--OmarHashem80--LSTM98/snapshots/064a2a433e8f9d0d2a72b0f2185af280380d6fe8/best_model.pkl
Model saved to /kaggle/working/best_model.pkl
Model saved to /kaggle/working/best_model_meta.json


In [15]:
# Sequence length cap; same value as 'max_sequence_length' in the model
# metadata displayed earlier in the notebook.
max_len = 600

In [10]:
# Load the model from explicitly named model/metadata files; the returned
# metadata is discarded here.
model_file = "best_model.pkl"
meta_file = "best_model_meta.json"
model, _ = CharBiLSTM.load_model(model_file, meta_file)

In [21]:
def _count_last_char_hits(char_ids, true_ids, pred_ids, separators, index_to_char, index_to_label):
    """Score one sequence: return ``(words_seen, words_with_matching_final_label)``.

    A word is a maximal run of character indices that are not in
    ``separators``. Each word is scored once, by comparing the gold and the
    predicted label (both mapped through ``index_to_label``) at the word's
    last character.

    Args:
        char_ids: Character indices of one sequence, as plain Python numbers.
        true_ids: Gold label indices, aligned position-by-position with ``char_ids``.
        pred_ids: Predicted label indices, aligned with ``char_ids``.
        separators: Container of character indices that terminate a word.
        index_to_char: Mapping from character index to character.
        index_to_label: Mapping from label index to label.
    """
    words = 0
    hits = 0
    in_word = False
    gold_label = None
    guess_label = None

    for pos, ch in enumerate(char_ids):
        if ch in separators:
            if in_word:
                words += 1
                if gold_label == guess_label:
                    hits += 1
                in_word = False
            continue

        # A word only "opens" once a character with a non-empty text mapping is
        # seen, mirroring the truthiness check on the accumulated word string.
        in_word = in_word or bool(index_to_char[ch])
        # Every non-separator character overwrites both labels, so when a word
        # is closed they always belong to its final character.
        gold_label = index_to_label[true_ids[pos]]
        guess_label = index_to_label[pred_ids[pos]]

    # The sequence may end without a trailing separator: score the open word.
    if in_word:
        words += 1
        if gold_label == guess_label:
            hits += 1

    return words, hits


def test_last_char_text(model, data_loader, max_len=600, batch_size=256,
                        char_to_index=CHAR_TO_INDEX, index_to_label=INDEX_TO_LABEL, labels=LABELS, index_to_char=INDEX_TO_CHAR,
                        separator_indices=(0, 2, 8, 15, 16, 26, 40, 43)):
    """Measure how often the label predicted for the LAST character of each word is correct.

    Each batch from ``data_loader`` is run through ``model``; every sequence is
    split into words at the character indices listed in ``separator_indices``,
    and for each word the predicted label of its final character is compared
    with the gold label.

    Args:
        model: Model called as ``model(batch_sequences)``; its output is
            arg-maxed over dimension 2 to obtain per-character label indices.
        data_loader: Iterable yielding ``(batch_sequences, batch_labels)`` pairs.
            Both are converted with ``.tolist()``, so they are expected to be
            tensors.
        max_len: Unused; kept so existing callers keep working.
        batch_size: Unused; kept so existing callers keep working.
        char_to_index: Unused; kept so existing callers keep working.
        index_to_label: Mapping from label index to label.
        labels: Unused; kept so existing callers keep working.
        index_to_char: Mapping from character index to character.
        separator_indices: Character indices treated as word boundaries
            (padding / unwanted characters). The default reproduces the
            previously hard-coded list.

    Returns:
        Fraction of words whose last-character label was predicted correctly,
        or 0 when no word was found.
    """
    # The device is only reported; inputs are passed to the model exactly as
    # the loader yields them.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)

    model.eval()

    separators = frozenset(separator_indices)
    total_last_char = 0
    correct_last_char = 0

    with torch.inference_mode():
        for batch_sequences, batch_labels in tqdm(data_loader, desc="Last Char Test"):
            outputs = model(batch_sequences)  # batch_size * seq_length * output_size
            predicted_labels = outputs.argmax(dim=2)

            # Convert each tensor to nested Python lists ONCE per batch. Reading
            # tensor elements one by one forces a device-to-host transfer per
            # character, which dominated the runtime of the per-element version.
            char_rows = batch_sequences.tolist()
            true_rows = batch_labels.tolist()
            pred_rows = predicted_labels.tolist()

            for char_ids, true_ids, pred_ids in zip(char_rows, true_rows, pred_rows):
                words, hits = _count_last_char_hits(
                    char_ids, true_ids, pred_ids, separators, index_to_char, index_to_label
                )
                total_last_char += words
                correct_last_char += hits

    accuracy = correct_last_char / total_last_char if total_last_char > 0 else 0
    print(f"Last Character Accuracy: {accuracy*100:.3f}%")

    return accuracy


In [17]:
# Data loader over the 'val' split, batched with VAL_BATCH_SIZE, for the last-character test below.
val_loader = dataset_builder.create_dataloader(data_type='val', batch_size=VAL_BATCH_SIZE)

In [16]:
# Last-character accuracy on the validation split; the returned value is shown as the cell output.
test_last_char_text(model, val_loader)

cuda


Last Char Test: 100%|██████████| 11/11 [02:11<00:00, 11.96s/it]

Last Character Accuracy: 96.553%





0.9655344021702007

In [24]:
# Rebuild the 'train' loader, this time batched with VAL_BATCH_SIZE rather than TRAIN_BATCH_SIZE.
train_loader = dataset_builder.create_dataloader(data_type='train', batch_size=VAL_BATCH_SIZE)

In [None]:
# Last-character accuracy on the training split. The recorded progress bar
# stops at 10%, so no result from this run is captured.
test_last_char_text(model, data_loader=train_loader)

cuda


Last Char Test:  10%|▉         | 20/210 [04:11<39:42, 12.54s/it]