## FineTashkeel
This notebook lets you upload a text file containing lines of undiacritized Arabic text and diacritize them using [FineTashkeel](https://huggingface.co/mush42/fine-tashkeel/).


## Install/upgrade packages

In [None]:
# Bring the packaging toolchain itself up to date before installing anything.
!pip install --upgrade pip setuptools wheel
# Third-party packages imported by the diacritization cell further down.
!pip install transformers tqdm

## Plumbing

In [None]:
import enum
import re


# Identifier passed to `from_pretrained` when the model is loaded.
MODEL_ID = "mush42/fine-tashkeel"
# Name of the file the diacritized lines are written to and later downloaded.
OUTPUT_FILENAME = "diac.txt"
# Matches any run of one or more whitespace characters.
WHITESPACE_RE = re.compile(r"\s+")


class ArabicDiacritics(enum.Enum):
    """Enumeration of the diacritic marks this notebook works with.

    Each member's value is the diacritic string itself; ``NO_DIACRITIC`` is
    the empty string and the ``SHADDA_PLUS_*`` members hold a shadda combined
    with a second mark.
    """

    NO_DIACRITIC = ""
    SUKOON = "ْ"
    SHADDA = "ّ"
    DAMMA = "ُ"
    FATHA = "َ"
    KASRA = "ِ"
    TANWEEN_DAMMA = "ٌ"
    TANWEEN_FATHA = "ً"
    TANWEEN_KASRA = "ٍ"
    SHADDA_PLUS_DAMMA = "ُّ"
    SHADDA_PLUS_FATHA = "َّ"
    SHADDA_PLUS_KASRA = "ِّ"
    SHADDA_PLUS_TANWEEN_DAMMA = "ٌّ"
    SHADDA_PLUS_TANWEEN_FATHA = "ًّ"
    SHADDA_PLUS_TANWEEN_KASRA = "ٍّ"

    @classmethod
    def chars(cls):
        """Return the set of standalone marks.

        That is every member except ``NO_DIACRITIC`` and the
        ``SHADDA_PLUS_*`` combinations.
        """
        standalone_names = (
            "SUKOON",
            "SHADDA",
            "DAMMA",
            "FATHA",
            "KASRA",
            "TANWEEN_DAMMA",
            "TANWEEN_FATHA",
            "TANWEEN_KASRA",
        )
        return {cls[name] for name in standalone_names}

    @classmethod
    def valid(cls):
        """Return the set of every member except a bare ``SHADDA``."""
        return {member for member in cls if member is not cls.SHADDA}

    @classmethod
    def diacritic_to_label(cls):
        """Return a dict mapping each member's value to its member name."""
        labels = {}
        for label, member in cls.__members__.items():
            labels[member.value] = label
        return labels


# A single space (U+0020) separates words.
WORD_SEPARATOR = " "

# Code points U+0621..U+063A and U+0641..U+064A.
ARABIC_LETTERS = frozenset(
    chr(code_point)
    for code_point in (*range(0x0621, 0x063B), *range(0x0641, 0x064B))
)

# Punctuation marks that are kept as valid characters.
PUNCTUATIONS = frozenset(
    (
        ".",
        "،",
        ":",
        "؛",
        "-",
        "؟",
        "!",
        "(",
        ")",
        "[",
        "]",
        '"',
        "«",
        "»",
        "/",
    )
)

# String values of the standalone diacritic members.
DIACRITIC_CHARS = set(member.value for member in ArabicDiacritics.chars())
# String values of every member returned by `ArabicDiacritics.valid()`.
ALL_VALID_DIACRITICS = set(member.value for member in ArabicDiacritics.valid())
# Diacritic string -> enum member name.
DIACRITIC_LABELS = ArabicDiacritics.diacritic_to_label()

# Code points U+0621..U+0627 plus U+0648, U+0649 and U+064A.
ARABIC_VOWELS = set(
    map(
        chr,
        (0x621, 0x622, 0x623, 0x624, 0x625, 0x626, 0x627, 0x648, 0x649, 0x64A),
    )
)

SENTENCE_DELIMITERS = {
    ".",
    "؟",
    "!",
    "،",
    ":",
    "؛",
    "(",
    ")",
    "[",
    "]",
    '"',
    "«",
    "»",
}
# Sentence delimiters plus the word separator.
WORD_DELIMITERS = SENTENCE_DELIMITERS | {WORD_SEPARATOR}

# Digits "0".."9" followed by the ten digits at U+0660..U+0669.
_ARABIC_NUMERALS = list("0123456789")
_INDIAN_NUMERALS = list(map(chr, range(0x660, 0x66A)))
NUMERAL_CHARS = [*_ARABIC_NUMERALS, *_INDIAN_NUMERALS]

# Every character the cleaners accept.
VALID_ARABIC_CHARS = {WORD_SEPARATOR}.union(
    ARABIC_LETTERS,
    PUNCTUATIONS,
    DIACRITIC_CHARS,
    NUMERAL_CHARS,
)


def collapse_whitespace(text):
    """Return *text* with every run of whitespace replaced by one space."""
    return WHITESPACE_RE.sub(" ", text)


def basic_cleaner(text):
    """Collapse whitespace runs in *text* and trim both ends."""
    return collapse_whitespace(text).strip()


def valid_arabic_cleaner(text):
    """Clean *text* using ``VALID_ARABIC_CHARS`` as the allowed vocabulary."""
    cleaned = valid_vocab_char_cleaner(text, VALID_ARABIC_CHARS)
    return cleaned


def valid_vocab_char_cleaner(text, vocab_chars):
    """Drop every character of *text* that is not in *vocab_chars*.

    The surviving characters are joined back together, whitespace runs are
    collapsed to a single space, and the result is stripped at both ends.
    """
    kept = "".join(ch for ch in text if ch in vocab_chars)
    return collapse_whitespace(kept).strip()


def diacritics_cleaner(text: str) -> str:
    """Return *text* with every character found in ``DIACRITIC_CHARS`` removed."""
    # Mapping a code point to None makes `str.translate` delete it.
    removal_table = dict.fromkeys(map(ord, "".join(DIACRITIC_CHARS)))
    return text.translate(removal_table)

# Punctuation marks that must not be preceded by spaces.
_NO_LEADING_SPACE_PUNCS = (".", "؛", "،", "؟", "!", ",", ";", "?")
# One or more spaces that are immediately followed by one of those marks.
_SPACES_BEFORE_PUNC_RE = re.compile(
    " +(?=[" + re.escape("".join(_NO_LEADING_SPACE_PUNCS)) + "])"
)


def fix_puncs(text):
    """Remove the spaces that directly precede a punctuation mark.

    Any run of space characters sitting immediately before one of
    ``. ؛ ، ؟ ! , ; ?`` is deleted; everything else is left untouched.

    The earlier implementation replaced the one-space form first, which made
    its two- and three-space replacements unreachable: ``"a  ."`` came out as
    ``"a ."``. A single regex pass removes the whole run regardless of length.

    Args:
        text: The string to normalise.

    Returns:
        A copy of *text* with no spaces in front of the listed marks.
    """
    return _SPACES_BEFORE_PUNC_RE.sub("", text)


## Upload "lines.txt" Text File

In [None]:
from google.colab import files

# `files.upload()` yields a mapping of uploaded file name -> file content (bytes).
uploaded = files.upload()
if "lines.txt" in uploaded:
    source_bytes = uploaded["lines.txt"]
elif len(uploaded) == 1:
    # Tolerate a single file that arrived under another name instead of
    # failing with a bare KeyError.
    # NOTE(review): Colab appears to rename a re-uploaded file to something
    # like "lines (1).txt" - confirm against the Colab docs.
    (source_bytes,) = uploaded.values()
else:
    raise ValueError(
        'Expected an uploaded file named "lines.txt"; '
        f"got: {sorted(uploaded)}"
    )
source_text = source_bytes.decode("utf-8")
# Keep only non-blank lines, with surrounding whitespace removed.
source_lines = [
    stripped for stripped in map(str.strip, source_text.splitlines()) if stripped
]
print(f"Found {len(source_lines)} lines in the uploaded file")


## Start diacritization process

In [None]:
from transformers import T5ForConditionalGeneration, ByT5Tokenizer
from tqdm import tqdm


# Characters allowed in the model input: the valid set minus the diacritic marks.
CLEAN_ARABIC_CHARS = frozenset(VALID_ARABIC_CHARS).difference(DIACRITIC_CHARS)

# Tokenizer built with no extra sentinel ids.
tok = ByT5Tokenizer(extra_ids=0)
# Model weights loaded from MODEL_ID.
model = T5ForConditionalGeneration.from_pretrained(MODEL_ID)


def diac(sent):
    """Run the loaded model on one line of text and return the decoded output.

    The line first goes through ``fix_puncs`` and is then reduced to the
    characters in ``CLEAN_ARABIC_CHARS`` before being tokenized.

    Returns the list of strings produced by ``tok.batch_decode``.
    """
    cleaned = valid_vocab_char_cleaner(fix_puncs(sent), CLEAN_ARABIC_CHARS)
    encoded = tok(cleaned, return_tensors="pt")
    # The generation budget is five new tokens per character of the raw
    # (uncleaned) input line.
    generated = model.generate(encoded.input_ids, max_new_tokens=5 * len(sent))
    return tok.batch_decode(generated, skip_special_tokens=True)


# The file is opened in append mode, so re-running this cell adds to any
# output already present instead of replacing it.
with open(OUTPUT_FILENAME, "a", encoding="utf-8") as outfile:
    progress = tqdm(source_lines, total=len(source_lines), desc="diacritizing line")
    for line in progress:
        # One newline-terminated record per source line.
        outfile.write("\n".join(diac(line)) + "\n")


print("Process done!")

## Download diacritized file

In [None]:
from google.colab import files

# Hand the generated output file to Colab's `files.download`.
files.download(OUTPUT_FILENAME)