In [11]:
# Roman-side vocabulary: character -> integer id.
# Ids 0-3 are the special symbols (unknown, pad, start, end); ids 4-29 are
# the lowercase letters a-z in alphabetical order.
roman_stoi = {
    symbol: index
    for index, symbol in enumerate("?_<>" + "abcdefghijklmnopqrstuvwxyz")
}

# Devanagari-side vocabulary: character -> integer id (65 entries, ids 0-64).
# Ids 0 and 1 are the unknown ('?') and padding ('_') symbols; the remaining
# ids cover signs, independent vowels, consonants, the nukta/avagraha,
# dependent vowel signs, the virama and the abbreviation sign.
# Devanagari digits and the danda are NOT part of this vocabulary.
devnagari_stoi = {'?': 0,
 '_': 1,
 'ँ': 2,
 'ं': 3,
 'ः': 4,
 'अ': 5,
 'आ': 6,
 'इ': 7,
 'ई': 8,
 'उ': 9,
 'ऊ': 10,
 'ऋ': 11,
 'ए': 12,
 'ऐ': 13,
 'ऑ': 14,
 'ओ': 15,
 'औ': 16,
 'क': 17,
 'ख': 18,
 'ग': 19,
 'घ': 20,
 'ङ': 21,
 'च': 22,
 'छ': 23,
 'ज': 24,
 'झ': 25,
 'ञ': 26,
 'ट': 27,
 'ठ': 28,
 'ड': 29,
 'ढ': 30,
 'ण': 31,
 'त': 32,
 'थ': 33,
 'द': 34,
 'ध': 35,
 'न': 36,
 'प': 37,
 'फ': 38,
 'ब': 39,
 'भ': 40,
 'म': 41,
 'य': 42,
 'र': 43,
 'ल': 44,
 'व': 45,
 'श': 46,
 'ष': 47,
 'स': 48,
 'ह': 49,
 '़': 50,
 'ऽ': 51,
 'ा': 52,
 'ि': 53,
 'ी': 54,
 'ु': 55,
 'ू': 56,
 'ृ': 57,
 'े': 58,
 'ै': 59,
 'ॉ': 60,
 'ो': 61,
 'ौ': 62,
 '्': 63,
 '॰': 64}

# Special symbols shared by both vocabularies.
unk_token, pad_token, start_token, end_token = "?", "_", "<", ">"
special_tokens = [unk_token, pad_token, start_token, end_token]

# Reverse lookups: integer id -> character.
roman_itos = dict(zip(roman_stoi.values(), roman_stoi.keys()))
devnagari_itos = dict(zip(devnagari_stoi.values(), devnagari_stoi.keys()))

In [12]:
import re
import numpy as np

# Devanagari digits -> ASCII digits, plus the danda (sentence terminator)
# mapped to a full stop. Pairs are matched position by position.
nepali_to_english_numbers = dict(zip("०१२३४५६७८९।", "0123456789."))

def clean_and_map_nepali_text(text):
    """Strip punctuation and quote characters from Devanagari text.

    Only removal happens here: digits, letters and whitespace are returned
    untouched (number mapping is handled elsewhere).

    Args:
        text (str): Input Nepali text.

    Returns:
        str: The text with every punctuation character deleted.
    """
    # Character class of everything to drop: ASCII punctuation, the danda,
    # curly quotes, parentheses, the em dash and the hyphen.
    punctuation = re.compile(r"[,!?\"'।।‘’“”():;—-]")
    return punctuation.sub("", text)

def one_hot_encode_tokens(tokens: list):
    """One-hot encode a single word's token ids into a fixed-size array.

    Row ``i`` of the result describes position ``i`` of the word: the column
    equal to the token id is set to 1.0. Every position after the last token
    is marked with the padding id instead.

    Args:
        tokens: Token ids of one word, each an index into ``devnagari_stoi``.
            At most ``MAX_SEQUENCE_LENGTH`` ids are accepted.

    Returns:
        numpy.ndarray: Float array of shape
        ``(1, MAX_SEQUENCE_LENGTH, NUM_ENCODER_TOKENS)``.

    Raises:
        IndexError: If the word has more than ``MAX_SEQUENCE_LENGTH`` tokens
            or a token id falls outside the vocabulary.
    """
    # Positions per word (rows) and vocabulary size (columns); 65 matches the
    # number of entries in devnagari_stoi.
    # NOTE(review): both are presumably fixed by the trained encoder's input
    # shape -- confirm before changing.
    MAX_SEQUENCE_LENGTH = 19
    NUM_ENCODER_TOKENS = 65

    token_count = len(tokens)
    if token_count > MAX_SEQUENCE_LENGTH:
        # Fail up front with a readable message; the exception type matches
        # what indexing past the last row would have raised.
        raise IndexError(
            f"word has {token_count} tokens; at most "
            f"{MAX_SEQUENCE_LENGTH} are supported"
        )

    ohe = np.zeros((MAX_SEQUENCE_LENGTH, NUM_ENCODER_TOKENS))
    for position, token_id in enumerate(tokens):
        ohe[position, token_id] = 1.0

    # Every remaining position is explicit padding.
    ohe[token_count:, devnagari_stoi[pad_token]] = 1.0

    # Add the leading batch axis.
    return ohe.reshape(1, MAX_SEQUENCE_LENGTH, NUM_ENCODER_TOKENS)

if __name__ == '__main__':
    # Demo 1: run the punctuation cleaner over a sample news paragraph.
    # The cleaned text is computed but not printed.
    nepali_text = "०७८ साउन १९ मा एकैदिन चार करोड ९१ लाख ९४ हजार रुपैयाँ सारिएको प्रतिवेदनमा उल्लेख छ । गोर्खा मिडियाका उपाध्यक्ष छविलाल जोशीको घरबाट बरामद भएको हार्ड डिस्कमा सानो पाइला सहकारीबाट पनि रकम सिधै गोर्खा मिडियामा सारिएको फेला परेको हो । प्रहरीका अनुसार सानो पाइला सहकारीका अध्यक्ष अनन्तबाबु राई, सचिव देवेन्द्रबाबु राई, कोषाध्यक्ष कुमार रम्तेल, प्रबन्धक असरफ अली सिद्धिकी, संरक्षक गीतेन्द्रबाबु (जीबी) राईलगायत ११ जना हालसम्म फरार छन् । सहकारीकी पूर्वउपाध्यक्ष नेहा पौडेल गत साउन २१ मा जिल्ला अदालत पर्साबाट न्यायीक परीक्षण हुँदा ठहरेबमोजिम हुने गरी हाललाई एक करोड १० लाख धरौटीमा रिहा भइन् । यसैगरी सहकारीका कर्मचारी राधेचन्द्र यादव पनि ३५ लाख रुपैयाँ धरौटीमा रिहा भएका छन् ।"
    cleaned_text = clean_and_map_nepali_text(nepali_text)

    # Demo 2: one-hot encode a two-token word (ids 0 and 2) and print the
    # resulting array; all rows after the first two carry the padding id.
    tokens = [0,2]
    ohe = one_hot_encode_tokens(tokens)
    print(ohe)

[[[1. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]]]


In [13]:


class Tokenizer:
    """Character-level tokenizer for Roman and Devanagari words.

    Each word is turned into a list of integer ids using ``roman_stoi`` or
    ``devnagari_stoi``; characters missing from the vocabulary become the
    unknown id. The ``detokenize_*`` methods perform the reverse lookup.

    Attributes:
        pad_token, start_token, end_token, unk_token: Integer ids of the
            special symbols, looked up in ``roman_stoi``.
    """

    def __init__(self):
        self.pad_token = roman_stoi[special_tokens[1]]
        self.start_token = roman_stoi[special_tokens[2]]
        self.end_token = roman_stoi[special_tokens[3]]
        # Also used as the fallback id on the Devanagari side: both
        # vocabularies place '?' at id 0.
        self.unk_token = roman_stoi[special_tokens[0]]

    @staticmethod
    def _encode(words, stoi, unk_id):
        """Map every character of every word to its id (``unk_id`` if absent)."""
        return [[stoi.get(character, unk_id) for character in word]
                for word in words]

    @staticmethod
    def _decode(token_lists, itos):
        """Join the characters for each list of ids back into a string."""
        return ["".join(itos[token_id] for token_id in token_ids)
                for token_ids in token_lists]

    def tokenize_devnagari(self, text: list[str]) -> list[list[int]]:
        """Tokenize Devanagari words into lists of ids.

        args:
            text: list[str]: The words to tokenize, one string per word.

        returns:
            list: One list of ids per input word; characters outside
            ``devnagari_stoi`` are encoded as the unknown id.
        """
        return self._encode(text, devnagari_stoi, self.unk_token)

    def detokenize_devnagari(self, tokens: list[list[int]]) -> list[str]:
        """Convert lists of Devanagari ids back into words.

        args:
            tokens: list: One list of ids per word.

        returns:
            list: The reconstructed words, in the same order.

        raises:
            KeyError: If an id is not present in ``devnagari_itos``.
        """
        return self._decode(tokens, devnagari_itos)

    def tokenize_roman(self, text: list[str]) -> list[list[int]]:
        """Tokenize Roman words into lists of ids.

        args:
            text: list[str]: The words to tokenize, one string per word.

        returns:
            list: One list of ids per input word; characters outside
            ``roman_stoi`` are encoded as the unknown id.
        """
        return self._encode(text, roman_stoi, self.unk_token)

    def detokenize_roman(self, tokens: list[list[int]]) -> list[str]:
        """Convert lists of Roman ids back into words.

        args:
            tokens: list: One list of ids per word.

        returns:
            list: The reconstructed words, in the same order.

        raises:
            KeyError: If an id is not present in ``roman_itos``.
        """
        return self._decode(tokens, roman_itos)

if __name__ == '__main__':
    # Round-trip demo on a mixed-script sample. The module-level
    # `tokenizer` created here is also the instance `transliterate` uses.
    tokenizer = Tokenizer()
    text = "सहकारी hello र pसम्पत033 k3j43"

    # Encode the sample with both vocabularies, then decode each again.
    devnagari_tokens = tokenizer.tokenize_devnagari(text.split())
    devnagari_text = tokenizer.detokenize_devnagari(devnagari_tokens)
    roman_tokens = tokenizer.tokenize_roman(text.split())
    roman_text = tokenizer.detokenize_roman(roman_tokens)

    print("_____ORIGINAL_____")
    print(f"Text: {text}\n")

    print("_____DEVANAGARI_TOKENS_____")
    print(f"{devnagari_tokens}\n")

    print("_____ROMAN_TOKENS_____")
    print(roman_tokens)

    # Characters unknown to a vocabulary come back as '?'.
    unk_ = devnagari_text
    print("\n_____DETOKENIZED_DEVANAGARI_TOKENS_____")
    print(f"{unk_}")

    unk_ = roman_text
    print("_____DETOKENIZED_ROMAN_TOKENS_____")
    print(f"\n{unk_}")

_____ORIGINAL_____
Text: सहकारी hello र pसम्पत033 k3j43

_____DEVANAGARI_TOKENS_____
[[48, 49, 17, 52, 43, 54], [0, 0, 0, 0, 0], [43], [0, 48, 41, 63, 37, 32, 0, 0, 0], [0, 0, 0, 0, 0]]

_____ROMAN_TOKENS_____
[[0, 0, 0, 0, 0, 0], [11, 8, 15, 15, 18], [0], [19, 0, 0, 0, 0, 0, 0, 0, 0], [14, 0, 13, 0, 0]]

_____DETOKENIZED_DEVANAGARI_TOKENS_____
['सहकारी', '?????', 'र', '?सम्पत???', '?????']
_____DETOKENIZED_ROMAN_TOKENS_____

['??????', 'hello', '?', 'p????????', 'k?j??']


In [14]:
def map_numbers(text):
    """Return `text` with Nepali digits (and the danda) replaced.

    Every character that is a key of ``nepali_to_english_numbers`` is swapped
    for its mapped value; all other characters are left as they are.
    """
    # One translation pass covers all single-character substitutions.
    digit_table = str.maketrans(nepali_to_english_numbers)
    return text.translate(digit_table)

def clean(text):
    """Strip punctuation from `text` and split it into words.

    Args:
        text (str): Raw Nepali text (typically a single word).

    Returns:
        list[str]: The non-empty words left after cleaning. The list is empty
        when nothing but punctuation/whitespace was given, so callers can use
        a plain truthiness check to skip such input.
    """
    cleaned_text = clean_and_map_nepali_text(text)
    # split() without an argument drops empty strings; split(" ") would
    # return [''] for a punctuation-only word, which is truthy and would
    # let an empty word slip through to the model.
    return cleaned_text.split()

In [15]:
import keras
import numpy as np
import tensorflow as tf
from functools import lru_cache

class Model:
    """Process-wide holder for the loaded Keras model and its inference halves.

    Constructing ``Model`` always yields the same object. The model file is
    loaded on the first construction only; the ``model_path`` given to later
    constructions is not used once a model is loaded.
    """

    _instance = None  # the shared instance, created lazily in __new__

    def __new__(cls, model_path):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, model_path):
        # __init__ runs on every construction; skip if already loaded.
        if hasattr(self, 'model'):
            return
        self.model = keras.models.load_model(model_path)
        self.LATENT_DIM = 256
        # Build both inference sub-models once, right after loading.
        self._encoder_model = self._create_encoder()
        self._decoder_model = self._create_decoder()

    def _create_encoder(self):
        """Build the model mapping the first input to layer 2's two states."""
        # Layer 2 yields three outputs; only the last two (the states) are kept.
        # NOTE(review): presumably the encoder LSTM with return_state=True --
        # confirm against the training script.
        _, enc_state_h, enc_state_c = self.model.layers[2].output
        return keras.Model(self.model.input[0], [enc_state_h, enc_state_c])

    def _create_decoder(self):
        """Build the single-step decoder fed by explicit state inputs."""
        state_inputs = [
            keras.Input(shape=(self.LATENT_DIM,), name="decoder_state_1"),
            keras.Input(shape=(self.LATENT_DIM,), name="decoder_state_2"),
        ]
        token_inputs = self.model.input[1]
        recurrent_layer = self.model.layers[3]
        output_layer = self.model.layers[4]

        # Re-apply the trained layers, seeding the recurrent layer with the
        # externally supplied states instead of the encoder's.
        sequence_out, dec_state_h, dec_state_c = recurrent_layer(
            token_inputs, initial_state=state_inputs)
        token_scores = output_layer(sequence_out)

        return keras.Model(
            [token_inputs] + state_inputs,
            [token_scores, dec_state_h, dec_state_c],
        )

    @property
    def encoder_model(self):
        return self._encoder_model

    @property
    def decoder_model(self):
        return self._decoder_model

In [16]:
# Path of the saved Keras model; default `model_path` of decode_sequence.
MODEL_LOCATION="./model_1.keras"
# Width of the decoder's one-hot input; equals the 30 entries of roman_stoi.
NUM_DECODER_TOKENS=30
# Upper bound on decoding steps (and so on output characters) per word.
MAX_OUTPUT_SEQUENCE_LENGTH=100

In [17]:
import numpy as np
import tensorflow as tf
from functools import wraps, lru_cache

def array_to_tuple(arr):
    """Convert a NumPy array into a hashable nested-tuple representation.

    Arrays of any dimensionality are supported: each axis becomes one level
    of tuple nesting and the elements keep their NumPy scalar type. A 0-d
    array is returned as its single scalar.

    Args:
        arr: A ``numpy.ndarray``, or any other object.

    Returns:
        Nested tuples (or a scalar for 0-d arrays) when `arr` is an ndarray;
        otherwise `arr` itself, unchanged.
    """
    if not isinstance(arr, np.ndarray):
        return arr
    if arr.ndim == 0:
        # Iterating a 0-d array is an error; hand back the scalar instead.
        return arr[()]
    if arr.ndim == 1:
        return tuple(arr)
    # Peel off the leading axis and recurse into each sub-array.
    return tuple(array_to_tuple(sub_array) for sub_array in arr)

def tuple_to_array(tup):
    """Convert a (possibly nested) tuple back into a NumPy array.

    Args:
        tup: A tuple of scalars or of equally shaped nested tuples, or any
            other object.

    Returns:
        numpy.ndarray: Built from `tup` when it is a tuple, with one axis per
        nesting level. Any non-tuple argument is returned unchanged.
    """
    if isinstance(tup, tuple):
        # np.array already understands nested tuples of any depth, so no
        # per-depth handling is needed (and empty inner tuples are safe).
        return np.array(tup)
    return tup

def cached_decode_sequence(maxsize=1000):
    """Build a memoizing decorator whose first argument may be a NumPy array.

    The cache key covers the first argument and every extra positional and
    keyword argument, so calls that differ only in e.g. ``model_path`` are
    cached separately. NumPy arrays are keyed by shape, dtype and raw bytes.
    When the cache grows past `maxsize`, the least recently used entry is
    dropped. Calls whose arguments cannot be hashed run uncached.

    Args:
        maxsize: Maximum number of results kept in the cache.

    Returns:
        A decorator that wraps a function ``func(input_seq, *args, **kwargs)``.
    """
    def _freeze(value):
        """Return a hashable stand-in for `value` (arrays become byte keys)."""
        if isinstance(value, np.ndarray):
            return ("ndarray", value.shape, value.dtype.str, value.tobytes())
        return value

    def decorator(func):
        # Plain dict: insertion order doubles as recency order (oldest first).
        cache = {}

        @wraps(func)
        def wrapper(input_seq, *args, **kwargs):
            key = (
                _freeze(input_seq),
                tuple(_freeze(arg) for arg in args),
                tuple(sorted((name, _freeze(val)) for name, val in kwargs.items())),
            )
            try:
                hash(key)
            except TypeError:
                # Unhashable arguments: skip the cache rather than fail.
                return func(input_seq, *args, **kwargs)

            if key in cache:
                # Re-insert to mark this entry as most recently used.
                result = cache.pop(key)
                cache[key] = result
                return result

            result = func(input_seq, *args, **kwargs)
            cache[key] = result

            # Evict the OLDEST entry. dict.popitem() would remove the entry
            # just added, leaving a full cache unable to learn anything new.
            if len(cache) > maxsize:
                del cache[next(iter(cache))]

            return result

        return wrapper
    return decorator

@cached_decode_sequence(maxsize=1000)
def decode_sequence(input_seq, model_path=MODEL_LOCATION):
    """Greedily decode one encoded word into Roman characters.

    The encoder turns `input_seq` into two state tensors. The decoder is
    then run one step at a time, starting from the "<" symbol and always
    taking the highest-scoring next character, until ">" is produced or
    MAX_OUTPUT_SEQUENCE_LENGTH steps have run.

    Args:
        input_seq: One-hot encoded input (array, or the nested-tuple form
            accepted by tuple_to_array).
        model_path: Location of the saved model handed to ``Model``.

    Returns:
        str: The decoded characters, without the start/end symbols.
    """
    def _one_hot_step(token_index):
        # Single-timestep decoder input with only `token_index` switched on.
        step = np.zeros((1, 1, NUM_DECODER_TOKENS), dtype=np.float32)
        step[0, 0, token_index] = 1.0
        return step

    # Accept the tuple form as well as a ready-made array.
    if isinstance(input_seq, tuple):
        input_seq = tuple_to_array(input_seq)

    model = Model(model_path)

    # Run the encoder once; its outputs seed the decoder state.
    encoder_input = tf.convert_to_tensor(input_seq, dtype=tf.float32)
    encoder_states = model.encoder_model.predict(encoder_input, verbose=0)
    states_value = [tf.convert_to_tensor(state) for state in encoder_states]

    target_seq = _one_hot_step(roman_stoi["<"])
    decoded_sentence = ""

    for _ in range(MAX_OUTPUT_SEQUENCE_LENGTH):
        output_tokens, h, c = model.decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Greedy choice: most probable character at the last timestep.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = roman_itos[sampled_token_index]

        # Stop on the end symbol or once the output is long enough.
        if sampled_char == ">":
            break
        if len(decoded_sentence) >= MAX_OUTPUT_SEQUENCE_LENGTH:
            break

        decoded_sentence = decoded_sentence + sampled_char

        # Feed the chosen character and the new states into the next step.
        target_seq = _one_hot_step(sampled_token_index)
        states_value = [h, c]

    return decoded_sentence

In [18]:
def transliterate(text: "str | list[str]") -> str:
    """Transliterate one Devanagari word into Roman script.

    Args:
        text: Either a single word, or a list of words as returned by
            ``clean``. For a list only the FIRST word is transliterated.

    Returns:
        str: The Roman transliteration, or "" when there is nothing to
        transliterate or the word cannot be encoded.
    """
    # The tokenizer iterates over words. A bare string would be treated as a
    # list of one-character words and only its first character would be
    # transliterated, so wrap it.
    words = [text] if isinstance(text, str) else text
    tokenized_devnagari = tokenizer.tokenize_devnagari(words)

    # Nothing to do for empty input or an empty first word; an empty token
    # list would otherwise be encoded as pure padding and sent to the model.
    if not tokenized_devnagari or not tokenized_devnagari[0]:
        return ""

    # One-hot encode; encoding fails e.g. for words that are too long.
    try:
        ohe_token = one_hot_encode_tokens(tokenized_devnagari[0])
    except Exception as e:
        print(f"Encoding error for text '{text}': {e}")
        return ""

    # Decoding results are cached by decode_sequence.
    return decode_sequence(ohe_token)

In [19]:
def process_text_batch(text, batch_size=128):
    """Transliterate `text` sentence by sentence.

    The text is split on the danda. Within each sentence, words containing a
    digit get their digits mapped; every other word is cleaned and
    transliterated. Each non-empty result is capitalised and terminated
    with a full stop.

    `batch_size` only controls how many sentences are walked per outer
    iteration; words are still transliterated one at a time.

    Returns:
        list[str]: One Roman-script sentence per non-empty input sentence.
    """
    sentences = text.split("।")
    roman_translation = []

    for start in range(0, len(sentences), batch_size):
        for sentence in sentences[start:start + batch_size]:
            pieces = []
            for word in sentence.split():
                # Words carrying digits bypass the model entirely.
                if any(map(str.isdigit, word)):
                    pieces.append(map_numbers(word))
                    continue
                cleaned = clean(word)
                if cleaned:
                    pieces.append(transliterate(cleaned))

            # Post-processing: skip empties, capitalise, add the full stop.
            joined = " ".join(pieces).strip()
            if not joined:
                continue
            roman_translation.append(joined[0].upper() + joined[1:] + ".")

    return roman_translation

In [20]:
# Demo: transliterate a single sample sentence (no danda, so one sentence).
process_text_batch("तस्बिरहरु ऐजेन्सी चिनियाँ सहर जुहाईको हुँदै मकाउ जोड्ने")

['Tasbirharu aigensi chiniyan sahar juhaiko hundai makau jodne.']

In [None]:
def process_full_file(input_file, output_file):
    """Transliterate a whole UTF-8 text file into Roman script.

    The file is read in one go and split into sentences on the danda. Words
    containing a digit get their digits mapped; every other word is cleaned
    and transliterated (on failure the original word is kept). Each
    non-empty sentence is capitalised, terminated with a full stop and
    written on its own line of `output_file`.

    Args:
        input_file: Path of the UTF-8 encoded Nepali text file to read.
        output_file: Path of the file to write (overwritten if present).

    Returns:
        None. Progress and a completion message are printed.
    """
    # Read the input file
    with open(input_file, 'r', encoding='utf-8') as file:
        text = file.read()

    # Split text into sentences
    sentences = text.split('।')

    roman_translation = []
    total_sentences = len(sentences)

    for i, sentence in enumerate(sentences, 1):
        # Skip empty sentences
        if not sentence.strip():
            continue

        translated_words = []
        for word in sentence.split():
            # Words carrying digits bypass the model entirely.
            if any(char.isdigit() for char in word):
                translated_words.append(map_numbers(word))
                continue

            clean_text = clean(word)
            if not clean_text:
                continue

            # Best effort: keep the original word if transliteration fails.
            try:
                translated_word = transliterate(clean_text)
            except Exception as e:
                print(f"Error transliterating word '{word}': {e}")
                translated_word = word
            translated_words.append(translated_word)

        # Words can transliterate to "", so the joined sentence may be empty
        # even when the word list is not; indexing [0] would then raise.
        translated_sentence = " ".join(translated_words).strip()
        if translated_sentence:
            # Capitalize first letter and close the sentence.
            roman_translation.append(
                translated_sentence[0].upper() + translated_sentence[1:] + '.'
            )

        # Print progress
        progress = (i / total_sentences) * 100
        print(f"Processing: {progress:.2f}% ({i}/{total_sentences})")

    # Write to output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write('\n'.join(roman_translation))

    print(f"Transliteration complete. Output saved to {output_file}")

# Usage: transliterate ./nepali_1.txt and write the result to ./output_1.txt.
# Despite their names, both variables hold file PATHS, not text.
input_text = "./nepali_1.txt"
output_text = "./output_1.txt"
process_full_file(input_text, output_text)

Processing: 0.00% (1/36638)
Processing: 0.01% (2/36638)
Processing: 0.01% (3/36638)
Processing: 0.01% (4/36638)
Processing: 0.01% (5/36638)
Processing: 0.02% (6/36638)
Processing: 0.02% (7/36638)
Processing: 0.02% (8/36638)
Processing: 0.02% (9/36638)
Processing: 0.03% (10/36638)
Processing: 0.03% (11/36638)
Processing: 0.03% (12/36638)
Processing: 0.04% (13/36638)
Processing: 0.04% (14/36638)
Processing: 0.04% (15/36638)
Processing: 0.04% (16/36638)
Processing: 0.05% (17/36638)
Processing: 0.05% (18/36638)
Processing: 0.05% (19/36638)
Processing: 0.05% (20/36638)
Processing: 0.06% (21/36638)
Processing: 0.06% (22/36638)
Processing: 0.06% (23/36638)
Processing: 0.07% (24/36638)
Processing: 0.07% (25/36638)
Processing: 0.07% (26/36638)
Processing: 0.07% (27/36638)
Processing: 0.08% (28/36638)
Processing: 0.08% (29/36638)
Processing: 0.08% (30/36638)
Processing: 0.08% (31/36638)
Processing: 0.09% (32/36638)
Processing: 0.09% (33/36638)
Processing: 0.09% (34/36638)
Processing: 0.10% (35/3

## optimize