In [1]:
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
from indicnlp.transliterate.unicode_transliterate import ItransTransliterator
from google.transliteration import transliterate_text
from indictrans import Transliterator
import re

In [2]:
roman_scripts_file_path = "../datasets/dakshina_dataset_v1.0/hi/romanized/hi.romanized.rejoined.test.roman.txt"
native_scripts_file_path = "../datasets//dakshina_dataset_v1.0/hi/romanized/hi.romanized.rejoined.test.native.txt"

In [3]:
native_lines = []
with open(native_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        native_lines.append(line)
        # cleaned_line = line.replace('-', '')
        #cleaned_line = re.sub(r'["\'#,]', '', line)
        #native_lines.append(cleaned_line)

romanized_lines = []
with open(roman_scripts_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        romanized_lines.append(line)
        # cleaned_line = line.replace('-', '')
        #cleaned_line = re.sub(r'["\',]', '', line)
        #romanized_lines.append(cleaned_line)

In [4]:
len(native_lines)

5000

In [5]:
len(romanized_lines)

5000

In [6]:
df = pd.DataFrame({'Hinglish': romanized_lines, 'Hindi': native_lines})
print(df.head())

                                            Hinglish  \
0  Kumbh rashi men janme log carbanion se bhari e...   
1                         Iska ulat bhi satya hai.\n   
2  kuch devta jo mukhyatah nagar devta the, apne ...   
3  Tel ke utpadan men sansar men Romania ka chhat...   
4  Banarasi Lal se milkar police ne sara bhed pra...   

                                               Hindi  
0  कुंभ राशि में जन्मे लोग संभावनाओं से भरी एक जग...  
1                             इसका उलट भी सत्य है।\n  
2  कुछ देवता जो मुख्यत: नगर देवता थे, अपने संप्रद...  
3  तेल के उत्पादन में संसार में रोमानिया का छठा स...  
4  बनारसी लाल से मिलकर पुलिस ने सारा भेद प्राप्त ...  


In [7]:
len(df)

5000

In [8]:
sampled_df = df.sample(n = 100, replace = False, random_state = 42)

In [9]:
hinglish_sentences = sampled_df['Hinglish']
print(hinglish_sentences)

hinglish_sentences = hinglish_sentences.values
print(type(hinglish_sentences))
print(hinglish_sentences.shape)

hindi_true = list(sampled_df['Hindi'])

1501             yahan par bhagwan Bahubali ki 18 meter\n
2586                                      samjha-parkha\n
2653    jab kamishnar Arun ke saath baat kar rahe the,...
1055    Joshi ne ASI men purattwavidon ke samarthan ka...
705     Ageti Parikshit Sharma Sanskrut Bhasha ke prat...
                              ...                        
4740               karnatak: devarkadu, nagban, nagkudu\n
2940    adhikansh kavita barabar avadhi aur vrajbhasha...
3456             iaka swar do sinhon dwara rakshit hai.\n
373     ek saal bad, raja sahib ne istifa de diya aur ...
79      prabandhan, raktadhan men kabhi kabhi sahayak ...
Name: Hinglish, Length: 100, dtype: object
<class 'numpy.ndarray'>
(100,)


In [10]:
# Testing using LIBINC transliterator
# trn = Transliterator(source='eng', target='hin', build_lookup=True, decode='beamsearch')
trn = Transliterator(source='eng', target='hin', build_lookup=True)
bleu_scores = []
for i in range(hinglish_sentences.size):

    transformed_sentence = trn.transform(hinglish_sentences[i])
    reference = [hindi_true[i].split()]
    # print(transformed_sentence)
    transformed_tokens = transformed_sentence.split()
    bleu_score = sentence_bleu(reference, transformed_tokens)
    bleu_scores.append(bleu_score)

    if i % 10 == 0:
        print("Transliterating sentence ", i+1)
        print('Original Code mized text :', hinglish_sentences[i])
        print('Original Native Hindi convertion :', hindi_true[i])
        print('Transliterated to Hindi :', transformed_sentence)
        print('Average BLEU score :', np.mean(bleu_scores), '\n')

Transliterating sentence  1
Original Code mized text : yahan par bhagwan Bahubali ki 18 meter

Original Native Hindi convertion : यहाँ पर भगवान बाहुबली की १८ मी.

Transliterated to Hindi : यहाँ पर भगवान बाहुबली की 18 मीटर

Average BLEU score : 0.6147881529512643 

Transliterating sentence  11
Original Code mized text : Stivan S Reinmund: purv chairman aur CEO, Pepsiko In

Original Native Hindi convertion : स्टीवन एस रेइनमुंड: पूर्व चेयरमैन और सीईओ, पेप्सीको इंक

Transliterated to Hindi : स्टिवन स रेनमंड: पूर्व चेयरमाण और सियो, पेप्सिको इन

Average BLEU score : 0.3678757699675088 



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Transliterating sentence  21
Original Code mized text : vaha ke bhudrishya ka dekhkar use pata chala ki 'manav shram ke samuchit prayog se' us khsetra ki to kaya hi palat gai hai.

Original Native Hindi convertion : वहां के भूदॄश्य का देखकर उसे पता चला कि ‘मानव श्रम के समुचित प्रयोग से’ उस क्षेत्र की तो काया ही पलट गई है।

Transliterated to Hindi : वाहा के भूद्रीश्य का देखकर उसे पता चला की 'मानव श्रम के समुचित प्रयोग से' उस ख्सेत्र की तो काया ही पलट गई हैं.

Average BLEU score : 0.3800078134465459 

Transliterating sentence  31
Original Code mized text : iska vishisht vidyutiya pratirodh 4.9 hai jo platinum ka lagbhag adha hai.

Original Native Hindi convertion : इसका विशिष्ट विद्युतीय प्रतिरोध ४.९ है जो प्लैटिनम का लगभग आधा है।

Transliterated to Hindi : इसका विशिष्ट विद्युतिया प्रतिरोध 4.9 हैं जो प्लेटिनम का लगभग आधा हैं.

Average BLEU score : 0.38058544213481127 

Transliterating sentence  41
Original Code mized text : ek baar to unhone Jawaharlal Nehru ko patr likhkar krantikari na

In [11]:
average_bleu_score = np.mean(bleu_scores)
print('Average Bleu Score for Train Set : ', average_bleu_score)

Average Bleu Score for Train Set :  0.3185566633653254


In [12]:
"""
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"C:\Johns Hopkins\sem1\Machine Translation\MT Final Project\IndicTrans2\indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES=r"C:\Johns Hopkins\sem1\Machine Translation\MT Final Project\IndicTrans2\indic_nlp_resources"

import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()

# Testing using AI4Bharath transliterator
ai4b_bleu_scores = []
lang = 'hi'
for i in range(hinglish_sentences.size):

    transformed_sentence = ItransTransliterator.from_itrans(hinglish_sentences[i],lang)
    reference = [hindi_true[i].split()]
    # print(transformed_sentence)
    transformed_tokens = transformed_sentence.split()
    bleu_score = sentence_bleu(reference, transformed_tokens)
    ai4b_bleu_scores.append(bleu_score)

    if i % 10 == 0:
        print("Transliterating sentence ", i+1)
        print('Original Code mized text :', hinglish_sentences[i])
        print('Original Native Hindi convertion :', hindi_true[i])
        print('Transliterated to Hindi :', transformed_sentence)
        print('Average BLEU score :', np.mean(ai4b_bleu_scores), '\n')

average_bleu_score = np.mean(ai4b_bleu_scores)
print('Average Bleu Score for Train Set : ', average_bleu_score)
"""

Transliterating sentence  1
Original Code mized text : yahan par bhagwan Bahubali ki 18 meter

Original Native Hindi convertion : यहाँ पर भगवान बाहुबली की १८ मी.

Transliterated to Hindi : यहन् पर् भग्वन् Bअहुबलि कि १८ मेतेर्

Average BLEU score : 1.1200407237786664e-231 

Transliterating sentence  11
Original Code mized text : Stivan S Reinmund: purv chairman aur CEO, Pepsiko In

Original Native Hindi convertion : स्टीवन एस रेइनमुंड: पूर्व चेयरमैन और सीईओ, पेप्सीको इंक

Transliterated to Hindi : Sतिवन् S ऱेइन्मुन्द्: पुर्व् चैर्मन् और् CEO, Pएप्सिको ईन्

Average BLEU score : 2.349485101993388e-79 

Transliterating sentence  21
Original Code mized text : vaha ke bhudrishya ka dekhkar use pata chala ki 'manav shram ke samuchit prayog se' us khsetra ki to kaya hi palat gai hai.

Original Native Hindi convertion : वहां के भूदॄश्य का देखकर उसे पता चला कि ‘मानव श्रम के समुचित प्रयोग से’ उस क्षेत्र की तो काया ही पलट गई है।

Transliterated to Hindi : वह के भुद्रिश्य क देख्कर् उसे पत चल कि 'मन

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Average Bleu Score for Train Set :  0.0014528679532351443


In [15]:
# Testing out Google Transliterate package
from google.transliteration import transliterate_word
def transliterate_all_alphanumeric_parts(sentence, lang_code='hi'):
    transliterated_sentence = []
    words = sentence.split()

    for word in words:
        # Use regex to find all alphanumeric parts in the word
        parts = re.findall(r'[a-zA-Z0-9]+|[^a-zA-Z0-9]+', word)
        
        transliterated_word = ""
        
        for part in parts:
            if part.isalnum():  # Transliterate only alphanumeric parts
                try:
                    # Transliterate the alphanumeric part
                    suggestions = transliterate_word(part, lang_code=lang_code)
                    transliterated_part = suggestions[0] if suggestions else part
                except IndexError:
                    # If transliteration fails, keep the alphanumeric part as is
                    transliterated_part = part
            else:
                # Keep non-alphanumeric parts (like "-" or spaces) as is
                transliterated_part = part
            
            transliterated_word += transliterated_part
        
        transliterated_sentence.append(transliterated_word)

    return ' '.join(transliterated_sentence)

google_bleu_scores = []
lang = 'hi'
for i in range(hinglish_sentences.size):

    print(i)
    # transformed_sentence = transliterate_text(hinglish_sentences[i],lang_code = lang)
    transformed_sentence = transliterate_all_alphanumeric_parts(hinglish_sentences[i], lang_code=lang)
    reference = [hindi_true[i].split()]
    # print(transformed_sentence)
    transformed_tokens = transformed_sentence.split()
    bleu_score = sentence_bleu(reference, transformed_tokens)
    google_bleu_scores.append(bleu_score)

    if i % 10 == 0:
        print("Transliterating sentence ", i+1)
        print('Original Code mized text :', hinglish_sentences[i])
        print('Original Native Hindi convertion :', hindi_true[i])
        print('Transliterated to Hindi :', transformed_sentence)
        print('Average BLEU score :', np.mean(google_bleu_scores), '\n')

average_bleu_score = np.mean(google_bleu_scores)
print('Average Bleu Score for Train Set : ', average_bleu_score)

0
Transliterating sentence  1
Original Code mized text : yahan par bhagwan Bahubali ki 18 meter

Original Native Hindi convertion : यहाँ पर भगवान बाहुबली की १८ मी.

Transliterated to Hindi : यहाँ पर भगवन बाहुबली की १८ मीटर
Average BLEU score : 6.313993041533344e-78 

1
2
3
4
5
6
7
8
9
10
Transliterating sentence  11
Original Code mized text : Stivan S Reinmund: purv chairman aur CEO, Pepsiko In

Original Native Hindi convertion : स्टीवन एस रेइनमुंड: पूर्व चेयरमैन और सीईओ, पेप्सीको इंक

Transliterated to Hindi : स्टीवन स रेमण्ड: पूर्व चेयरमैन और सीईओ, पेप्सिको इन
Average BLEU score : 0.41052323410637204 

11
12
13
14
15
16
17
18
19
20
Transliterating sentence  21
Original Code mized text : vaha ke bhudrishya ka dekhkar use pata chala ki 'manav shram ke samuchit prayog se' us khsetra ki to kaya hi palat gai hai.

Original Native Hindi convertion : वहां के भूदॄश्य का देखकर उसे पता चला कि ‘मानव श्रम के समुचित प्रयोग से’ उस क्षेत्र की तो काया ही पलट गई है।

Transliterated to Hindi : वह के भ

In [16]:
print('Google Transliterate model is very accurate but takes time to transliterate the sentence')

Google Transliterate model is very accurate but takes time to transliterate the sentence


In [17]:
from ai4bharat.transliteration import XlitEngine

In [18]:
e = XlitEngine("hi")
out = e.translit_word("computer", topk=5, beam_width=10)
print(out)
# output:{'hi': ['कम्प्यूटर', 'कंप्यूटर', 'कम्पूटर', 'कम्पुटर', 'कम्प्युटर']}

Loading hi...


  weights = torch.load( weight_path, map_location=torch.device(self.device))


{'hi': ['कम्प्यूटर', 'कंप्यूटर', 'कम्पूटर', 'कम्पुटर', 'कम्प्युटर']}


In [19]:
# Testing using AI4Bharath transliterator
ai4b_bleu_scores = []
lang = 'hi'
e = XlitEngine('hi')
for i in range(hinglish_sentences.size):

    transformed_sentence = e.translit_sentence(hinglish_sentences[i])[lang]
    reference = [hindi_true[i].split()]
    print(transformed_sentence)
    transformed_tokens = transformed_sentence.split()
    bleu_score = sentence_bleu(reference, transformed_tokens)
    ai4b_bleu_scores.append(bleu_score)

    if i % 1 == 0:
        print("Transliterating sentence ", i+1)
        print('Original Code mized text :', hinglish_sentences[i])
        print('Original Native Hindi convertion :', hindi_true[i])
        print('Transliterated to Hindi :', transformed_sentence)
        print('Average BLEU score :', np.mean(ai4b_bleu_scores), '\n')

Loading hi...
यहाँ पर भगवन बहुबली कि 18 मीटर
Transliterating sentence  1
Original Code mized text : yahan par bhagwan Bahubali ki 18 meter

Original Native Hindi convertion : यहाँ पर भगवान बाहुबली की १८ मी.

Transliterated to Hindi : यहाँ पर भगवन बहुबली कि 18 मीटर
Average BLEU score : 6.968148412761692e-155 



The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


समझा-परखा
Transliterating sentence  2
Original Code mized text : samjha-parkha

Original Native Hindi convertion : समझा-परखा

Transliterated to Hindi : समझा-परखा
Average BLEU score : 3.484074206380846e-155 



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


जब कमिश्नर अरुण के साठ बात कर रहे थे, तो सपना आती है आउर् आयुक्त अरुण से छुपने को कहता है.
Transliterating sentence  3
Original Code mized text : jab kamishnar Arun ke saath baat kar rahe the, to Sapna aati hai aur aayukt Arun se chhupne ko kahta hai.

Original Native Hindi convertion : जब कमिश्नर अरुण के साथ बात कर रहे थे, तो सपना आती है और आयुक्त अरुण से छुपने को कहता है।

Transliterated to Hindi : जब कमिश्नर अरुण के साठ बात कर रहे थे, तो सपना आती है आउर् आयुक्त अरुण से छुपने को कहता है.
Average BLEU score : 0.22374677077485583 

जोशी ने एसी में पुरातत्वविदों के सामर्थन का उठान किया आउर् मौलिक को बहार करने का प्रयास किया, लेकिन सरकार ने जोशी कि बजाय उसे खारिज कर दिया.
Transliterating sentence  4
Original Code mized text : Joshi ne ASI men purattwavidon ke samarthan ka uthan kiya aur maulik ko bahar karne ka prayas kiya, lekin sarkar ne Joshi ki bajay use kharij kar diya.

Original Native Hindi convertion : जोशी ने एएसआई में पुरातत्वविदों के समर्थन का उत्थान किया और मौलिक को बाहर करने

KeyboardInterrupt: 

In [None]:
average_bleu_score = np.mean(ai4b_bleu_scores)
print('Average Bleu Score for Train Set : ', average_bleu_score)

In [None]:
print(hinglish_sentences[12])
print(hindi_true[12])

In [None]:

suggestions = transliterate_word('America', lang_code='ja')
print(suggestions[0])

#suggestions = transliterate_word('#20', lang_code='hi')
#print(suggestions[0])

def transliterate_sentence(sentence, lang_code='hi'):
    transliterated_sentence = []
    words = sentence.split()
    
    for word in words:
        # Check if word is alphanumeric
        if word.isalnum():
            try:
                # Get the first suggestion for the transliterated word
                suggestions = transliterate_word(word, lang_code=lang_code)
                transliterated_word = suggestions[0] if suggestions else word
                transliterated_sentence.append(transliterated_word)
            except IndexError:
                # Handle cases where no suggestions are returned
                transliterated_sentence.append(word)
        else:
            # Skip non-alphanumeric words
            transliterated_sentence.append(word)

    return ' '.join(transliterated_sentence)

sentence = hinglish_sentences[12]
print(transliterate_sentence(sentence))

In [None]:
def transliterate_all_alphanumeric_parts(sentence, lang_code='hi'):
    transliterated_sentence = []
    words = sentence.split()

    for word in words:
        # Use regex to find all alphanumeric parts in the word
        parts = re.findall(r'[a-zA-Z0-9]+|[^a-zA-Z0-9]+', word)
        
        transliterated_word = ""
        
        for part in parts:
            if part.isalnum():  # Transliterate only alphanumeric parts
                try:
                    # Transliterate the alphanumeric part
                    suggestions = transliterate_word(part, lang_code=lang_code)
                    transliterated_part = suggestions[0] if suggestions else part
                except IndexError:
                    # If transliteration fails, keep the alphanumeric part as is
                    transliterated_part = part
            else:
                # Keep non-alphanumeric parts (like "-" or spaces) as is
                transliterated_part = part
            
            transliterated_word += transliterated_part
        
        transliterated_sentence.append(transliterated_word)

    return ' '.join(transliterated_sentence)

# Example usage
sentence = "This is an example with uprichar-Wasu and another-word."
print(transliterate_all_alphanumeric_parts(sentence))

In [None]:
print(transliterate_all_alphanumeric_parts(hinglish_sentences[12]))

In [None]:
print(transliterate_all_alphanumeric_parts(hinglish_sentences[24]))

In [None]:
print(hindi_true[24])