In [1]:
dependent_vowel = {
    'ा': 'ā', 
    'ि': 'i', 
    'ी': 'ī', 
    'ु': 'u', 
    'ू': 'ū', 
    'ृ': 'ri', 
    'ॄ': 'rr', 
    'ॅ': 'ĕ', 
    'ॆ': 'e', 
    'े': 'ê', 
    'ै': 'ai', 
    'ॉ': 'ŏ', 
    'ॊ': 'o', 
    'ो': 'ô', 
    'ौ': 'au', 
    '्': '', #lack of a vowel sound
    'ं': 'ṃ',
    'ं': 'n',
    'ँ': 'n̐',
    'ः': 'ḥ ',
    '़': '', #accounts for regional differences, unsure of exact differences (kept 'a' to avoid errors)

}
print(len(dependent_vowel))
consonant = {
    'क': 'k',
    'ख': 'kh',
    'ग': 'g',
    'घ': 'gh',
    'ङ': 'ṅa',
    'च': 'c',
    'छ': 'ch',
    'ज': 'j',
    'झ': 'jh',
    'ञ': 'ña',
    'ट': 'ṭ',
    'ठ': 'ṭh',
    'ड': 'ḍ',
    'ढ': 'ḍh',
    'ण': 'ṇ',
    'त': 't',
    'थ': 'th',
    'द': 'd',
    'ध': 'dh',
    'न': 'n',
    'प': 'p',
    'फ': 'f',
    'ब': 'b',
    'भ': 'bh',
    'म': 'm',
    'य': 'y',
    'र': 'r',
    'व': 'v',
    'ल': 'l',
    'श': 'sh',
    'ष': 'shh',
    'स': 'ś',
    'ह': 'h',
    'क्ष': 'ksh',
    'त्र': 'tr',
    'ज्ञ': 'gy',
    'ऽ': "'",
    'ळ': 'l',
}
print(len(consonant))
independent_vowel = {
    'अ':'a',
    'आ':'aa',
    'इ':'e',
    'ई':'i',
    'उ':'u',
    'ऊ':'oo',
    'ए':'a',
    'ऐ':'ae',
    'ओ':'o',
    'औ':'ao',
    'अं':'am',
    'अः':'a',
    'ऋ':'ri',
    'ॠ':'rr',
    'ऑ':'aw',
}
print(len(independent_vowel))


20
38
15


In [2]:
def transliterate(word: str, dependent_vowel: dict, consonant: dict, independent_vowel: dict) -> str:
    transliteration = ''
    letter_type = []  

    for char in word:
        if char in dependent_vowel:
            transliteration += dependent_vowel[char]
            letter_type.append('d')
        elif char in consonant:
            transliteration += consonant[char]
            letter_type.append('c')
        elif char in independent_vowel:
            transliteration += independent_vowel[char]
            letter_type.append('i')
        else:
            print(f"Warning: Character '{char}' not found in any dictionary.")

        #print(transliteration)
        # Check for consecutive consonants

        if len(transliteration) > 1 and len(letter_type) > 1:
            if letter_type[-1] == 'c' and letter_type[-2] == 'c':
                # Make sure there are at least two characters in transliteration
                if len(transliteration) > 1:
                    transliteration = transliteration[:-1] + 'a' + transliteration[-1]
                # Remove the last consonant if needed
                if len(transliteration) > 2:
                    transliteration = transliteration[:-2] + 'a' + transliteration[-1]
                # Ensure the letter type is updated correctly
                letter_type[-1] = 'c'
    return transliteration
    

In [3]:
output = transliterate('अनाधिकृतपणे', dependent_vowel, consonant, independent_vowel)
print(output)

anādhikritapaṇê


In [4]:
output = transliterate('पॉडकास्टर', dependent_vowel, consonant, independent_vowel)
print(output)

pŏḍakāśṭar


In [5]:
output = transliterate('बेयंत', dependent_vowel, consonant, independent_vowel)
print(output)

bêynt


In [6]:
output = transliterate('बोझि', dependent_vowel, consonant, independent_vowel)
print(output)

bôjhi


In [7]:
output = transliterate('बहानें', dependent_vowel, consonant, independent_vowel)
print(output)

bahānên


In [8]:
output = transliterate('ब्रेडनबर्ग', dependent_vowel, consonant, independent_vowel)
print(output)

brêḍanabarg


https://github.com/eymenefealtun/all-words-in-all-languages/blob/main/Hindi/Hindi.txt

In [17]:
words = []
with open('Hindi.txt', 'r') as file:
    content = file.read()
    print(f"File content: {content}")
    words = content.split(',')
    words_array = [word.strip() for word in words]

    hashTable = {} #store word and transliteration
    for word in words_array[:10000]:
        output = transliterate(word, dependent_vowel, consonant, independent_vowel)
        hashTable[word] = output
    print(hashTable)
#


File content: श्वाघ्निक,प्लॉट,प्लॉटों,परिहारण,अपरिहारण,लघुक्रम,थैलियों,मनमैला,कुलश्रेष्ठ,कुलश्रेष्ठों,उप-किराएदारी,लसिका,अस्वरण,उपरौना,चश्म-ए-बद्दूर,कपिलस्मृति,अन्यबीजज,द्विरागमन,द्विरागमनों,एनपीटी,पुनरादान,वर्तमानत,वर्तमानता,अस्वरन,चमडा,चमडे,सूक्ष्मत,सूक्ष्मता,स्वत्वविलेख,पराकाष्ठा,पराकाष्ठे,पराकाष्ठे,पराकाष्ठों,वर्तमानन,प्रपूर्ण,प्रपूर्णों,बेबाकी,बेबाकियाँ,बेबाकिओं,बेबाकिओं,प्रपूर्त,दंडविधान,भाषिका,भाषिकाएँ,भाषिकाओं,भाषिकाओं,भाषिकी,गोपीजनवल्लभ,जीव-रसायन,पेट्रो-रसायन,परिक्रामित,ज़िपलौक,हल्के-फुल्के,सायरन,भुसुंड,अग्निजात,मेटाबोलाइज़,अबूजा,वृन्दवादन,मेटाबोलाइड़,संप्रवृत्ति,प्रामीत्य,भगिनीगामी,नरभुक,क्षुधासूति,खेमकरी,मंदारमाला,बदलाहट,उत्तरपुस्तिका,कमलाई,स्वाधीनतोपरांत,डिंबाहव,जीवाणुवैज्ञानिक,जीवाणुवैज्ञानिकों,कुशलपूर्वक,बिनचाहा,सुविज्ञेय,संवर्मी,दुभिक्ष,प्रसन्नात्मा,डर्विल,मृदंगफल,अयन-काल,अयन-कालों,पीतगल,पुरातत्त्व,तूँबा,तूँबे,तूँबों,तूँबी,तूँबे,अभिप्रायविरुद्ध,दांना,अक्रर्,साधने_वाला,पितविहीन,अपक्षत,बाल्मीक,बाल्मीकी,खातक,खातकों,ज़ुहूर,ज़ुहूरों,सुकवा,काज्यो,सुकवि,बालियों,कहुँ,विवाहोपरांतत,उपर

In [18]:
import csv
# Write hashTable to a new CSV file
with open('transliterated_words.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Hindi', 'Transliteration']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for word, transliteration in hashTable.items():
        writer.writerow({'Hindi': word, 'Transliteration': transliteration})

print("HashTable has been written to 'transliterated_words.csv'")

HashTable has been written to 'transliterated_words.csv'
