In [13]:
# build_dataset.py

ALLOWED_SIGNS = {
    # vowels
    "අ","ආ","ඇ","ඈ","ඉ","ඊ","උ","ඌ","එ","ඒ","ඔ","ඕ",
    "ඓ", "ඖ", "ඍ",

    # consonants (hal)
    "ක්","ග්","ජ්","ට්","ද්","ණ්","ත්","න්","බ්","ය්","ල්",
    "ඩ්","ප්","ම්","ර්","ව්","ස්","හ්","ළ්",
    "ඛ්", "ධ්", "ච්", "භ්", "ථ්", "ෆ්", "ශ්", "ෂ්", "ඤ්", "ඡ්",

    # special consonants
    "ඟ්","ඳ්","ඬ්","ඹ්","ඝ්","ඪ්","ඨ්","ඵ්",
    "ං","්‍ය",
    "ෟ",
    "්‍ර" # Rakaranshaya
}

In [14]:
VOWEL_MAP = {
    "ා": "ආ",
    "ි": "ඉ",
    "ී": "ඊ",
    "ු": "උ",
    "ූ": "ඌ",
    "ෙ": "එ",
    "ේ": "ඒ",
    "ො": "ඔ",
    "ෝ": "ඕ",
    "ෛ": "ඓ",
    "ෞ": "ඖ",
    "ැ": "ඇ",
    "ෑ": "ඈ"
}

In [15]:
def to_fingerspelling(word):
    output = []

    # Strip common zero-width and non-breaking space characters, but keep Zero Width Joiner for conjuncts
    word = word.replace('\u200b', '').replace('\ufeff', '').replace(' ', '')

    i = 0
    while i < len(word):
        ch = word[i]

        # Priority 1a: Check for consonant + '්' + '‍' + 'ය' + optional vowel modifier (Yakaranshaya: 'ග්‍යා' -> 'ග්', '්‍ය', 'ආ')
        if i + 3 < len(word) and \
           ("\u0D9A" <= ch <= "\u0DC6") and \
           word[i+1] == '්' and \
           word[i+2] == '‍' and \
           word[i+3] == 'ය':

            consonant_hal = ch + "්"
            if consonant_hal not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{consonant_hal}' not in inventory for word '{word}'")
            output.append(consonant_hal) # Add 'ග්'

            # Add '්‍ය' sign
            if '්‍ය' not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '්‍ය' not in inventory for word '{word}'")
            output.append('්‍ය') # Add '්‍ය'

            # Check for trailing vowel modifier (e.g., 'ා' in 'ග්‍යා')
            if i + 4 < len(word) and word[i+4] in VOWEL_MAP:
                independent_vowel = VOWEL_MAP[word[i+4]]
                if independent_vowel not in ALLOWED_SIGNS:
                    raise ValueError(f" Sign '{independent_vowel}' not in inventory for word '{word}'")
                output.append(independent_vowel) # Add 'ආ'
                i += 5 # Consumed ch, ්, ‍, ය, ා
            else:
                i += 4 # Consumed ch, ්, ‍, ය
            continue

        # Priority 1b: Check for consonant + '්' + '‍' + 'ර' + optional vowel modifier (Rakaranshaya: 'ප්‍ර' -> 'ප්', 'ර්', 'අ')
        if i + 3 < len(word) and \
           ("\u0D9A" <= ch <= "\u0DC6") and \
           word[i+1] == '්' and \
           word[i+2] == '‍' and \
           word[i+3] == 'ර':

            consonant_hal = ch + "්"
            if consonant_hal not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{consonant_hal}' not in inventory for word '{word}'")
            output.append(consonant_hal) # Add 'ප්'

            # Add '්‍ර' sign (Rakaranshaya)
            rakaranshaya = "්‍ර"
            if rakaranshaya not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{rakaranshaya}' not in inventory for word '{word}'")
            output.append(rakaranshaya) # Add '්‍ර'

            # Check for trailing vowel modifier (e.g., 'ි' in 'ප්‍රි')
            if i + 4 < len(word) and word[i+4] in VOWEL_MAP:
                independent_vowel = VOWEL_MAP[word[i+4]]
                if independent_vowel not in ALLOWED_SIGNS:
                    raise ValueError(f" Sign '{independent_vowel}' not in inventory for word '{word}'")
                output.append(independent_vowel) # Add 'ඉ' or other vowel
                i += 5 # Consumed ch, ්, ‍, ර, vowel
            else:
                # Do NOT add inherent 'අ' for Rakaranshaya based on new requirements
                i += 4 # Consumed ch, ්, ‍, ර
            continue

        # Priority 2: Consonant + Vowel Modifier (e.g., 'භා' -> 'භ්', 'ආ')
        if ("\u0D9A" <= ch <= "\u0DC6") and (i + 1 < len(word) and word[i+1] in VOWEL_MAP):
            consonant_hal = ch + "්"
            independent_vowel = VOWEL_MAP[word[i+1]]

            if consonant_hal not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{consonant_hal}' not in inventory for word '{word}'")
            if independent_vowel not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{independent_vowel}' not in inventory for word '{word}'")

            output.append(consonant_hal)
            output.append(independent_vowel)
            i += 2
            continue

        # Priority 3: Consonant followed by hal mark + optional vowel modifier
        # e.g., 'ල්බ' -> 'ල්', 'බ්', 'අ' (no vowel after ල්)
        # e.g., 'ල්බන්' -> 'ល්', 'බ්', 'អ', 'న్'
        if ("\u0D9A" <= ch <= "\u0DC6") and (i + 1 < len(word) and word[i+1] == '්'):
            consonant_hal = ch + "්"
            if consonant_hal not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{consonant_hal}' not in inventory for word '{word}'")
            output.append(consonant_hal)
            
            # Check for vowel modifier after hal mark
            if i + 2 < len(word) and word[i+2] in VOWEL_MAP:
                independent_vowel = VOWEL_MAP[word[i+2]]
                if independent_vowel not in ALLOWED_SIGNS:
                    raise ValueError(f" Sign '{independent_vowel}' not in inventory for word '{word}'")
                output.append(independent_vowel)
                i += 3  # Consumed ch, ්, vowel
            else:
                i += 2  # Consumed ch, ్ (no vowel after hal)
            continue

        # Priority 3b: Consonant followed by another consonant (inherent 'අ' vowel)
        # e.g., 'ජල' in 'අංජලී' -> 'ජ්', 'අ', 'ල්'
        if ("\u0D9A" <= ch <= "\u0DC6") and (i + 1 < len(word) and "\u0D9A" <= word[i+1] <= "\u0DC6"):
            consonant_hal = ch + "්"
            if consonant_hal not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{consonant_hal}' not in inventory for word '{word}'")
            output.append(consonant_hal)
            
            # Add inherent 'අ' vowel
            if 'අ' not in ALLOWED_SIGNS:
                raise ValueError(f" Sign 'අ' not in inventory for word '{word}'")
            output.append('අ')
            i += 1
            continue

        # Priority 4: Skip standalone '්' (Al-Lakuna)
        if ch == '්':
            i += 1
            continue

        # Priority 4b: Skip standalone '‍' (Zero Width Joiner)
        if ch == '‍':
            i += 1
            continue

        # Priority 5: Handle individual characters
        # If it's a vowel modifier, convert to independent vowel
        if ch in VOWEL_MAP:
            sign = VOWEL_MAP[ch]
            if sign not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{sign}' not in inventory for word '{word}'")
            output.append(sign)
            i += 1
            continue

        # If it's a Sinhala consonant, convert to hal form + inherent 'අ'
        elif ("\u0D9A" <= ch <= "\u0DC6"):
            sign = ch + "්"
            if sign not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{sign}' not in inventory for word '{word}'")
            output.append(sign)
            
            # Add inherent 'අ' vowel for standalone consonants
            if 'අ' not in ALLOWED_SIGNS:
                raise ValueError(f" Sign 'අ' not in inventory for word '{word}'")
            output.append('අ')
            i += 1
            continue

        # For any other character (independent vowels, pre-composed hal consonants, etc.)
        else:
            sign = ch
            if sign not in ALLOWED_SIGNS:
                raise ValueError(f" Sign '{sign}' not in inventory for word '{word}'")
            output.append(sign)
            i += 1
            continue

    return output

In [16]:
print(to_fingerspelling("ඈමෑලි"))
print(to_fingerspelling("භාග්‍යා"))
print(to_fingerspelling("ප්‍රමිලා"))


['ඈ', 'ම්', 'ඈ', 'ල්', 'ඉ']
['භ්', 'ආ', 'ග්', '්\u200dය', 'ආ']
['ප්', '්\u200dර', 'ම්', 'ඉ', 'ල්', 'ආ']


In [17]:
def read_words(path):
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]


In [18]:
def build_dataset():
    female_names = read_words("dataset_names/female.txt")
    male_names = read_words("dataset_names/male.txt")

    all_words = set(female_names + male_names)  # remove duplicates

    with open("output_fingerspell_dataset/fingerspell_dataset.tsv", "w", encoding="utf-8") as f:
        f.write("input\toutput\n")

        for word in sorted(all_words):
            try:
                signs = to_fingerspelling(word)
                f.write(f"{word}\t{'|'.join(signs)}\n")
            except ValueError as e:
                print(e)


In [19]:
build_dataset()

In [55]:
print("Test cases with hal forms:")
print("ඔල්බන්:", to_fingerspelling("ඔල්බන්"))
print("කල්ප:", to_fingerspelling("කල්ප"))

Test cases with hal forms:
ඔල්බන්: ['ඔ', 'ල්', 'බ්', 'අ', 'න්']
කල්ප: ['ක්', 'අ', 'ල්', 'ප්', 'අ']


In [62]:
# load village list from csv file
import pandas as pd

village_df=pd.read_csv("dataset_village_names/cities.csv")

province_df=pd.read_csv("dataset_village_names/provinces.csv")
districts_df=pd.read_csv("dataset_village_names/districts.csv")
districts_df.head()

Unnamed: 0,district id,province_id,name_en,name_si,name_ta
0,1,6,Ampara,අම්පාර,அம்பாறை
1,2,8,Anuradhapura,අනුරාධපුරය,அனுராதபுரம்
2,3,7,Badulla,බදුල්ල,பதுளை
3,4,6,Batticaloa,මඩකලපුව,மட்டக்களப்பு
4,5,1,Colombo,කොළඹ,கொழும்பு


In [63]:

# add this names into txt file
with open("dataset_village_names/village_names.txt","w",encoding="utf-8") as vf:
    for name in village_df["name_si"]:
        vf.write(name+"\n")



In [65]:
# add this names into txt file
with open("dataset_village_names/districts.txt","w",encoding="utf-8") as vf:
    for name in districts_df["name_si"]:
        vf.write(name+"\n")

In [64]:
# add this names into txt file
with open("dataset_village_names/province.txt","w",encoding="utf-8") as vf:
    for name in province_df["name_si"]:
        vf.write(name+"\n")

In [70]:
def build_dataset_cities():
    districts = read_words("dataset_village_names/districts.txt")
    cities = read_words("dataset_village_names/village_names.txt")
    provinces = read_words("dataset_village_names/province.txt")

    # filter out "-" and numbers from the lists
    districts = [word for word in districts if word != "-" and not any(char.isdigit() for char in word)]
    cities = [word for word in cities if word != "-" and not any(char.isdigit() for char in word)]
    provinces = [word for word in provinces if word != "-" and not any(char.isdigit() for char in word)]    

    all_words = set(districts + cities + provinces)  # remove duplicates

    with open("output_fingerspell_dataset/fingerspell_dataset_cities.tsv", "w", encoding="utf-8") as f:
        f.write("input\toutput\n")

        for word in sorted(all_words):
            try:
                signs = to_fingerspelling(word)
                f.write(f"{word}\t{'|'.join(signs)}\n")
            except ValueError as e:
                print(e)


In [69]:
build_dataset_cities()

 Sign '-' not in inventory for word 'ඌව-පරණගම'
 Sign '1' not in inventory for word 'කොළඹ1'
 Sign '1' not in inventory for word 'කොළඹ10'
 Sign '1' not in inventory for word 'කොළඹ11'
 Sign '1' not in inventory for word 'කොළඹ12'
 Sign '1' not in inventory for word 'කොළඹ13'
 Sign '1' not in inventory for word 'කොළඹ14'
 Sign '1' not in inventory for word 'කොළඹ15'
 Sign '2' not in inventory for word 'කොළඹ2'
 Sign '3' not in inventory for word 'කොළඹ3'
 Sign '4' not in inventory for word 'කොළඹ4'
 Sign '5' not in inventory for word 'කොළඹ5'
 Sign '6' not in inventory for word 'කොළඹ6'
 Sign '7' not in inventory for word 'කොළඹ7'
 Sign '8' not in inventory for word 'කොළඹ8'
 Sign '9' not in inventory for word 'කොළඹ9'
 Sign '(' not in inventory for word 'ගල්පාත(සබරගමුව)'
 Sign '-' not in inventory for word 'ජා-ඇල'
 Sign 'ඞ්' not in inventory for word 'තික්කොඞයි'
 Sign '(' not in inventory for word 'දෙවිනුවර(දෙවුන්දර)'
 Sign '-' not in inventory for word 'නාවල-කොස්වත්ත'
 Sign '-' not in inventory for 

In [47]:
import pandas as pd

df = pd.read_excel(
    "mappings.xlsx",
    sheet_name=0,      # or sheet name
    engine="openpyxl"
)

print(df.head())
print(df.shape)


# save as csv
df.to_csv("fingerspelling_mapper.csv", index=False)

print("✅ Conversion completed")


   IMG_8327  0
0  IMG_8328  1
1  IMG_8329  2
2  IMG_8330  3
3  IMG_8331  4
4  IMG_8332  5
(127, 2)
✅ Conversion completed
