In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bangla-quran-with-tafsir/en_bn_quran_tafsir.csv
/kaggle/input/bangla-quran-with-tafsir/ben_quran_with_tafsir.csv
/kaggle/input/en-bn-sahih-bukhari-muslim/en_bn_translated_sahih_hadiths.csv
/kaggle/input/en-bn-sahih-bukhari-muslim/bn_bukhari_muslim.csv
/kaggle/input/en-bn-sahih-bukhari-muslim/en_bn_translated_sahih_hadiths_final.csv
/kaggle/input/en-bn-sahih-bukhari-muslim/en_bn_bukhari_muslim.csv
/kaggle/input/en-bn-sahih-bukhari-muslim/en_bukhari_muslim.csv


# Motivation

Most of the times, TTS models struggle to pronounce few characters correctly,specially if those characters appear in a less frequent word. for example, the bangla tts models that we are sharing here can't pronounce "বায়ান" word correctly. instead of saying "bayan" it says "bajan". we need to smartly handle such cases in order to reduce TTS error rate. if we replace "বায়ান" with "বাইআন" then we can see that our system can now read "baian". here, "baian" is very close to the pronounciation of "bayan" and that way we can increase TTS accuracy a lot,specially for large documents (i.e books). we have discovered that so often bangla tts models struggle to pronounce 'য়' and it doesn't mean it makes mistake for all words that contains 'য়'. for example one of our model can read 'আয়াত' as "ayat" perfectly but another model reads 'আয়াত' as "ajrato" which is a mistake that needs to be fixed for that particular model.

in this notebook,we will try to find all unique words with 'য়' character in them and count frequency of  each unique 'য়' word in our document so that we can later decide what are the words that needs to be re-arranged in a format that helps our TTS system to reduce it's error rate.

Note that, even 1% error reduction is hugeee improvement for our TTS pipeline here because we will be making it to read gigantic books. Here accuracy is more important than speed.smart post processing should help.

# utils

In [2]:

def get_unique_য়_words(bn_words):
    words_of_interest = []
    for i in range(len(bn_words)):

        for char in bn_words[i]:
            ck = 0
            for ch in char:
                if('য়' == ch):
                    ck = 1 
            if(ck):
                words_of_interest.append(bn_words[i])
    return words_of_interest

def count_all_য়_words(mlt_all_words,bn_unique_words):
    counter = []
    for i in range(len(bn_unique_words)):
        count = 0
        for j in range(len(mlt_all_words)):
            if(bn_unique_words[i] in mlt_all_words[j]):
                count+=1

        counter.append(count)
    return counter



# sahih bukhari, muslim hadith dataset

[dataset link](https://www.kaggle.com/datasets/mobassir/en-bn-sahih-bukhari-muslim)

In [3]:
bn_bukhari_muslim = pd.read_csv('/kaggle/input/en-bn-sahih-bukhari-muslim/bn_bukhari_muslim.csv')
bn_bukhari_muslim.head(1)

Unnamed: 0,id,source_book,chapter_no,hadith_no,narrator,validity,chain_idx,text_bn,text_ar,explanation,extra_note,preface
0,0,সহিহ বুখারী,১/১. অধ্যায়ঃ,1,‘আলক্বামাহ ইব্‌নু ওয়াক্কাস আল-লায়সী (রহঃ) থেকে...,সহিহ হাদিস,"(৫৪, ২৫২৯, ৩৮৯৮, ৫০৭০, ৬৬৮৯, ৬৯৫৩; মুসলিম ২৩/৪...",আমি ‘উমর ইব্‌নুল খাত্তাব (রাঃ)-কে মিম্বারের উপ...,حَدَّثَنَا الْحُمَيْدِيُّ عَبْدُ اللَّهِ بْنُ ...,শারী‘আহ্‌র মূল উৎস হচ্ছে ওয়াহী। ওয়াহী দু’ প্রক...,এখনো পাওয়া যায়নি,আল্লাহ্‌র রসূল (সাল্লাল্লাহু 'আলাইহি ওয়া সাল্ল...


In [4]:
len('য়')

2

In [5]:
'য়' == 'য়'

False

In [6]:
'য়' == 'য়'

True

In [7]:
bn_hadiths = bn_bukhari_muslim.narrator+bn_bukhari_muslim.text_bn+bn_bukhari_muslim.explanation


In [8]:
mlt_unique_words = list(bn_hadiths.str.split(' ', expand=True).stack().unique())
len(mlt_unique_words)

83442

In [9]:
%%time

bn_unique_words = get_unique_য়_words(mlt_unique_words)

CPU times: user 121 ms, sys: 15 µs, total: 121 ms
Wall time: 121 ms


In [10]:
len(bn_unique_words)

8564

In [11]:
count_words = bn_hadiths.str.split(' ', expand=True).stack().value_counts()
count_words

‘আলাইহি           22194
(সাল্লাল্লাহু     22077
(রাঃ)             19955
ওয়া               19931
থেকে              19553
                  ...  
ধাবমান)               1
ঘোড়াটিতো              1
সমুদ্র।(আধুনিক        1
৫৪৯৪)এখনো             1
১০০৪)এখনো             1
Length: 83442, dtype: int64

In [12]:
bn_unique = set(bn_unique_words)
bn_unique_words = list(bn_unique)
len(bn_unique_words)

8266

In [13]:
mlt_all_words = list(bn_hadiths.str.split(' ', expand=True).stack())
len(mlt_all_words)

1270607

In [14]:
%%time

counter = count_all_য়_words(mlt_all_words,bn_unique_words)

CPU times: user 19min 7s, sys: 261 ms, total: 19min 7s
Wall time: 19min 9s


In [15]:
#sanity check
len(counter),len(bn_unique_words)

(8266, 8266)

In [16]:
df = pd.DataFrame(columns=['bn_words_of_interest', 'frequency_count'])
df.bn_words_of_interest = bn_unique_words
df.frequency_count = counter


In [17]:
df = df.sort_values(by=['frequency_count'], ascending=False)

In [18]:
df.to_csv("hadiths_frequent_bn_words_of_interest.csv",index = False)

In [19]:
df.head()

Unnamed: 0,bn_words_of_interest,frequency_count
4419,য়,102621
6212,ওয়া,29507
617,হয়,8486
8083,হয়ে,5611
2393,দায়,4559


# scrutinizing our [qtafsir dataset ](https://www.kaggle.com/datasets/mobassir/bangla-quran-with-tafsir)

In [20]:
qtafsir = pd.read_csv('../input/bangla-quran-with-tafsir/ben_quran_with_tafsir.csv')
qtafsir.head(1)

Unnamed: 0,text,ayat,আল_বায়ান,তাইসিরুল,মুজিবুর_রহমান,Sahih_International,tafsir_bayan_headers,tafsir_bayan_text,tafsir_zakariya_headers,tafsir_zakariya_text
0,بِسۡمِ اللّٰهِ الرَّحۡمٰنِ الرَّحِیۡمِ ﴿۱﴾\nب...,"surah 1, ayat 1",পরম করুণাময় অতি দয়ালু আল্লাহর নামে।,(আরম্ভ করছি) পরম করুণাময় অসীম দয়াময় আল্লাহর না...,"পরম করুণাময়, অসীম দয়ালু আল্লাহর নামে (শুরু করছ...","In the name of Allah, the Entirely Merciful, t...",(১) অনন্ত করুণাময় পরম দয়ালু আল্লাহর নামে (আরম্...,‘বিসমিল্লাহ’র পূর্বে ‘আক্বরাউ’ ‘আবদাউ’ অথবা ‘আ...,"১. রহমান, রহীম(১) আল্লাহর নামে।(২)","১. সাধারণত আয়াতের অনুবাদে বলা হয়ে থাকে, পরম ..."


In [21]:
bn_qtafsir = qtafsir.আল_বায়ান+qtafsir.তাইসিরুল+qtafsir.মুজিবুর_রহমান+qtafsir.tafsir_bayan_headers+qtafsir.tafsir_bayan_text+qtafsir.tafsir_zakariya_headers+qtafsir.tafsir_zakariya_text


In [22]:
len(bn_qtafsir)

6236

In [23]:
mlt_unique_words = list(bn_qtafsir.str.split(' ', expand=True).stack().unique())

In [24]:
len(mlt_unique_words)

116812

In [25]:
%%time

bn_unique_words = get_unique_য়_words(mlt_unique_words)

CPU times: user 188 ms, sys: 1 µs, total: 188 ms
Wall time: 188 ms


In [26]:
len(bn_unique_words)

7524

In [27]:
# bn_words = ' '.join(str(s) for s in bn_unique_words)
count_words = bn_qtafsir.str.split(' ', expand=True).stack().value_counts()
count_words

ও               27422
এবং             25869
তাদের           20469
আর              19362
তারা            18941
                ...  
পরিভাষাকে           1
চিরস্থায়ী।’’        1
والأرض)             1
السموات             1
১/৯]তাফসীরে         1
Length: 116812, dtype: int64

In [28]:
bn_unique = set(bn_unique_words)
bn_unique_words = list(bn_unique)
len(bn_unique_words)

7339

In [29]:
mlt_all_words = list(bn_qtafsir.str.split(' ', expand=True).stack())
len(mlt_all_words)

1809060

In [30]:
%%time

counter = count_all_য়_words(mlt_all_words,bn_unique_words)

CPU times: user 23min 56s, sys: 294 ms, total: 23min 56s
Wall time: 23min 56s


In [31]:
#sanity check
len(counter),len(bn_unique_words)

(7339, 7339)

In [32]:
df = pd.DataFrame(columns=['bn_words_of_interest', 'frequency_count'])
df.bn_words_of_interest = bn_unique_words
df.frequency_count = counter


In [33]:
df = df.sort_values(by=['frequency_count'], ascending=False)

In [34]:
df.to_csv("frequent_bn_words_of_interest.csv",index = False)

In [35]:
df

Unnamed: 0,bn_words_of_interest,frequency_count
3877,য়,81381
535,হয়,15404
7189,হয়ে,10004
5412,ওয়া,6351
3944,হয়েছ,6209
...,...,...
3323,দিয়্যাত,1
3318,(শিয়া)রা,1
3317,হৃদয়ও,1
3316,কর্তৃস্থানীয়,1
