In [1]:
import pandas as pd 
import numpy  as np
from wordfreq import word_frequency ,top_n_list,get_frequency_dict

from symspellpy import SymSpell, Verbosity
from itertools import islice

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from random import random, choice
import random
import time

import os
import re
from tqdm import tqdm  
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True,nb_workers=4)


INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
# Bangla unicode normalizer; the `bnorm` instance is what normalize() calls per word.
from bnunicodenormalizer import Normalizer 
# NOTE: this re-initialises pandarallel with 8 workers, replacing the 4-worker
# initialisation done in the imports cell.
pandarallel.initialize(progress_bar=True,nb_workers=8)
# Registers tqdm's progress_* helpers on pandas objects.
tqdm.pandas()
bnorm=Normalizer()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
from typing import Dict, List, Tuple, Any, Union


import json

from datasets import load_metric

from tqdm.auto import tqdm
from IPython.display import display, Audio, HTML

# Metric objects loaded through `datasets.load_metric`.
cer = load_metric("cer")
# NOTE(review): the name `wer` is rebound by the `def wer(...)` helper in a later
# cell, so after that cell runs `wer` no longer refers to this metric object.
wer = load_metric("wer")
import warnings 
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [4]:
def remove_punctuations(my_str):
    """Return ``my_str`` without any character listed in the blacklist below.

    Besides punctuation marks the blacklist literal also contains the ASCII
    digits and the Latin letters ``E R O e r o``, so those are stripped too.
    """
    punctuations = '''````¬£|¬¢|√ë+-*/=EROero‡ß≥‡ß¶‡ßß‡ß®‡ß©‡ß™‡ß´‡ß¨‡ß≠‡ßÆ‡ßØ012‚Äì34567‚Ä¢89‡•§!()-[]{};:'"‚Äú\‚Äô‚Ä¶,<>.‚Äö/?@#$%^&*_~‚Äò‚Äî‡••‚Äù‚Ä∞ü§£‚öΩÔ∏è‚úåÔøΩÔø∞‡ß∑Ôø∞'''
    # Keep characters in their original order, dropping blacklisted ones.
    kept = [ch for ch in my_str if ch not in punctuations]
    return "".join(kept)

def normalize(sen):
    """Normalise each whitespace-separated word of ``sen`` with ``bnorm``.

    Words for which ``bnorm(word)['normalized']`` is ``None`` are dropped;
    the remaining words are re-joined with single spaces.
    """
    kept = []
    for token in sen.split():
        cleaned = bnorm(token)['normalized']
        if cleaned is not None:
            kept.append(cleaned)
    return " ".join(kept)

def wer(ref, hyp, debug=False):
    """Align two sentences word-by-word using edit distance and return the alignment.

    Both sentences are split on whitespace. A Levenshtein cost matrix is
    filled with unit penalties for substitution, insertion and deletion, and
    the cheapest path is traced back from the bottom-right corner.

    Args:
        ref: reference sentence.
        hyp: hypothesis sentence.
        debug: kept for backward compatibility. The alignment is now always
            collected; previously calling with ``debug=False`` raised
            ``UnboundLocalError`` because ``lines`` was never created.

    Returns:
        A list of tab-separated strings ``"<op>\\t<ref word>\\t<hyp word>"``,
        ordered from the END of the sentences to the start (traceback order).
        ``op`` is ``C`` (correct), ``S`` (substitution), ``I`` (insertion) or
        ``D`` (deletion); the missing side of an insertion/deletion is ``****``.
    """
    r = ref.split()
    h = hyp.split()
    n_ref, n_hyp = len(r), len(h)

    OP_OK, OP_SUB, OP_INS, OP_DEL = 0, 1, 2, 3
    DEL_PENALTY = 1
    INS_PENALTY = 1
    SUB_PENALTY = 1

    # costs[i][j]: cheapest way to turn the first i reference words into the
    # first j hypothesis words; backtrace[i][j]: the operation that got there.
    costs = [[0] * (n_hyp + 1) for _ in range(n_ref + 1)]
    backtrace = [[OP_OK] * (n_hyp + 1) for _ in range(n_ref + 1)]

    # First column: reach an empty hypothesis by deleting every reference word.
    for i in range(1, n_ref + 1):
        costs[i][0] = DEL_PENALTY * i
        backtrace[i][0] = OP_DEL

    # First row: reach the hypothesis from an empty reference by inserting.
    for j in range(1, n_hyp + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    for i in range(1, n_ref + 1):
        for j in range(1, n_hyp + 1):
            if r[i - 1] == h[j - 1]:
                costs[i][j] = costs[i - 1][j - 1]
                backtrace[i][j] = OP_OK
                continue

            substitution_cost = costs[i - 1][j - 1] + SUB_PENALTY
            insertion_cost = costs[i][j - 1] + INS_PENALTY
            deletion_cost = costs[i - 1][j] + DEL_PENALTY

            best = min(substitution_cost, insertion_cost, deletion_cost)
            costs[i][j] = best
            # Ties are broken in the order substitution > insertion > deletion.
            if best == substitution_cost:
                backtrace[i][j] = OP_SUB
            elif best == insertion_cost:
                backtrace[i][j] = OP_INS
            else:
                backtrace[i][j] = OP_DEL

    # Walk the cheapest path backwards, recording one line per operation.
    lines = []
    i, j = n_ref, n_hyp
    while i > 0 or j > 0:
        op = backtrace[i][j]
        if op == OP_OK:
            i -= 1
            j -= 1
            lines.append("C\t" + r[i] + "\t" + h[j])
        elif op == OP_SUB:
            i -= 1
            j -= 1
            lines.append("S\t" + r[i] + "\t" + h[j])
        elif op == OP_INS:
            j -= 1
            lines.append("I\t" + "****" + "\t" + h[j])
        else:  # OP_DEL
            i -= 1
            lines.append("D\t" + r[i] + "\t" + "****")
    return lines

def error_label(ref,hyp):
    """Return the per-word operation codes aligning ``ref`` to ``hyp``.

    ``wer`` yields its alignment lines from the end of the sentence backwards,
    so they are reversed here to read left-to-right. Only the leading
    operation code of each line (C / S / I / D) is kept.
    """
    alignment = wer(ref, hyp, debug=True)
    return [re.sub(r"\t", " ", entry).split()[0] for entry in reversed(alignment)]

def wer_calc(label):
    """Return the share of error operations in ``label`` as an integer percent.

    Args:
        label: sequence of operation codes as produced by ``error_label``
            ('C', 'S', 'D', 'I').

    Returns:
        ``int((S + D + I) / len(label) * 100)``, truncated towards zero.
        An empty ``label`` yields 0 instead of raising ``ZeroDivisionError``.

    Note:
        The denominator is the alignment length (insertions included), not the
        number of reference words.
    """
    if not label:
        # Nothing was aligned (both sentences empty): no errors to report.
        return 0
    error_count = sum(label.count(op) for op in ('S', 'D', 'I'))
    return int(error_count / len(label) * 100)

# Common_voice

In [5]:
# Three exports of model output for Common Voice clips; after they are stacked
# the frame shows `path`, `wav2vec2` and `4gram` columns.
df_cv_1 = pd.read_csv('asr_bangla/data/cv_100k_p1.csv')
df_cv_2 = pd.read_csv('asr_bangla/data/cv_100k_p2.csv')
# This export also carries `source` and `audio` columns, which are filtered on below.
df_cv_3 = pd.read_csv('asr_bangla/data/symspell_nov.csv')
# Table with the `sentence` column that is later used as the reference text.
df_cv = pd.read_csv('cv_train.csv')

In [6]:
# Keep only rows whose source is 'train' and whose audio is tagged 'noisy'.
df_cv_3 = df_cv_3[(df_cv_3.source == 'train') & (df_cv_3.audio == 'noisy')]

In [7]:
# Keep the clip path and the two transcription columns only.
df_cv_3 = df_cv_3[['path','wav2vec2','4gram']]

In [8]:
# Keep the clip path and its sentence (used as the reference in error_label).
df_cv = df_cv[['path','sentence']]

In [9]:
# Stack the three transcription exports and renumber the rows 0..n-1.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_cv_1 = pd.concat([df_cv_1, df_cv_2, df_cv_3], ignore_index=True)

In [10]:
# Reduce every path to its bare file name so it can be joined on `path` below.
df_cv_1['path'] = [os.path.basename(str(raw_path)) for raw_path in df_cv_1['path']]

In [11]:
# Attach the sentence to each transcription row; clips missing on either side are dropped.
df_cv = pd.merge(df_cv_1, df_cv, on='path', how='inner')

In [12]:
df_cv

Unnamed: 0,path,wav2vec2,4gram,sentence
0,common_voice_bn_30991371.mp3,‡¶¶‡ßá‡¶ì‡ßü‡¶æ‡¶® ‡¶´‡¶∞‡¶ø‡¶¶ ‡¶ó‡¶æ‡ßü‡¶ú‡ßÄ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶ì‡ßü‡¶æ‡¶Æ‡ßÄ ‡¶≤‡ßÄ‡¶ó‡ßá‡¶∞ ‡¶â‡¶™‡¶¶‡ßá‡¶∑‡ßç...,‡¶¶‡ßá‡¶ì‡ßü‡¶æ‡¶® ‡¶´‡¶∞‡¶ø‡¶¶ ‡¶ó‡¶æ‡¶ú‡ßÄ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶ì‡ßü‡¶æ‡¶Æ‡ßÄ ‡¶≤‡ßÄ‡¶ó‡ßá‡¶∞ ‡¶â‡¶™‡¶¶‡ßá‡¶∑‡ßç‡¶ü...,‡¶¶‡ßá‡¶ì‡¶Ø‡¶º‡¶æ‡¶® ‡¶´‡¶∞‡¶ø‡¶¶ ‡¶ó‡¶æ‡¶ú‡ßÄ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶ì‡¶Ø‡¶º‡¶æ‡¶Æ‡ßÄ ‡¶≤‡ßÄ‡¶ó‡ßá‡¶∞ ‡¶â‡¶™‡¶¶‡ßá‡¶∑...
1,common_voice_bn_30991410.mp3,‡¶è‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶ï‡¶æ‡¶∞‡ßã ‡¶ó‡¶æ‡¶® ‡¶ó‡¶æ‡¶ì‡ßü‡¶æ ‡¶¶‡¶ï‡ßç‡¶∑‡¶§‡¶æ‡¶¨‡¶æ ‡¶®‡¶ø‡¶ú‡¶∏‡ßç‡¶¨ ‡¶ß‡¶∞‡ßç‡¶Æ...,‡¶è‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶ï‡¶æ‡¶∞‡¶ì ‡¶ó‡¶æ‡¶® ‡¶ó‡¶æ‡¶ì‡ßü‡¶æ ‡¶¶‡¶ï‡ßç‡¶∑‡¶§‡¶æ ‡¶¨‡¶æ ‡¶®‡¶ø‡¶ú‡¶∏‡ßç‡¶¨ ‡¶ß‡¶∞‡ßç...,‡¶è‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶ï‡¶æ‡¶∞‡¶ì ‡¶ó‡¶æ‡¶® ‡¶ó‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ ‡¶¶‡¶ï‡ßç‡¶∑‡¶§‡¶æ ‡¶¨‡¶æ ‡¶®‡¶ø‡¶ú‡¶∏‡ßç‡¶¨ ‡¶ß‡¶∞...
2,common_voice_bn_30991513.mp3,‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡ßá ‡¶§‡¶•‡ßç‡¶Ø ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶Ø‡ßá ‡¶ï‡ßã‡¶®‡ßã ‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶´‡¶æ‡¶á‡¶≤‡ßá...,‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡ßá ‡¶§‡¶•‡ßç‡¶Ø ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶Ø‡ßá ‡¶ï‡ßã‡¶®‡ßã ‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶´‡¶æ‡¶á‡¶≤‡ßá...,‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡ßá ‡¶§‡¶•‡ßç‡¶Ø ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶Ø‡ßá ‡¶ï‡ßã‡¶®‡¶ì ‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶´‡¶æ‡¶á‡¶≤‡ßá...
3,common_voice_bn_30991535.mp3,‡¶π‡¶ø‡¶∏‡¶æ‡¶¨ ‡¶™‡¶æ‡¶ì‡ßü‡¶æ‡¶∞ ‡¶™‡¶∞ ‡¶§‡¶ø‡¶®‡¶ø ‡¶∏‡ßç‡¶¨‡¶≤‡ßç‡¶™ ‡¶∏‡¶Æ‡ßü‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá‡¶á ‡¶∏‡¶Æ‡ßç‡¶™‡ßÇ...,‡¶π‡¶ø‡¶∏‡¶æ‡¶¨ ‡¶™‡¶æ‡¶ì‡ßü‡¶æ‡¶∞ ‡¶™‡¶∞ ‡¶§‡¶ø‡¶®‡¶ø ‡¶∏‡ßç‡¶¨‡¶≤‡ßç‡¶™ ‡¶∏‡¶Æ‡ßü‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá‡¶á ‡¶∏‡¶Æ‡ßç‡¶™‡ßÇ...,‡¶π‡¶ø‡¶∏‡¶æ‡¶¨ ‡¶™‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞ ‡¶™‡¶∞ ‡¶§‡¶ø‡¶®‡¶ø ‡¶∏‡ßç‡¶¨‡¶≤‡ßç‡¶™ ‡¶∏‡¶Æ‡¶Ø‡¶º‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá‡¶á ‡¶∏‡¶Æ‡ßç...
4,common_voice_bn_30991592.mp3,‡¶®‡¶¶‡ßÄ‡¶∞ ‡¶™‡¶æ‡¶®‡¶ø‡¶§‡ßá ‡¶¨‡ßü‡ßá ‡¶Ü‡¶∏‡¶æ ‡¶™‡¶≤‡¶ø ‡¶¶‡¶≤‡¶¶‡ßá‡¶∂‡ßá ‡¶ú‡¶Æ‡ßá ‡¶Æ‡¶æ‡¶ù‡ßá ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ...,‡¶®‡¶¶‡ßÄ‡¶∞ ‡¶™‡¶æ‡¶®‡¶ø‡¶§‡ßá ‡¶¨‡ßü‡ßá ‡¶Ü‡¶∏‡¶æ ‡¶™‡¶≤‡¶ø ‡¶§‡¶≤‡¶¶‡ßá‡¶∂‡ßá ‡¶ú‡¶Æ‡ßá ‡¶Æ‡¶æ‡¶ù‡ßá ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ...,‡¶®‡¶¶‡ßÄ‡¶∞ ‡¶™‡¶æ‡¶®‡¶ø‡¶§‡ßá ‡¶¨‡¶Ø‡¶º‡ßá ‡¶Ü‡¶∏‡¶æ ‡¶™‡¶≤‡¶ø ‡¶§‡¶≤‡¶¶‡ßá‡¶∂‡ßá ‡¶ú‡¶Æ‡ßá ‡¶Æ‡¶æ‡¶ù‡ßá‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ...
...,...,...,...,...
100931,common_voice_bn_31459116.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡¶°‡¶º‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡¶°‡¶º‡¶æ‡¶Ø‡¶º ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®‡•§
100932,common_voice_bn_31459130.mp3,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡¶Ø‡¶º...
100933,common_voice_bn_31459161.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...
100934,common_voice_bn_31637719.mp3,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã‡•§


# Open_SLR

In [13]:
# Model output for the SLR clips, split over six export files
# (note the file suffixes are not loaded in numeric order).
df_slr_1 = pd.read_csv('asr_bangla/data/slr_1_60.csv')
df_slr_2 = pd.read_csv('asr_bangla/data/slr_150_200.csv')
df_slr_3 = pd.read_csv('asr_bangla/data/slr_60_70.csv')
df_slr_4 = pd.read_csv('asr_bangla/data/slr_70_80.csv')
df_slr_5 = pd.read_csv('asr_bangla/data/slr_80_90.csv')
df_slr_6 = pd.read_csv('asr_bangla/data/slr_90_100.csv')

In [14]:
# Stack the six SLR exports and renumber the rows 0..n-1.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df_slr_1 = pd.concat(
    [df_slr_1, df_slr_2, df_slr_3, df_slr_4, df_slr_5, df_slr_6],
    ignore_index=True,
)

In [15]:
# Reduce every path to its bare file name so it can be joined on `path` below.
df_slr_1['path'] = [os.path.basename(str(raw_path)) for raw_path in df_slr_1['path']]

In [16]:
df_slr_1

Unnamed: 0,path,wav2vec2,4gram
0,000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá
1,000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá
2,00005debc7.flac,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•
3,00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
4,00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá
...,...,...,...
149995,74c6f3b531.flac,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡¶∞‡ßã ‡¶ï‡¶æ‡¶ú ‡¶π‡ßü‡ßá‡¶õ‡ßá,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡ßú ‡¶ï‡¶æ‡¶ú ‡¶π‡ßü‡ßá‡¶õ‡ßá
149996,74c702e2d5.flac,‡¶ú‡ßÅ‡¶®‡ßç ‡¶ñ‡¶æ‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤,‡¶ú‡ßÅ‡¶® ‡¶ñ‡ßá‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤
149997,74c706e710.flac,‡¶è‡¶∞‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂‡ßá,‡¶è‡¶∞‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá
149998,74c70ce063.flac,‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡¶®‡ßü‡¶∂‡¶≤ ‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶ú‡¶®‡ßá‡¶∞,‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡¶®‡ßü ‡¶∂ ‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶ú‡¶®‡ßá‡¶∞


In [17]:
# Table that supplies the `sentence` column for the SLR clips after the merge below.
df_slr = pd.read_csv('slr_train.csv')

In [18]:
# Same file-name reduction as for the transcription frame, so both sides join on `path`.
df_slr['path'] = [os.path.basename(str(raw_path)) for raw_path in df_slr['path']]

In [19]:
# Attach the sentence to each SLR transcription row; unmatched clips are dropped.
df_slr = pd.merge(df_slr_1, df_slr, on='path', how='inner')

In [20]:
# Prefix every SLR file name with 'slr_' — presumably so the rows stay
# distinguishable from Common Voice clips once both sets are stacked (confirm).
df_slr['path'] = 'slr_' + df_slr.path

In [21]:
df_slr

Unnamed: 0,path,wav2vec2,4gram,sentence
0,slr_000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá ‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá
1,slr_000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è ‡¶ß‡¶∞‡¶£‡ßá‡¶∞ ‡¶ï‡¶æ‡¶∞‡ßç‡¶° ‡¶®‡¶ø‡¶Ø‡¶º‡ßá
2,slr_00005debc7.flac,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•
3,slr_00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡¶Ø‡¶º ‡¶π‡¶Ø‡¶º‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
4,slr_00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá
...,...,...,...,...
149995,slr_74c6f3b531.flac,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡¶∞‡ßã ‡¶ï‡¶æ‡¶ú ‡¶π‡ßü‡ßá‡¶õ‡ßá,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡ßú ‡¶ï‡¶æ‡¶ú ‡¶π‡ßü‡ßá‡¶õ‡ßá,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡¶°‡¶º ‡¶ï‡¶æ‡¶ú ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá
149996,slr_74c702e2d5.flac,‡¶ú‡ßÅ‡¶®‡ßç ‡¶ñ‡¶æ‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤,‡¶ú‡ßÅ‡¶® ‡¶ñ‡ßá‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤,‡¶ù‡ßÅ‡¶®‡ßÅ‡¶ñ‡¶æ‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤
149997,slr_74c706e710.flac,‡¶è‡¶∞‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂‡ßá,‡¶è‡¶∞‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá,‡¶è‡¶∞ ‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá
149998,slr_74c70ce063.flac,‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡¶®‡ßü‡¶∂‡¶≤ ‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶ú‡¶®‡ßá‡¶∞,‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡¶®‡ßü ‡¶∂ ‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶ú‡¶®‡ßá‡¶∞,‡ß®‡ß¨ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡ßØ‡¶∂ ‡ß®‡ß¨ ‡¶ú‡¶®‡ßá‡¶∞


# Data_Merged

In [22]:
# Stack SLR rows on top of Common Voice rows. The index is left as-is here and
# is reset in a later cell. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df_actual = pd.concat([df_slr, df_cv])

In [23]:
# Drop every row that has a missing value in any column.
df_actual = df_actual.dropna()

In [24]:
# Renumber rows 0..n-1 after stacking and dropping rows.
df_actual = df_actual.reset_index(drop=True)

In [25]:
# Give the column an identifier-safe name: later cells read it as `x.arpa_4gram`,
# which a name starting with a digit would not allow.
df_actual = df_actual.rename(columns={'4gram': 'arpa_4gram'})

In [26]:
df_actual

Unnamed: 0,path,wav2vec2,arpa_4gram,sentence
0,slr_000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá ‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá
1,slr_000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è ‡¶ß‡¶∞‡¶£‡ßá‡¶∞ ‡¶ï‡¶æ‡¶∞‡ßç‡¶° ‡¶®‡¶ø‡¶Ø‡¶º‡ßá
2,slr_00005debc7.flac,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•
3,slr_00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡¶Ø‡¶º ‡¶π‡¶Ø‡¶º‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
4,slr_00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá
...,...,...,...,...
248612,common_voice_bn_31459116.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡¶°‡¶º‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡¶°‡¶º‡¶æ‡¶Ø‡¶º ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®‡•§
248613,common_voice_bn_31459130.mp3,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡¶Ø‡¶º...
248614,common_voice_bn_31459161.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...
248615,common_voice_bn_31637719.mp3,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã‡•§


In [27]:
# Clean the reference sentences in two parallel passes:
# first strip blacklisted characters, then unicode-normalise each word.
df_actual['sentence'] = df_actual['sentence'].parallel_apply(remove_punctuations)
df_actual['sentence'] = df_actual['sentence'].parallel_apply(normalize)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [28]:
#df_actual.to_csv('df_data_actual.csv',index=False)

In [29]:
# Per-row list of C/S/I/D codes aligning the reference sentence to the wav2vec2 output.
df_actual['wav2vec2_label'] = df_actual.parallel_apply(
    lambda row: error_label(row['sentence'], row['wav2vec2']),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [30]:
# Per-row list of C/S/I/D codes aligning the reference sentence to the 4-gram output.
df_actual['arpa_label'] = df_actual.parallel_apply(
    lambda row: error_label(row['sentence'], row['arpa_4gram']),
    axis=1,
)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [32]:
# Integer error percentage of the wav2vec2 output, derived from its label list.
df_actual['wer_wav2vec2'] = df_actual['wav2vec2_label'].parallel_apply(wer_calc)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [33]:
# Integer error percentage of the 4-gram output, derived from its label list.
df_actual['wer_arpa'] = df_actual['arpa_label'].parallel_apply(wer_calc)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [34]:
# Persist the labelled frame without the index column.
# NOTE(review): the *_label columns hold Python lists; in CSV they are stored as
# text, so they will presumably come back as strings when re-read — confirm.
df_actual.to_csv('asr_bangla/data/asr_cv_slr.csv',index=False)

In [None]:
df_actual[(df_actual.wer_wav2vec2 >= 30) & (df_actual.wer_wav2vec2 <= 90)]

# Custom_data_prothom_alo

In [None]:
# SymSpell index configured with a maximum dictionary edit distance of 5 and a
# prefix length of 7.
sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=7)
dictionary_path = 'asr_bangla/data/prothom_alo_word_freq.txt'
# Positional args 0 and 1 are presumably the term and count column positions in
# the space-separated frequency file — TODO confirm against the symspellpy API.
# NOTE(review): the return value is ignored, so a failed load would go unnoticed.
sym_spell.load_dictionary(dictionary_path, 0, 1,separator=" ")

In [None]:
# Raw text corpus; the `sentence` column is the one used further down.
df_prothom_alo = pd.read_csv('bangla_text_data.csv')

# Drop every row that has a missing value in any column.
df_prothom_alo= df_prothom_alo.dropna(how='any')

In [None]:
def get_len(sen):
    """Return the number of whitespace-separated tokens in ``sen``."""
    return len(sen.split())

In [None]:
# Random subset of 300,000 rows, drawn without a fixed seed.
# NOTE(review): sample() raises ValueError if the frame has fewer rows than requested.
df_prothom_alo = df_prothom_alo.sample(n=300000)

In [None]:
# Word count per sentence; values are stringified first so non-string cells do not break split().
df_prothom_alo['lena'] = [get_len(str(text)) for text in df_prothom_alo['sentence']]

In [None]:
df_prothom_alo.lena.value_counts()

In [None]:
# Keep sentences of 3 to 15 words (both bounds included).
df_prothom_alo = df_prothom_alo[df_prothom_alo['lena'].between(3, 15)]

In [None]:
# Random subset of 100,000 length-filtered rows, drawn without a fixed seed.
# NOTE(review): sample() raises ValueError if fewer rows than requested survived the filter.
df_prothom_alo = df_prothom_alo.sample(n=100000)

In [None]:
df_prothom_alo.lena.value_counts()

In [None]:
# Snapshot the sampled sentences (index omitted) so the random subset can be reloaded.
df_prothom_alo.to_csv('custom_data_for_bert.csv',index=False)

In [None]:
# Reload the snapshot written by the previous cell.
df_prothom_alo = pd.read_csv('custom_data_for_bert.csv')

In [None]:
df_prothom_alo

In [None]:
# Pool of single characters that apply_error() draws from for the 'Character' error.
# A comma was missing after the last literal of the first row, which silently
# fused it with the first literal of the second row into one two-character entry.
character = ['‡¶Ö','‡¶Ü','‡¶á','‡¶à','‡¶â','‡¶ä','‡¶è','‡¶ê','‡¶ì','‡¶î','‡¶ã','‡¶É','‡¶Ç','‡ßé',
             '‡¶ï','‡¶ñ','‡¶ó','‡¶ò','‡¶ô','‡¶ö', '‡¶õ','‡¶Ø','‡¶ú', '‡¶ù','‡¶ü','‡¶†','‡¶°','‡¶∞','‡ßú','‡ßù',
             '‡¶£','‡¶§','‡¶•','‡¶¶','‡¶ß','‡¶®','‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶∏','‡¶∂','‡¶∑','‡ßü','‡¶π']

In [None]:
JuktakkhorList = [' ‡¶ï‡ßç‡¶ü ' , ' ‡¶ï‡ßç‡¶ï ' , ' ‡¶ï‡ßç‡¶§ ' , ' ‡¶ï‡ßç‡¶Ø ' , ' ‡¶ï‡ßç‡¶∞ ' , ' ‡¶ï‡ßç‡¶≤ ' , ' ‡¶ï‡ßç‡¶∑ ' , ' ‡¶ï‡ßç‡¶∑‡ßç‡¶£ ' , ' ‡¶ï‡ßç‡¶∑‡ßç‡¶Æ ' , ' ‡¶ï‡ßç‡¶∑‡ßç‡¶Ø ' , ' ‡¶ï‡ßç‡¶∏ ' , ' ‡¶ñ‡ßç‡¶∞ ' , ' ‡¶ó‡ßç‡¶ß ' , ' ‡¶ó‡ßç‡¶ß‡ßç‡¶Ø ' , ' ‡¶ó‡ßç‡¶® ' , ' ‡¶ó‡ßç‡¶®‡ßç‡¶Ø ' , ' ‡¶ó‡ßç‡¶¨ ' , ' ‡¶ó‡ßç‡¶∞ ' , ' ‡¶ó‡ßç‡¶∞‡ßç‡¶Ø ' , ' ‡¶ó‡ßç‡¶≤ ' , ' ‡¶ò‡ßç‡¶® ' , ' ‡¶ò‡ßç‡¶∞ ' , ' ‡¶ô‡ßç‡¶ï‡ßç‡¶Ø ' , ' ‡¶ô‡ßç‡¶ó‡ßç‡¶Ø ' , ' ‡¶ö‡ßç‡¶ö ' , ' ‡¶ö‡ßç‡¶õ‡ßç‡¶¨ ' , ' ‡¶ö‡ßç‡¶Ø ' , ' ‡¶ú‡ßç‡¶ú ' , ' ‡¶ú‡ßç‡¶ú‡ßç‡¶¨ ' , ' ‡¶ü‡ßç‡¶ü ' , ' ‡¶ú‡ßç‡¶¨ ' , ' ‡¶ú‡ßç‡¶Ø ' , ' ‡¶ú‡ßç‡¶∞ ' , ' ‡¶ü‡ßç‡¶Ø ' , ' ‡¶ü‡ßç‡¶∞ ' , ' ‡¶°‡ßç‡¶° ' , ' ‡¶°‡ßç‡¶∞ ' , ' ‡¶£‡ßç‡¶ü ' , ' ‡¶£‡ßç‡¶† ' , ' ‡¶£‡ßç‡¶° ' , ' ‡¶£‡ßç‡¶£ ' , ' ‡¶£‡ßç‡¶Ø ' , ' ‡ßé‡¶ï ' , ' ‡ßé‡¶ñ ' , ' ‡¶§‡ßç‡¶§ ' , ' ‡¶§‡ßç‡¶§‡ßç‡¶¨ ' , ' ‡¶§‡ßç‡¶§‡ßç‡¶Ø ' , ' ‡¶§‡ßç‡¶® ' , ' ‡ßé‡¶™ ' , ' ‡¶§‡ßç‡¶¨ ' , ' ‡¶§‡ßç‡¶Æ ' , ' ‡¶§‡ßç‡¶Æ‡ßç‡¶Ø ' , ' ‡¶§‡ßç‡¶Ø ' , ' ‡¶§‡ßç‡¶∞ ' , ' ‡¶§‡ßç‡¶∞‡ßç‡¶Ø ' , ' ‡ßé‡¶∏ ' , ' ‡¶¶‡ßç‡¶ò ' , ' ‡¶¶‡ßç‡¶¶ ' , ' ‡¶¶‡ßç‡¶ß ' , ' ‡¶¶‡ßç‡¶¨ ' , ' ‡¶¶‡ßç‡¶≠ ' , ' ‡¶¶‡ßç‡¶≠‡ßç‡¶∞ ' , ' ‡¶¶‡ßç‡¶Æ ' , ' ‡¶¶‡ßç‡¶Ø ' , ' ‡¶¶‡ßç‡¶∞ ' , ' ‡¶¶‡ßç‡¶∞‡ßç‡¶Ø ' , ' ‡¶ß‡ßç‡¶¨ ' , ' ‡¶ß‡ßç‡¶Ø ' , ' ‡¶ß‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶ü ' , ' ‡¶®‡ßç‡¶ü‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶† ' , ' ‡¶®‡ßç‡¶° ' , ' ‡¶®‡ßç‡¶°‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶§ ' , ' ‡¶®‡ßç‡¶§‡ßç‡¶¨ ' , ' ‡¶®‡ßç‡¶§‡ßç‡¶Ø ' , ' ‡¶®‡ßç‡¶§‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßç‡¶Ø ' , ' ‡¶®‡ßç‡¶• ' , ' ‡¶®‡ßç‡¶¶ ' , ' ‡¶®‡ßç‡¶¶‡ßç‡¶¨ ' , ' ‡¶®‡ßç‡¶¶‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶ß ' , ' ‡¶®‡ßç‡¶® ' , ' ‡¶®‡ßç‡¶Ø ' , ' ‡¶™‡ßç‡¶ü ' , ' ‡¶™‡ßç‡¶§ ' , ' ‡¶™‡ßç‡¶® ' , ' ‡¶™‡ßç‡¶™ ' , ' ‡¶™‡ßç‡¶Ø ' , ' ‡¶¨‡ßç‡¶¶ ' , ' ‡¶¨‡ßç‡¶ß ' , ' ‡¶¨‡ßç‡¶¨ ' , ' ‡¶¨‡ßç‡¶∞ ' , ' ‡¶≠‡ßç‡¶Ø ' , ' ‡¶≠‡ßç‡¶∞ ' , ' ‡¶Æ‡ßç‡¶™‡ßç‡¶∞ ' , ' ‡¶Æ‡ßç‡¶¨ ' , ' ‡¶Æ‡ßç‡¶Æ ' , ' ‡¶Æ‡ßç‡¶Ø ' , ' ‡¶Æ‡ßç‡¶∞ ' , ' ‡¶Ø‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶ï ' , ' ‡¶∞‡ßç‡¶ó‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶ò‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶ú‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶•‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶¨‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶ñ ' , ' ‡¶∞‡ßç‡¶ó ' , ' ‡¶∞‡ßç‡¶ò ' , ' ‡¶∞‡ßç‡¶ö ' , 
' ‡¶∞‡ßç‡¶õ ' , ' ‡¶∞‡ßç‡¶ú ' , ' ‡¶∞‡ßç‡¶ù ' , ' ‡¶∞‡ßç‡¶ü ' , ' ‡¶∞‡ßç‡¶° ' , ' ‡¶∞‡ßç‡¶£ ' , ' ‡¶∞‡ßç‡¶§ ' , ' ‡¶∞‡ßç‡¶• ' , ' ‡¶∞‡ßç‡¶¶ ' , ' ‡¶∞‡ßç‡¶¶‡ßç‡¶¨ ' , ' ‡¶∞‡ßç‡¶¶‡ßç‡¶∞ ' , ' ‡¶∞‡ßç‡¶ß ' , ' ‡¶∞‡ßç‡¶ß‡ßç‡¶¨ ' , ' ‡¶≤‡ßç‡¶ü ' , ' ‡¶≤‡ßç‡¶° ' , ' ‡¶≤‡ßç‡¶™ ' , ' ‡¶≤‡ßç‡¶Æ ' , ' ‡¶≤‡ßç‡¶Ø ' , ' ‡¶≤‡ßç‡¶≤ ' , ' ‡¶∂‡ßç‡¶õ ' , ' ‡¶∂‡ßç‡¶® ' , ' ‡¶∂‡ßç‡¶¨ ' , ' ‡¶∂‡ßç‡¶∞ ' , ' ‡¶∂‡ßç‡¶≤ ' , ' ‡¶∑‡ßç‡¶ï ' , ' ‡¶∑‡ßç‡¶ï‡ßç‡¶∞ ' , ' ‡¶∑‡ßç‡¶ü ' , ' ‡¶∑‡ßç‡¶ü‡ßç‡¶Ø ' , ' ‡¶∑‡ßç‡¶ü‡ßç‡¶∞ ' , ' ‡¶∑‡ßç‡¶† ' , ' ‡¶∑‡ßç‡¶™ ' , ' ‡¶∏‡ßç‡¶ü ' , ' ‡¶∏‡ßç‡¶ü‡ßç‡¶∞ ' , ' ‡¶∏‡ßç‡¶§‡ßç‡¶∞ ' , ' ‡¶∏‡ßç‡¶§ ' , ' ‡¶∏‡ßç‡¶•‡ßç‡¶Ø ' , ' ‡¶∏‡ßç‡¶Ø ' , ' ‡¶∏‡ßç‡¶∞ ' , ' ‡¶∏‡ßç‡¶≤ ' , ' ‡¶π‡ßç‡¶∞ ' , ' ‡¶π‡ßç‡¶≤ ' , ' ‡¶ó‡ßç‡¶Æ ' , ' ‡¶Æ‡ßç‡¶≠ ' , ' ‡¶∏‡ßç‡¶Æ ' , ' ‡¶ô‡ßç‡¶ò ' , ' ‡¶ô‡ßç‡¶ï‡ßç‡¶∑ ' , ' ‡¶ô‡ßç‡¶ñ ' , ' ‡¶ô‡ßç‡¶ï ' , ' ‡¶ô‡ßç‚Äå‡¶ï‡ßç‡¶§ ' , ' ‡¶û‡ßç‡¶ö ' , ' ‡¶û‡ßç‡¶õ ' , ' ‡¶û‡ßç‡¶ú ' , ' ‡¶π‡ßç‡¶® ' , ' ‡¶ô‡ßç‡¶ó ' , ' ‡¶ú‡ßç‡¶û ' , ' ‡¶ó‡ßç‡¶Ø ' , ' ‡¶ö‡ßç‡¶õ '] 

In [None]:
ReplaceDict = {
      '‡¶ï' : ['‡¶≤','‡¶Ø'],
      '‡¶ñ' : ['‡¶ï‡¶ó','‡¶ï‡¶ú','‡¶≤‡¶π','‡¶ù'],
      '‡¶ó' : ['‡¶´','‡¶π'],
      '‡¶ò' : ['‡¶´‡¶ó','‡¶π‡¶ú'],
      '‡¶ô' : ['‡¶¨','‡¶Æ'],
      '‡¶ö' : ['‡¶≠','‡¶ö‡¶ú','‡¶ö‡¶ó','‡¶≠‡¶π'],
      '‡¶õ' : ['‡¶≠','‡¶ö‡¶ú','‡¶ö‡¶ó','‡¶≠‡¶π'],
      '‡¶ú' : ['‡¶ï','‡¶π'],
      '‡¶ù' : ['‡¶ï‡¶ú','‡¶π‡¶ó'],
      '‡¶ü' : ['‡¶∞'],
      '‡¶†' : ['‡¶§‡¶ú','‡¶§‡¶ó','‡¶∞‡¶π'],
      '‡¶°' : ['‡¶∏','‡¶´'],
      '‡¶¢' : ['‡¶¶‡¶ú','‡¶¶‡ßç‡¶ó','‡¶∂','‡¶´‡¶π'],
      '‡¶£' : ['‡¶¨','‡¶Æ'],
      '‡¶§' : ['‡¶∞'],
      '‡¶•' : ['‡¶§‡¶ú', '‡¶§‡¶ó', '‡¶∞‡¶π'],
      '‡¶¶' : ['‡¶∏', '‡¶´'],
      '‡¶ß' : ['‡¶¶‡¶ú', '‡¶¶‡ßç‡¶ó', '‡¶∂' ,'‡¶´‡¶π'],
      '‡¶®' : ['‡¶¨','‡¶Æ'],
      '‡¶™' : ['‡¶ì',' ‡ßã'],
      '‡¶´' : ['‡¶¶','‡¶ó'],
      '‡¶¨' : ['‡¶≠','‡¶®'],
      '‡¶≠' : ['‚Äç‡¶¨','‡¶ö'],
      '‡¶Æ' : ['‡¶®'],
      '‡¶Ø' : ['‡¶π','‡¶ï'],
      '‡¶∞' : ['‡¶è',' ‡ßá','‡¶§'],
      '‡¶≤' : ['‡¶ï'],
      '‡¶∂' : ['‡¶∏‡¶ú','‡¶∏‡¶ó','‡¶Ü‡¶π','‡¶¢'],
      '‡¶∑' : ['‡¶∏‡¶ú','‡¶∏‡¶ó','‡¶Ü‡¶π','‡¶¢'],
      '‡¶∏' : ['‡¶Ü',' ‡¶æ','‡¶¶'],
      '‡¶π' : ['‡¶ó','‡¶Ø'],
      '‡ßü' : ['‡¶§','‡¶â','‡ßÅ'],
      '‡ßú' : ['‡¶è',' ‡ßá','‡¶§'],
      '‡ßù' : ['‡¶è',' ‡ßá','‡¶§'],
      '‡ßé' : ['‡¶∞'],
      '‡¶Ç' : ['‡¶¨','‡¶Æ'],
      '‡¶É' : ['‡¶ó','‡¶Ø'],
      '‡¶Ö' : ['‡¶™',' ‡¶ø','‡¶á'],
      '‡¶Ü' : ['‡¶∏'],
      '‡¶á' : ['‡¶â','‡¶Ö'],
      '‡¶à' : ['‡¶â','‡¶Ö'],
      '‡¶â' : ['‡¶á',' ‡¶ø'],
      '‡¶ä' : ['‡¶á',' ‡¶ø'],
      '‡¶ã' : [''],
      '‡¶è' : ['‡¶ì','‡¶∞'],
      '‡¶ê' : [''],
      '‡¶ì' : ['‡¶™','‡¶á'],
      '‡¶î' : [''],
      '‡¶æ' : ['‡¶∏'],
      '‡¶ø' : ['‡¶â','‡¶Ö'],
      '‡ßã' : ['‡¶™',' ‡¶ø'],
      '‡ßå' : [''],
      '‡ßá' : ['‡ßã','‡¶∞'],
      '‡ßà' : ['']
    }

In [None]:
SameClusterDict = {
    '‡¶Ö' : ['‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡¶Ü' : ['‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ‡¶Å','‡¶Ü‡¶É','‡¶ì‡ßü‡¶æ','‡¶è'],
    '‡¶á':['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],
    '‡¶à':['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],
    '‡¶â' : ['‡¶â''‡¶â‡¶Å','‡¶ä','‡ßÅ'],
    '‡¶ä' : ['‡¶â''‡¶â‡¶Å','‡¶ä','‡ßÅ'],
    '‡¶ã' : ['‡¶∞‡¶ø','‡¶π‡ßç‡¶∞‡ßÄ','‡ßú‡¶ø','‡ßù‡¶ø','‡¶¨‡ßç‡¶∞‡¶á','‡ßÉ','‡¶π‡ßç‡¶∞'],
    '‡¶è' : ['‡¶è','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶Ü‡¶É'],
    '‡¶ê': ['‡¶Ö‡¶á','‡¶à','‡¶ì‡¶á' , '‡¶á', '‡¶Ö‡ßç‡¶Ø‡¶æ‡¶á'],
    '‡¶ì': ['‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡¶î': [ '‡¶ì','‡¶Ü','‡¶Ö‡¶â','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡¶ï': ['‡¶ï','‡¶ñ','‡¶ó', '‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ï‡ßã‡¶Å',' ‡¶ï‡ßã','‡¶ñ','‡¶ñ‡ßã','‡¶ñ‡¶ì','‡¶ï‡ßç‡¶∑‡¶ì','‡¶ï‡¶π‡ßã‡¶Å','‡¶ï‡¶π‡¶ì','‡¶ó‡ßã‡¶Å','‡¶ó‡ßü' ],
    '‡¶ñ': ['‡¶ï','‡¶ñ','‡¶ó', '‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ï‡ßã‡¶Å',' ‡¶ï‡ßã','‡¶ñ','‡¶ñ‡ßã','‡¶ñ‡¶ì','‡¶ï‡ßç‡¶∑‡¶ì','‡¶ï‡¶π‡ßã‡¶Å','‡¶ï‡¶π‡¶ì','‡¶ó‡ßã‡¶Å','‡¶ó‡ßü' ],
    '‡¶ó': ['‡¶ï','‡¶ñ','‡¶ó', '‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ï‡ßã‡¶Å',' ‡¶ï‡ßã','‡¶ñ','‡¶ñ‡ßã','‡¶ñ‡¶ì','‡¶ï‡ßç‡¶∑‡¶ì','‡¶ï‡¶π‡ßã‡¶Å','‡¶ï‡¶π‡¶ì','‡¶ó‡ßã‡¶Å','‡¶ó‡ßü' ],
    '‡¶ò ': ['‡¶ñ','‡¶ñ‡ßã','‡¶ñ‡¶ì','‡¶ï‡ßç‡¶∑‡¶ì','‡¶ï‡¶π‡ßã‡¶Å','‡¶ï‡¶π‡¶ì','‡¶ó‡ßã‡¶Å','‡¶ó‡ßü' ,'‡¶ò‡ßã ','‡¶ò‡¶ì'],
    '‡¶ô': ['‚óå‡¶Ç','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶Ö‡¶Ç'],
    '‡¶ö': ['‡¶ö','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ö‡ßã‡¶Å','‡¶ö‡¶ì','‡¶õ‡ßã‡¶Å','‡¶õ‡ßã','‡¶õ‡¶ì','‡¶õ'],
    '‡¶õ': ['‡¶ö','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ö‡ßã‡¶Å','‡¶ö‡¶ì','‡¶õ‡ßã‡¶Å','‡¶õ‡ßã','‡¶õ‡¶ì','‡¶õ'],
    '‡¶ú': ['‡¶ú','‡¶Ø','‡¶ú‡ßã','‡¶Ø‡ßã','‡¶ù','‡¶ù‡¶ì'],
    '‡¶ù': ['‡¶ú','‡¶Ø','‡¶ú‡ßã','‡¶Ø‡ßã','‡¶ù','‡¶ù‡¶ì'],
    '‡¶û': ['‡¶®','‡¶ô','‡¶£','‡¶û','‡¶á','‡¶à','‡¶è','‡ßá','‡ßü','‡¶®‡¶ì' ],
    '‡¶ü' : ['‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü'],
    '‡¶†' : ['‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü'],
    '‡¶°' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶¢': ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É','‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶£': ['‡¶®','‡¶ô','‡¶£','‡¶û','‡¶á','‡¶à','‡¶è','‡ßá','‡ßü','‡¶®‡¶ì' ],
    '‡¶§' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶•': ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶¶' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶ß' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶®': ['‡¶®','‡¶ô','‡¶£','‡¶û','‡¶á','‡¶à','‡¶è','‡ßá','‡ßü','‡¶®‡¶ì' ],
    '‡¶™' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶´' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶¨' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶≠' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶Æ' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶Ø' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶∞' : ['‡¶∞','‡ßú','‡ßù','‡¶∞‡¶ì','‡¶∞‡ßã‡¶Å','‡¶∞‡ßã','‡¶∞‡ßü','‡¶ã', '‡¶π‡¶∞'  ],
    '‡¶≤' : ['‡¶≤','‡¶≤‡ßü','‡¶≤‡ßã'],
    '‡¶∂' : ['‡¶∂','‡¶∑','‡¶∏','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶∂‡ßã','‡¶∏‡¶Ö','‡¶∂‡ßü'],
    '‡¶∑' : ['‡¶∂','‡¶∑','‡¶∏','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶∂‡ßã','‡¶∏‡¶Ö','‡¶∂‡ßü'],
    '‡¶∏' : ['‡¶∂','‡¶∑','‡¶∏','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶∂‡ßã','‡¶∏‡¶Ö','‡¶∂‡ßü'],
    '‚Äç‡¶π': ['‡¶π','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡ßú' : ['‡¶∞','‡ßú','‡ßù','‡¶∞‡¶ì','‡¶∞‡ßã‡¶Å','‡¶∞‡ßã','‡¶∞‡ßü','‡¶ã', '‡¶π‡¶∞'  ],
    '‚Äç‡ßù': ['‡¶∞','‡ßú','‡ßù','‡¶∞‡¶ì','‡¶∞‡ßã‡¶Å','‡¶∞‡ßã','‡¶∞‡ßü','‡¶ã', '‡¶π‡¶∞'  ],
    '‚Äç‡ßü ': ['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],

    '‡ßé' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶Ç' : ['‚óå‡¶Ç','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶Ö‡¶Ç'],
    '‡¶É': ['‡¶π','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‚Äç‡¶Å' : [''],

    '‚Äç‡¶ø': ['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],
    '‡ßÄ' : ['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],
    '‡ßá':['‡¶è','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶Ü‡¶É'],
    '‡ßà'  :['‡¶è','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶Ü‡¶É'],
    '‡ßã' : ['‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡ßá‡ßó':['‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡ßÉ' : ['‡¶∞‡¶ø','‡¶π‡ßç‡¶∞‡ßÄ','‡ßú‡¶ø','‡ßù‡¶ø','‡¶¨‡ßç‡¶∞‡¶á','‡ßÉ','‡¶π‡ßç‡¶∞'],
     '‚óå‡ßÇ'  : ['‡¶â''‡¶â‡¶Å','‡¶ä','‡ßÅ'],
     ' ‡ßÅ'  : ['‡¶â''‡¶â‡¶Å','‡¶ä','‡ßÅ']
}

In [None]:
def apply_error(string,error):
    """Apply one named corruption to ``string`` and return the result.

    Args:
        string: the text unit to corrupt (error_generator passes one list
            element at a time).
        error: one of
            'Cluster'    - replace by a random entry of ``SameClusterDict[string]``
                           (unchanged if ``string`` is not a key);
            'Replace'    - replace by a random entry of ``ReplaceDict[string]``
                           (unchanged if ``string`` is not a key);
            'Juktakkhor' - replace by a random entry of ``JuktakkhorList``;
            'Character'  - replace by a random entry of ``character``;
            'Deletion'   - replace by the empty string;
            'Insertion'  - append the result of a second, randomly chosen
                           non-insertion error applied to ``string``;
            'Noerror'    - leave unchanged.
            Any other value also leaves ``string`` unchanged.

    Returns:
        The (possibly) corrupted string.

    Notes:
        * The insertion branch used to test for the lowercase ``"insertion"``
          while error_generator passes ``'Insertion'``, so it never ran. Both
          spellings are accepted now.
        * The global random generator is no longer re-seeded from the clock on
          every call; seed it once (or not at all) at notebook level.
        * NOTE(review): some lists in ReplaceDict / SameClusterDict contain
          ``''``, so 'Replace'/'Cluster' can also delete. A few SameClusterDict
          rows lack a comma between adjacent literals, and some keys carry a
          trailing space and so cannot match a single character.
    """
    # Errors that may be drawn for the inserted part; 'Insertion' itself is
    # excluded so the recursion is at most one level deep.
    errors = ['Cluster','Replace','Juktakkhor','Character','Deletion','Noerror']
    if error == "Cluster":
        if string in SameClusterDict:
            string = random.choice(SameClusterDict[string])
    elif error == "Replace":
        if string in ReplaceDict:
            string = random.choice(ReplaceDict[string])
    elif error == "Juktakkhor":
        string = random.choice(JuktakkhorList)
    elif error == "Character":
        string = random.choice(character)
    elif error == "Deletion":
        string = ""
    elif error in ("Insertion", "insertion"):
        temp_error = random.choice(errors)
        string = string + apply_error(string, temp_error)
    # "Noerror" and unknown names fall through unchanged.
    return string

In [None]:
def error_generator(word):
    """Return a randomly corrupted copy of ``word``.

    The word is split into a list of characters. A number of passes is drawn
    uniformly from ``0 .. int(len(word) * 0.40)``; in every pass each list
    element is corrupted with probability 0.5 by a randomly chosen error type
    (see ``apply_error``). Elements may grow to several characters or become
    empty along the way. Finally the pieces are joined and all spaces removed
    (the entries of ``JuktakkhorList`` are padded with spaces).

    Words of one or two characters are always returned unchanged because the
    upper bound for the number of passes is 0.

    Changes from the earlier version: the leftover debug ``print`` of the pass
    count is gone, and the global random generator is no longer re-seeded from
    the clock inside the loops.
    """
    errors = ['Cluster','Replace','Juktakkhor','Character','Deletion','Insertion','Noerror']
    pieces = list(word)
    # Despite its name this is a pass count, not a ratio.
    noise_ratio = random.randint(0, int(len(word) * 0.40))
    for _ in range(noise_ratio):
        for i in range(len(pieces)):
            if random.random() >= 0.5:
                pieces[i] = apply_error(pieces[i], random.choice(errors))
    return "".join(pieces).replace(" ", "")

In [None]:
word = '‡¶Ö‡¶ú‡¶™‡¶æ‡¶∞‡¶æ‡¶ó‡¶æ‡ßü‡ßá'

In [None]:
error_generator(word)

In [None]:
# All sampled sentences as a plain Python list (the assignment was split across
# two lines, which is a syntax error).
sentence = df_prothom_alo.sentence.tolist()

In [None]:
# Build the masked-word dataset: for every sentence pick one random word,
# replace it by '[MASK]' in a copy, and store a corrupted version of that word.
# Rows are collected in a list and turned into a DataFrame once at the end;
# growing a frame with DataFrame.append per row is quadratic and that method
# was removed in pandas 2.0.
records = []
for sen in tqdm(sentence):
    original_tokens = sen.split()
    if not original_tokens:
        # Nothing to mask in an empty sentence (randint(0, -1) would raise).
        continue
    masked_tokens = list(original_tokens)
    random_index = random.randint(0, len(original_tokens) - 1)
    masked_tokens[random_index] = '[MASK]'
    error_word = error_generator(original_tokens[random_index])
    records.append({
        'sentence': " ".join(original_tokens),
        'masked_sen': " ".join(masked_tokens),
        'error_word': error_word,
    })
df_custom_error_dataset = pd.DataFrame(records, columns=['sentence', 'masked_sen', 'error_word'])

In [None]:
df_custom_error_dataset.sample(5)

In [None]:
# Persist the generated (sentence, masked sentence, corrupted word) rows without the index.
df_custom_error_dataset.to_csv('custom_error_dataset.csv',index=False)