In [1]:
import pandas as pd 
import numpy  as np
from wordfreq import word_frequency ,top_n_list,get_frequency_dict

from symspellpy import SymSpell, Verbosity
from itertools import islice

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from random import random, choice
import random
import time

import os
import re
from tqdm import tqdm  
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True,nb_workers=4)


INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from bnunicodenormalizer import Normalizer 
pandarallel.initialize(progress_bar=True,nb_workers=8)
tqdm.pandas()
bnorm=Normalizer()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
from typing import Dict, List, Tuple, Any, Union


import json

from datasets import load_metric

from tqdm.auto import tqdm
from IPython.display import display, Audio, HTML
from Levenshtein import distance as lev


cer = load_metric("cer")
wer = load_metric("wer")
import warnings 
warnings.filterwarnings('ignore')

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

In [27]:
def remove_punctuations(my_str):
    punctuations = '''````¬£|¬¢|√ë+-*/=EROero‡ß≥‡ß¶‡ßß‡ß®‡ß©‡ß™‡ß´‡ß¨‡ß≠‡ßÆ‡ßØ012‚Äì34567‚Ä¢89‡•§!()-[]{};:'"‚Äú\‚Äô‚Ä¶,<>.‚Äö/?@#$%^&*_~‚Äò‚Äî‡••‚Äù‚Ä∞ü§£‚öΩÔ∏è‚úåÔøΩÔø∞‡ß∑Ôø∞'''
    no_punct = ""
    for char in my_str:
        if char not in punctuations:
            no_punct = no_punct + char
    return no_punct

def normalize(sen):
    _words = [bnorm(word)['normalized']  for word in sen.split()]
    return " ".join([word for word in _words if word is not None]) 

def wer(ref, hyp ,debug=False):
    r = ref.split()
    h = hyp.split()
    #costs will holds the costs, like in the Levenshtein distance algorithm
    costs = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]
    # backtrace will hold the operations we've done.
    # so we could later backtrace, like the WER algorithm requires us to.
    backtrace = [[0 for inner in range(len(h)+1)] for outer in range(len(r)+1)]

    OP_OK = 0
    OP_SUB = 1
    OP_INS = 2
    OP_DEL = 3

    DEL_PENALTY=1 # Tact
    INS_PENALTY=1 # Tact
    SUB_PENALTY=1 # Tact
    # First column represents the case where we achieve zero
    # hypothesis words by deleting all reference words.
    for i in range(1, len(r)+1):
        costs[i][0] = DEL_PENALTY*i
        backtrace[i][0] = OP_DEL

    # First row represents the case where we achieve the hypothesis
    # by inserting all hypothesis words into a zero-length reference.
    for j in range(1, len(h) + 1):
        costs[0][j] = INS_PENALTY * j
        backtrace[0][j] = OP_INS

    # computation
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                costs[i][j] = costs[i-1][j-1]
                backtrace[i][j] = OP_OK
            else:
                substitutionCost = costs[i-1][j-1] + SUB_PENALTY # penalty is always 1
                insertionCost    = costs[i][j-1] + INS_PENALTY   # penalty is always 1
                deletionCost     = costs[i-1][j] + DEL_PENALTY   # penalty is always 1

                costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
                if costs[i][j] == substitutionCost:
                    backtrace[i][j] = OP_SUB
                elif costs[i][j] == insertionCost:
                    backtrace[i][j] = OP_INS
                else:
                    backtrace[i][j] = OP_DEL

    # back trace though the best route:
    i = len(r)
    j = len(h)
    numSub = 0
    numDel = 0
    numIns = 0
    numCor = 0
    if debug:
        lines = []
        compares = []
    while i > 0 or j > 0:
        if backtrace[i][j] == OP_OK:
            numCor += 1
            i-=1
            j-=1
            if debug:
                lines.append("C\t" + r[i]+"\t"+h[j])
        elif backtrace[i][j] == OP_SUB:
            numSub +=1
            i-=1
            j-=1
            if debug:
                lines.append("S\t" + r[i]+"\t"+h[j])
        elif backtrace[i][j] == OP_INS:
            numIns += 1
            j-=1
            if debug:
                lines.append("I\t" + "****" + "\t" + h[j])
        elif backtrace[i][j] == OP_DEL:
            numDel += 1
            i-=1
            if debug:
                lines.append("D\t" + r[i]+"\t"+"****")
    return lines

def error_label(ref,hyp):
    li=[]
    lines= wer(ref, hyp,debug=True)
    for line in reversed(lines):
        line = re.sub(r"\t"," ",line)
        line = line.split()
        li.append(line[0])
    return li

def wer_calc(label):
    s=label.count('S')
    d=label.count('D')
    i=label.count('I')
    return int((s+d+i)/len(label)*100)

def masking(arpa):
    sen = arpa.split()
    for words in sen:
        suggestions = sym_spell.lookup(words, Verbosity.CLOSEST,max_edit_distance=0, include_unknown=True)
        for suggestion in suggestions:
            suggestion = str(suggestion)
            suggestion = suggestion.split(',')
            if int(suggestion[2]) == 0:
                mask = '[MASK]'
                sen = [mask if x == words else x for x in sen]
                break
    return " ".join(sen)

def lav_distance(error_word,masked_word):
    return int(lev(error_word, masked_word)/max(len(error_word),len(masked_word))*100)

In [5]:
sym_spell = SymSpell(max_dictionary_edit_distance=5, prefix_length=7)
dictionary_path = 'asr_bangla/data/prothom_alo_word_freq.txt'
sym_spell.load_dictionary(dictionary_path, 0, 1,separator=" ")

True

# Common_voice

In [6]:
df_cv_1 = pd.read_csv('asr_bangla/data/cv_100k_p1.csv')
df_cv_2 = pd.read_csv('asr_bangla/data/cv_100k_p2.csv')
df_cv_3 = pd.read_csv('asr_bangla/data/symspell_nov.csv')
df_cv = pd.read_csv('cv_train.csv')

In [7]:
df_cv_3 = df_cv_3[(df_cv_3.source == 'train') & (df_cv_3.audio == 'noisy')]

In [8]:
df_cv_3 = df_cv_3[['path','wav2vec2','4gram']]

In [9]:
df_cv = df_cv[['path','sentence']]

In [10]:
df_cv_1= df_cv_1.append([df_cv_2,df_cv_3])
df_cv_1 = df_cv_1.reset_index(drop=True)

In [11]:
df_cv_1['path'] = df_cv_1.path.apply(lambda x: os.path.basename(str(x)))

In [12]:
df_cv = df_cv_1.merge(df_cv,on='path',how='inner')

In [13]:
df_cv

Unnamed: 0,path,wav2vec2,4gram,sentence
0,common_voice_bn_30991371.mp3,‡¶¶‡ßá‡¶ì‡ßü‡¶æ‡¶® ‡¶´‡¶∞‡¶ø‡¶¶ ‡¶ó‡¶æ‡ßü‡¶ú‡ßÄ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶ì‡ßü‡¶æ‡¶Æ‡ßÄ ‡¶≤‡ßÄ‡¶ó‡ßá‡¶∞ ‡¶â‡¶™‡¶¶‡ßá‡¶∑‡ßç...,‡¶¶‡ßá‡¶ì‡ßü‡¶æ‡¶® ‡¶´‡¶∞‡¶ø‡¶¶ ‡¶ó‡¶æ‡¶ú‡ßÄ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶ì‡ßü‡¶æ‡¶Æ‡ßÄ ‡¶≤‡ßÄ‡¶ó‡ßá‡¶∞ ‡¶â‡¶™‡¶¶‡ßá‡¶∑‡ßç‡¶ü...,‡¶¶‡ßá‡¶ì‡¶Ø‡¶º‡¶æ‡¶® ‡¶´‡¶∞‡¶ø‡¶¶ ‡¶ó‡¶æ‡¶ú‡ßÄ ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂ ‡¶Ü‡¶ì‡¶Ø‡¶º‡¶æ‡¶Æ‡ßÄ ‡¶≤‡ßÄ‡¶ó‡ßá‡¶∞ ‡¶â‡¶™‡¶¶‡ßá‡¶∑...
1,common_voice_bn_30991410.mp3,‡¶è‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶ï‡¶æ‡¶∞‡ßã ‡¶ó‡¶æ‡¶® ‡¶ó‡¶æ‡¶ì‡ßü‡¶æ ‡¶¶‡¶ï‡ßç‡¶∑‡¶§‡¶æ‡¶¨‡¶æ ‡¶®‡¶ø‡¶ú‡¶∏‡ßç‡¶¨ ‡¶ß‡¶∞‡ßç‡¶Æ...,‡¶è‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶ï‡¶æ‡¶∞‡¶ì ‡¶ó‡¶æ‡¶® ‡¶ó‡¶æ‡¶ì‡ßü‡¶æ ‡¶¶‡¶ï‡ßç‡¶∑‡¶§‡¶æ ‡¶¨‡¶æ ‡¶®‡¶ø‡¶ú‡¶∏‡ßç‡¶¨ ‡¶ß‡¶∞‡ßç...,‡¶è‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶ï‡¶æ‡¶∞‡¶ì ‡¶ó‡¶æ‡¶® ‡¶ó‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ ‡¶¶‡¶ï‡ßç‡¶∑‡¶§‡¶æ ‡¶¨‡¶æ ‡¶®‡¶ø‡¶ú‡¶∏‡ßç‡¶¨ ‡¶ß‡¶∞...
2,common_voice_bn_30991513.mp3,‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡ßá ‡¶§‡¶•‡ßç‡¶Ø ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶Ø‡ßá ‡¶ï‡ßã‡¶®‡ßã ‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶´‡¶æ‡¶á‡¶≤‡ßá...,‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡ßá ‡¶§‡¶•‡ßç‡¶Ø ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶Ø‡ßá ‡¶ï‡ßã‡¶®‡ßã ‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶´‡¶æ‡¶á‡¶≤‡ßá...,‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡ßá ‡¶§‡¶•‡ßç‡¶Ø ‡¶Ö‡¶®‡ßç‡¶Ø ‡¶Ø‡ßá ‡¶ï‡ßã‡¶®‡¶ì ‡¶°‡¶ø‡¶ú‡¶ø‡¶ü‡¶æ‡¶≤ ‡¶´‡¶æ‡¶á‡¶≤‡ßá...
3,common_voice_bn_30991535.mp3,‡¶π‡¶ø‡¶∏‡¶æ‡¶¨ ‡¶™‡¶æ‡¶ì‡ßü‡¶æ‡¶∞ ‡¶™‡¶∞ ‡¶§‡¶ø‡¶®‡¶ø ‡¶∏‡ßç‡¶¨‡¶≤‡ßç‡¶™ ‡¶∏‡¶Æ‡ßü‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá‡¶á ‡¶∏‡¶Æ‡ßç‡¶™‡ßÇ...,‡¶π‡¶ø‡¶∏‡¶æ‡¶¨ ‡¶™‡¶æ‡¶ì‡ßü‡¶æ‡¶∞ ‡¶™‡¶∞ ‡¶§‡¶ø‡¶®‡¶ø ‡¶∏‡ßç‡¶¨‡¶≤‡ßç‡¶™ ‡¶∏‡¶Æ‡ßü‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá‡¶á ‡¶∏‡¶Æ‡ßç‡¶™‡ßÇ...,‡¶π‡¶ø‡¶∏‡¶æ‡¶¨ ‡¶™‡¶æ‡¶ì‡¶Ø‡¶º‡¶æ‡¶∞ ‡¶™‡¶∞ ‡¶§‡¶ø‡¶®‡¶ø ‡¶∏‡ßç‡¶¨‡¶≤‡ßç‡¶™ ‡¶∏‡¶Æ‡¶Ø‡¶º‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá‡¶á ‡¶∏‡¶Æ‡ßç...
4,common_voice_bn_30991592.mp3,‡¶®‡¶¶‡ßÄ‡¶∞ ‡¶™‡¶æ‡¶®‡¶ø‡¶§‡ßá ‡¶¨‡ßü‡ßá ‡¶Ü‡¶∏‡¶æ ‡¶™‡¶≤‡¶ø ‡¶¶‡¶≤‡¶¶‡ßá‡¶∂‡ßá ‡¶ú‡¶Æ‡ßá ‡¶Æ‡¶æ‡¶ù‡ßá ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ...,‡¶®‡¶¶‡ßÄ‡¶∞ ‡¶™‡¶æ‡¶®‡¶ø‡¶§‡ßá ‡¶¨‡ßü‡ßá ‡¶Ü‡¶∏‡¶æ ‡¶™‡¶≤‡¶ø ‡¶§‡¶≤‡¶¶‡ßá‡¶∂‡ßá ‡¶ú‡¶Æ‡ßá ‡¶Æ‡¶æ‡¶ù‡ßá ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ...,‡¶®‡¶¶‡ßÄ‡¶∞ ‡¶™‡¶æ‡¶®‡¶ø‡¶§‡ßá ‡¶¨‡¶Ø‡¶º‡ßá ‡¶Ü‡¶∏‡¶æ ‡¶™‡¶≤‡¶ø ‡¶§‡¶≤‡¶¶‡ßá‡¶∂‡ßá ‡¶ú‡¶Æ‡ßá ‡¶Æ‡¶æ‡¶ù‡ßá‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ...
...,...,...,...,...
100931,common_voice_bn_31459116.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡¶°‡¶º‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡¶°‡¶º‡¶æ‡¶Ø‡¶º ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®‡•§
100932,common_voice_bn_31459130.mp3,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡¶Ø‡¶º...
100933,common_voice_bn_31459161.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...
100934,common_voice_bn_31637719.mp3,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã‡•§


# Open_SLR

In [14]:
df_slr_1 = pd.read_csv('asr_bangla/data/slr_1_60.csv')
df_slr_2 = pd.read_csv('asr_bangla/data/slr_150_200.csv')
df_slr_3 = pd.read_csv('asr_bangla/data/slr_60_70.csv')
df_slr_4 = pd.read_csv('asr_bangla/data/slr_70_80.csv')
df_slr_5 = pd.read_csv('asr_bangla/data/slr_80_90.csv')
df_slr_6 = pd.read_csv('asr_bangla/data/slr_90_100.csv')

In [15]:
df_slr_1 = df_slr_1.append([df_slr_2,df_slr_3,df_slr_4,df_slr_5,df_slr_6])
df_slr_1 = df_slr_1.reset_index(drop=True)

In [16]:
df_slr_1['path'] = df_slr_1.path.apply(lambda x: os.path.basename(str(x)))

In [17]:
df_slr_1

Unnamed: 0,path,wav2vec2,4gram
0,000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá
1,000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá
2,00005debc7.flac,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•
3,00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
4,00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá
...,...,...,...
149995,74c6f3b531.flac,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡¶∞‡ßã ‡¶ï‡¶æ‡¶ú ‡¶π‡ßü‡ßá‡¶õ‡ßá,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡ßú ‡¶ï‡¶æ‡¶ú ‡¶π‡ßü‡ßá‡¶õ‡ßá
149996,74c702e2d5.flac,‡¶ú‡ßÅ‡¶®‡ßç ‡¶ñ‡¶æ‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤,‡¶ú‡ßÅ‡¶® ‡¶ñ‡ßá‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤
149997,74c706e710.flac,‡¶è‡¶∞‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂‡ßá,‡¶è‡¶∞‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá
149998,74c70ce063.flac,‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡¶®‡ßü‡¶∂‡¶≤ ‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶ú‡¶®‡ßá‡¶∞,‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡¶®‡ßü ‡¶∂ ‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶ú‡¶®‡ßá‡¶∞


In [18]:
df_slr = pd.read_csv('slr_train.csv')

In [19]:
df_slr['path'] = df_slr.path.apply(lambda x :os.path.basename(str(x)))

In [20]:
df_slr = df_slr_1.merge(df_slr,on='path',how='inner')

In [21]:
df_slr['path'] = 'slr_' + df_slr.path

In [22]:
df_slr

Unnamed: 0,path,wav2vec2,4gram,sentence
0,slr_000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá ‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá
1,slr_000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è ‡¶ß‡¶∞‡¶£‡ßá‡¶∞ ‡¶ï‡¶æ‡¶∞‡ßç‡¶° ‡¶®‡¶ø‡¶Ø‡¶º‡ßá
2,slr_00005debc7.flac,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•
3,slr_00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡¶Ø‡¶º ‡¶π‡¶Ø‡¶º‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
4,slr_00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá
...,...,...,...,...
149995,slr_74c6f3b531.flac,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡¶∞‡ßã ‡¶ï‡¶æ‡¶ú ‡¶π‡ßü‡ßá‡¶õ‡ßá,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡ßú ‡¶ï‡¶æ‡¶ú ‡¶π‡ßü‡ßá‡¶õ‡ßá,‡¶è‡¶ï‡¶ü‡¶æ ‡¶¨‡¶°‡¶º ‡¶ï‡¶æ‡¶ú ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá
149996,slr_74c702e2d5.flac,‡¶ú‡ßÅ‡¶®‡ßç ‡¶ñ‡¶æ‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤,‡¶ú‡ßÅ‡¶® ‡¶ñ‡ßá‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤,‡¶ù‡ßÅ‡¶®‡ßÅ‡¶ñ‡¶æ‡¶≤‡¶æ‡¶∞ ‡¶∏‡ßã‡¶®‡¶æ‡¶∞ ‡¶¶‡ßÅ‡¶≤
149997,slr_74c706e710.flac,‡¶è‡¶∞‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ ‡¶¶‡ßá‡¶∂‡ßá,‡¶è‡¶∞‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá,‡¶è‡¶∞ ‡¶™‡¶∞‡¶ì ‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá
149998,slr_74c70ce063.flac,‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡¶®‡ßü‡¶∂‡¶≤ ‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶ú‡¶®‡ßá‡¶∞,‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡¶®‡ßü ‡¶∂ ‡¶õ‡¶æ‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶ú‡¶®‡ßá‡¶∞,‡ß®‡ß¨ ‡¶π‡¶æ‡¶ú‡¶æ‡¶∞ ‡ßØ‡¶∂ ‡ß®‡ß¨ ‡¶ú‡¶®‡ßá‡¶∞


# Data_Merged

In [23]:
df_actual = df_slr.append(df_cv)

In [24]:
df_actual = df_actual.dropna()

In [25]:
df_actual = df_actual.reset_index(drop=True)

In [26]:
df_actual.rename(columns = {'4gram':'arpa_4gram'}, inplace = True)

In [27]:
df_actual

Unnamed: 0,path,wav2vec2,arpa_4gram,sentence
0,slr_000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá ‡¶¶‡¶æ‡¶Ø‡¶º‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá
1,slr_000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è ‡¶ß‡¶∞‡¶£‡ßá‡¶∞ ‡¶ï‡¶æ‡¶∞‡ßç‡¶° ‡¶®‡¶ø‡¶Ø‡¶º‡ßá
2,slr_00005debc7.flac,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•
3,slr_00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡¶Ø‡¶º ‡¶π‡¶Ø‡¶º‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
4,slr_00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá
...,...,...,...,...
248612,common_voice_bn_31459116.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡¶°‡¶º‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡¶°‡¶º‡¶æ‡¶Ø‡¶º ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®‡•§
248613,common_voice_bn_31459130.mp3,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡¶Ø‡¶º...
248614,common_voice_bn_31459161.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...
248615,common_voice_bn_31637719.mp3,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã‡•§


In [28]:
df_actual['sentence'] = df_actual['sentence'].parallel_apply(lambda x : remove_punctuations(x))
df_actual["sentence"]=df_actual["sentence"].parallel_apply(lambda x:normalize(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [29]:
#df_actual.to_csv('df_data_actual.csv',index=False)

In [30]:
df_actual['wav2vec2_label'] = df_actual.parallel_apply(lambda x : error_label(x.sentence,x.wav2vec2),axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [31]:
df_actual['arpa_label'] = df_actual.parallel_apply(lambda x : error_label(x.sentence,x.arpa_4gram),axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [32]:
df_actual['wer_wav2vec2'] = df_actual.wav2vec2_label.parallel_apply(lambda x: wer_calc(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [33]:
df_actual['wer_arpa'] = df_actual.arpa_label.parallel_apply(lambda x: wer_calc(x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=31078), Label(value='0 / 31078')))‚Ä¶

In [34]:
df_actual

Unnamed: 0,path,wav2vec2,arpa_4gram,sentence,wav2vec2_label,arpa_label,wer_wav2vec2,wer_arpa
0,slr_000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,"[S, C, C]","[S, C, C]",33,33
1,slr_000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è ‡¶ß‡¶∞‡¶£‡ßá‡¶∞ ‡¶ï‡¶æ‡¶∞‡ßç‡¶° ‡¶®‡¶ø‡ßü‡ßá,"[S, S, S, C]","[S, S, S, C]",75,75
2,slr_00005debc7.flac,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,‡¶π‡¶§‡ßá ‡¶â‡¶™‡¶æ‡¶∞‡ßç‡¶ú‡¶ø‡¶§ ‡¶Ö‡¶∞‡ßç‡¶•,"[C, C, C]","[C, C, C]",0,0
3,slr_00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,"[S, S, C, C]","[C, C, C, C]",50,0
4,slr_00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá,"[I, S, S]","[I, S, S]",100,100
...,...,...,...,...,...,...,...,...
248612,common_voice_bn_31459116.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,"[C, C, S, C, C]","[C, C, I, I, S, C, C]",20,42
248613,common_voice_bn_31459130.mp3,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü ‡¶Ü‡¶õ‡ßá,"[C, S, C, S, C, C, C, I, S, S]","[C, I, S, C, S, C, C, C, C, S]",50,40
248614,common_voice_bn_31459161.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...,"[C, C, S, S, C, C, C, S, C, C, S]","[C, C, S, C, C, C, C, S, C, C, S]",36,27
248615,common_voice_bn_31637719.mp3,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã,"[S, C, C, S, S]","[C, C, C, C, S]",60,20


In [35]:
df_actual = df_actual[df_actual.wer_wav2vec2 !=0 ]

In [36]:
df_actual.to_csv('asr_bangla/data/asr_cv_slr.csv',index=False)

# Real data masking

In [37]:
df_actual = pd.read_csv('asr_bangla/data/asr_cv_slr.csv')

In [7]:
df_actual = pd.read_csv('asr_bangla/data/asr_cv_slr.csv')

In [8]:
df_actual['masked_arpa'] = df_actual['arpa_4gram'].parallel_apply(lambda x: masking(str(x)))
df_actual['masked_wav2vec2'] = df_actual['wav2vec2'].parallel_apply(lambda x: masking(str(x)))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24730), Label(value='0 / 24730')))‚Ä¶

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24730), Label(value='0 / 24730')))‚Ä¶

In [9]:
df_actual

Unnamed: 0,path,wav2vec2,arpa_4gram,sentence,wav2vec2_label,arpa_label,wer_wav2vec2,wer_arpa,masked_arpa,masked_wav2vec2
0,slr_000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,"['S', 'C', 'C']","['S', 'C', 'C']",33,33,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá
1,slr_000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è ‡¶ß‡¶∞‡¶£‡ßá‡¶∞ ‡¶ï‡¶æ‡¶∞‡ßç‡¶° ‡¶®‡¶ø‡ßü‡ßá,"['S', 'S', 'S', 'C']","['S', 'S', 'S', 'C']",75,75,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá
2,slr_00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,"['S', 'S', 'C', 'C']","['C', 'C', 'C', 'C']",50,0,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø [MASK] ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
3,slr_00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá,"['I', 'S', 'S']","['I', 'S', 'S']",100,100,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ [MASK]
4,slr_00012880b1.flac,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,‡¶∂‡¶π‡¶∞‡ßá ‡¶Ø‡¶æ‡¶∞‡¶æ,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü,"['D', 'S', 'S']","['D', 'S', 'S']",100,100,‡¶∂‡¶π‡¶∞‡ßá ‡¶Ø‡¶æ‡¶∞‡¶æ,[MASK] [MASK]
...,...,...,...,...,...,...,...,...,...,...
197835,common_voice_bn_31459116.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,"['C', 'C', 'S', 'C', 'C']","['C', 'C', 'I', 'I', 'S', 'C', 'C']",20,42,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ [MASK] ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®
197836,common_voice_bn_31459130.mp3,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü ‡¶Ü‡¶õ‡ßá,"['C', 'S', 'C', 'S', 'C', 'C', 'C', 'I', 'S', ...","['C', 'I', 'S', 'C', 'S', 'C', 'C', 'C', 'C', ...",50,40,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá [MASK] ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®‡ßÄ‡ßü ...
197837,common_voice_bn_31459161.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...,"['C', 'C', 'S', 'S', 'C', 'C', 'C', 'S', 'C', ...","['C', 'C', 'S', 'C', 'C', 'C', 'C', 'S', 'C', ...",36,27,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ [MASK] ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...
197838,common_voice_bn_31637719.mp3,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã,"['S', 'C', 'C', 'S', 'S']","['C', 'C', 'C', 'C', 'S']",60,20,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï [MASK] ‡¶õ‡¶ø‡¶≤


In [10]:
mask = '[MASK]'

In [11]:
df_actual['arpa_masked_n'] = df_actual['masked_arpa'].parallel_apply(lambda x: str(x).split().count(mask))
df_actual['wav2vec2_masked_n'] = df_actual['masked_wav2vec2'].parallel_apply(lambda x: str(x).split().count(mask))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24730), Label(value='0 / 24730')))‚Ä¶

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=24730), Label(value='0 / 24730')))‚Ä¶

In [12]:
df_actual['wav2vec2_masked_n'].sum()

299528

In [13]:
df_actual['arpa_masked_n'].sum()

28833

In [14]:
df_actual = df_actual[(df_actual.wav2vec2_masked_n !=0) & (df_actual.wav2vec2_masked_n <=6)]

In [15]:
df_actual = df_actual[['wav2vec2','masked_wav2vec2','wav2vec2_masked_n','sentence']]

In [16]:
df_actual

Unnamed: 0,wav2vec2,masked_wav2vec2,wav2vec2_masked_n,sentence
2,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø [MASK] ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,1,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
3,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ [MASK],1,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá
4,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,[MASK] [MASK],2,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü
5,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ‡¶Æ,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá [MASK],1,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§‡¶ì ‡¶ï‡¶∞‡ßá‡¶õ‡ßá ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ
6,‡¶Ø‡¶æ‡¶§‡ßá ‡¶¨‡¶ø‡¶≠‡¶æ‡¶π ‡¶∏‡¶æ‡¶¶‡ßÄ ‡¶¶‡¶ø‡¶≤‡ßá,‡¶Ø‡¶æ‡¶§‡ßá [MASK] ‡¶∏‡¶æ‡¶¶‡ßÄ ‡¶¶‡¶ø‡¶≤‡ßá,1,‡¶Ø‡¶æ‡¶§‡ßá ‡¶¨‡¶ø‡¶¨‡¶æ‡¶π ‡¶∂‡¶æ‡¶¶‡ßÄ ‡¶¶‡¶ø‡¶≤‡ßá
...,...,...,...,...
197835,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ [MASK] ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,1,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®
197836,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá [MASK] ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®‡ßÄ‡ßü ...,1,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü ‡¶Ü‡¶õ‡ßá
197837,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ [MASK] ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,1,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...
197838,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï [MASK] ‡¶õ‡¶ø‡¶≤,1,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã


In [17]:
df_actual['reference_n'] = df_actual.sentence.parallel_apply(lambda x:len(str(x).split()))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=18804), Label(value='0 / 18804')))‚Ä¶

In [18]:
df_actual[df_actual.reference_n == 1]

Unnamed: 0,wav2vec2,masked_wav2vec2,wav2vec2_masked_n,sentence,reference_n
65,‡¶∏‡ßá‡¶™‡ßç‡¶ü‡ßá‡¶Æ‡ßç‡¶¨‡¶∞ ‡¶§‡ßá‡¶∞‡ßã ‡¶â‡¶®‡¶ø‡¶∑ ‡¶ö‡¶æ‡¶∞,‡¶∏‡ßá‡¶™‡ßç‡¶ü‡ßá‡¶Æ‡ßç‡¶¨‡¶∞ ‡¶§‡ßá‡¶∞‡ßã [MASK] ‡¶ö‡¶æ‡¶∞,1,‡¶∏‡ßá‡¶™‡ßç‡¶ü‡ßá‡¶Æ‡ßç‡¶¨‡¶∞,1
104,‡¶Ü‡¶ß‡¶æ‡¶∞‡¶∏ ‡¶™‡¶ö‡¶æ‡¶§‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶Ö‡¶®‡ßÅ‡¶∏‡¶¶ ‡¶ö‡¶æ‡¶∞,[MASK] [MASK] ‡¶•‡ßá‡¶ï‡ßá [MASK] ‡¶ö‡¶æ‡¶∞,3,‡¶•‡ßá‡¶ï‡ßá,1
146,‡¶ó‡ßÄ‡¶∂‡ßç‡¶¨ ‡¶ï‡¶æ‡¶≤,[MASK] ‡¶ï‡¶æ‡¶≤,1,‡¶ó‡ßç‡¶∞‡ßÄ‡¶∑‡ßç‡¶Æ‡¶ï‡¶æ‡¶≤,1
182,‡¶´‡ßá‡¶æ‡¶∞‡¶∂,[MASK],1,‡¶´‡ßá‡¶°‡¶æ‡¶∞‡ßá‡¶∂‡¶®,1
236,‡¶ñ‡ßá‡¶≤‡¶æ ‡¶´‡¶§,‡¶ñ‡ßá‡¶≤‡¶æ [MASK],1,‡¶ñ‡ßá‡¶≤‡¶æ‡¶´‡¶§,1
...,...,...,...,...,...
103525,‡¶Æ‡ßå‡¶≤‡¶¨‡¶æ‡¶¶‡ßÄ ‡¶ú‡¶ô‡ßç‡¶ó‡ßÄ‡¶¨‡¶æ‡¶¶‡¶ø‡¶∞‡¶æ,‡¶Æ‡ßå‡¶≤‡¶¨‡¶æ‡¶¶‡ßÄ [MASK],1,‡¶Æ‡ßå‡¶≤‡¶¨‡¶æ‡¶¶‡ßÄ‡¶ú‡¶ô‡ßç‡¶ó‡ßÄ‡¶¨‡¶æ‡¶¶‡ßÄ‡¶∞‡¶æ,1
103529,‡¶Ü‡¶ü‡¶æ‡¶∏ ‡¶è‡¶ï‡¶æ‡¶®‡ßã ‡¶•‡ßá‡¶ï‡ßá ‡¶â‡¶®‡¶ø‡¶∂‡ßç‡¶ö ‡¶¨‡¶æ‡¶á‡¶∏,[MASK] [MASK] ‡¶•‡ßá‡¶ï‡ßá [MASK] [MASK],4,‡¶•‡ßá‡¶ï‡ßá,1
103558,‡¶™‡¶∞‡¶ø‡¶¨‡¶æ ‡¶ï‡ßá ‡ßá,[MASK] ‡¶ï‡ßá [MASK],2,‡¶™‡¶∞‡¶ø‡¶¨‡¶æ‡¶∞‡¶ï‡ßá,1
166372,‡¶ó‡ßã‡¶≤ ‡¶ü‡ßá‡¶≠‡¶ø‡¶≤,‡¶ó‡ßã‡¶≤ [MASK],1,‡¶ó‡ßã‡¶≤‡¶ü‡ßá‡¶¨‡¶ø‡¶≤,1


In [19]:
df_actual = df_actual[df_actual.reference_n >=2]

In [20]:
wav2vec2_multi_dict = df_actual.set_index('wav2vec2').T.to_dict()
keys = list(wav2vec2_multi_dict.keys())

In [21]:
wav2vec2_multi_dict[keys[0]]

{'masked_wav2vec2': '‡¶π‡¶æ‡¶∏‡¶ø [MASK] ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá',
 'wav2vec2_masked_n': 1,
 'sentence': '‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá',
 'reference_n': 4}

In [36]:
def wav2vec2_bert_data(key,masked,reference):
    global df_wav2vec2_temp
    masked_indices = []
    masked = masked.split()
    reference = reference.split()
    for i in range(len(masked)):
        if masked[i] == mask:
            masked_indices.append(i)
    for i in masked_indices:
        sen = key.split()
        error_word = sen[i]
        if i in range(0,len(reference)):
            correct_word = reference[i]
        else:
            correct_word = " "
        sen[i] = mask 
        sen = " ".join(sen)
        temp_dict = {'wav2vec2':key,'masked_wav2vec2':sen,'error_word': error_word,'correct_word':correct_word,'sentence': " ".join(reference) }        
        df_wav2vec2_temp = df_wav2vec2_temp.append(temp_dict,ignore_index=True)

In [40]:
#df_actual.parallel_apply(lambda x: wav2vec2_bert_data(x.wav2vec2,x.masked_wav2vec2,x.sentence), axis=1)

In [44]:
df_wav2vec2_temp_1 = pd.DataFrame()
df_wav2vec2_temp_2 = pd.DataFrame()

In [45]:
for key in tqdm(keys[:75000]):
    masked_indices = []
    masked = wav2vec2_multi_dict[key]['masked_wav2vec2']
    masked = masked.split()
    reference = wav2vec2_multi_dict[key]['sentence']
    reference = reference.split()
    for i in range(len(masked)):
        if masked[i] == mask:
            masked_indices.append(i)
    for i in masked_indices:
        sen = key.split()
        error_word = sen[i]
        if i in range(0,len(reference)):
            correct_word = reference[i]
        else:
            correct_word = " "
        sen[i] = mask 
        sen = " ".join(sen)
        temp_dict = {'wav2vec2':key,'masked_wav2vec2':sen,'error_word': error_word,'correct_word':correct_word,'sentence': " ".join(reference) }
        df_wav2vec2_temp_1 = df_wav2vec2_temp_1.append(temp_dict,ignore_index=True)

  0%|          | 0/75000 [00:00<?, ?it/s]

In [46]:
for key in tqdm(keys[75000:]):
    masked_indices = []
    masked = wav2vec2_multi_dict[key]['masked_wav2vec2']
    masked = masked.split()
    reference = wav2vec2_multi_dict[key]['sentence']
    reference = reference.split()
    for i in range(len(masked)):
        if masked[i] == mask:
            masked_indices.append(i)
    for i in masked_indices:
        sen = key.split()
        error_word = sen[i]
        if i in range(0,len(reference)):
            correct_word = reference[i]
        else:
            correct_word = " "
        sen[i] = mask 
        sen = " ".join(sen)
        temp_dict = {'wav2vec2':key,'masked_wav2vec2':sen,'error_word': error_word,'correct_word':correct_word,'sentence': " ".join(reference) }
        df_wav2vec2_temp_2 = df_wav2vec2_temp_2.append(temp_dict,ignore_index=True)

  0%|          | 0/71603 [00:00<?, ?it/s]

In [48]:
df_wav2vec2_temp = df_wav2vec2_temp_1.append(df_wav2vec2_temp_2,ignore_index=True)

In [49]:
df_wav2vec2_temp

Unnamed: 0,wav2vec2,masked_wav2vec2,error_word,correct_word,sentence
0,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø [MASK] ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶¨‡¶ø‡¶∑‡¶≤,‡¶¨‡¶ø‡¶∑‡ßü,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
1,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ [MASK],‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá
2,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,[MASK] ‡¶ú‡¶æ‡¶¶‡ßá,‡¶∂‡¶π‡¶ú‡¶ø,‡¶∏‡¶π‡¶ú‡ßá‡¶á,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü
3,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,‡¶∂‡¶π‡¶ú‡¶ø [MASK],‡¶ú‡¶æ‡¶¶‡ßá,‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü
4,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ‡¶Æ,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá [MASK],‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ‡¶Æ,‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§‡¶ì ‡¶ï‡¶∞‡ßá‡¶õ‡ßá ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ
...,...,...,...,...,...
280686,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ [MASK] ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü,‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®
280687,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá [MASK] ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®‡ßÄ‡ßü ...,‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú,‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü ‡¶Ü‡¶õ‡ßá
280688,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ [MASK] ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø,‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...
280689,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï [MASK] ‡¶õ‡¶ø‡¶≤,‡¶Æ‡ßÅ‡¶ö‡ßÅ,‡¶â‡¶Å‡¶ö‡ßÅ,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã


In [51]:
df_wav2vec2_temp.to_csv('data/masked.csv',index=False)

In [4]:
df_masked = pd.read_csv('data/masked.csv')

In [5]:
df_raw = pd.read_csv('data/asr_raw.csv')

In [6]:
df_masked

Unnamed: 0,wav2vec2,masked_wav2vec2,error_word,correct_word,sentence
0,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø [MASK] ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶¨‡¶ø‡¶∑‡¶≤,‡¶¨‡¶ø‡¶∑‡ßü,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
1,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ [MASK],‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá
2,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,[MASK] ‡¶ú‡¶æ‡¶¶‡ßá,‡¶∂‡¶π‡¶ú‡¶ø,‡¶∏‡¶π‡¶ú‡ßá‡¶á,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü
3,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,‡¶∂‡¶π‡¶ú‡¶ø [MASK],‡¶ú‡¶æ‡¶¶‡ßá,‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü
4,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ‡¶Æ,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá [MASK],‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ‡¶Æ,‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§‡¶ì ‡¶ï‡¶∞‡ßá‡¶õ‡ßá ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ
...,...,...,...,...,...
280686,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ [MASK] ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü,‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®
280687,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá [MASK] ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®‡ßÄ‡ßü ...,‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú,‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü ‡¶Ü‡¶õ‡ßá
280688,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ [MASK] ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø,‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...
280689,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï [MASK] ‡¶õ‡¶ø‡¶≤,‡¶Æ‡ßÅ‡¶ö‡ßÅ,‡¶â‡¶Å‡¶ö‡ßÅ,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã


In [7]:
df_raw

Unnamed: 0,path,wav2vec2,arpa_4gram,sentence,wav2vec2_label,arpa_label,wer_wav2vec2,wer_arpa
0,slr_000020a912.flac,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá‡¶∞ ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,‡¶¨‡¶æ‡¶Ç‡¶≤‡¶æ‡¶¶‡ßá‡¶∂‡ßá ‡¶¶‡¶æ‡ßü‡¶ø‡¶§‡ßç‡¶¨ ‡¶®‡ßá‡¶¨‡ßá,"['S', 'C', 'C']","['S', 'C', 'C']",33,33
1,slr_000039928e.flac,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è‡¶á ‡¶ß‡¶∞‡¶®‡ßá‡¶∞ ‡¶ï‡¶æ‡¶ú ‡¶®‡¶ø‡ßü‡ßá,‡¶è ‡¶ß‡¶∞‡¶£‡ßá‡¶∞ ‡¶ï‡¶æ‡¶∞‡ßç‡¶° ‡¶®‡¶ø‡ßü‡ßá,"['S', 'S', 'S', 'C']","['S', 'S', 'S', 'C']",75,75
2,slr_00009e687c.flac,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,"['S', 'S', 'C', 'C']","['C', 'C', 'C', 'C']",50,0
3,slr_00012843bc.flac,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡ßú‡¶ø‡¶§‡ßá,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá,"['I', 'S', 'S']","['I', 'S', 'S']",100,100
4,slr_00012880b1.flac,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,‡¶∂‡¶π‡¶∞‡ßá ‡¶Ø‡¶æ‡¶∞‡¶æ,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü,"['D', 'S', 'S']","['D', 'S', 'S']",100,100
...,...,...,...,...,...,...,...,...
197835,common_voice_bn_31459116.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,"['C', 'C', 'S', 'C', 'C']","['C', 'C', 'I', 'I', 'S', 'C', 'C']",20,42
197836,common_voice_bn_31459130.mp3,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü ‡¶Ü‡¶õ‡ßá,"['C', 'S', 'C', 'S', 'C', 'C', 'C', 'I', 'S', ...","['C', 'I', 'S', 'C', 'S', 'C', 'C', 'C', 'C', ...",50,40
197837,common_voice_bn_31459161.mp3,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...,"['C', 'C', 'S', 'S', 'C', 'C', 'C', 'S', 'C', ...","['C', 'C', 'S', 'C', 'C', 'C', 'C', 'S', 'C', ...",36,27
197838,common_voice_bn_31637719.mp3,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã,"['S', 'C', 'C', 'S', 'S']","['C', 'C', 'C', 'C', 'S']",60,20


In [8]:
df_prev = pd.read_csv('symspell_nov.csv')

In [10]:
df_prev = df_prev[df_prev.audio == 'noisy']

In [14]:
df_prev

Unnamed: 0,path,id,source,audio,sentence,duration,snr,wav2vec2,2gram,3gram,...,wav2vec2_label,wav2vec2_wer,3gram_label,3gram_wer,4gram_label,4gram_wer,wav2vec2_sym,2gram_sym,3gram_sym,4gram_sym
0,/kaggle/input/denoiser-for-benchmark/raw/train...,30991599,train,noisy,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,5.544,-44.277168,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡¶∞‡ßç‡¶Æ‡ßÄ‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,...,"['C', 'C', 'C', 'C', 'D', 'S', 'C', 'C', 'C', ...",15,"['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', ...",0,"['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', ...",0,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡¶∞‡ßç‡¶Æ‡ßÄ‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...
1,/kaggle/input/denoiser-for-benchmark/raw/train...,30991687,train,noisy,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,4.032,-46.880779,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,...,"['C', 'C', 'C', 'C', 'C', 'C', 'C']",0,"['C', 'C', 'C', 'C', 'C', 'C', 'C']",0,"['C', 'C', 'C', 'C', 'C', 'C', 'C']",0,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï
2,/kaggle/input/denoiser-for-benchmark/raw/train...,30991851,train,noisy,‡¶ó‡¶∞‡ßÄ‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡¶ï‡ßá ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá‡¶á ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,4.320,-56.581566,‡¶ó‡ßú‡¶ø ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡ßá‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡¶ø ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç‡¶õ‡¶§...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,...,"['S', 'S', 'S', 'C', 'S', 'C', 'C', 'C', 'C', ...",36,"['I', 'S', 'S', 'C', 'C', 'S', 'C', 'C', 'C', ...",33,"['I', 'S', 'S', 'C', 'C', 'S', 'C', 'C', 'C', ...",33,‡¶ó‡ßú‡¶ø ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡ßá‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡¶ø ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç‡¶õ‡¶§...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...
3,/kaggle/input/denoiser-for-benchmark/raw/train...,30992090,train,noisy,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶è ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,4.284,-29.025093,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶è ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,...,"['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C']",0,"['C', 'C', 'C', 'C', 'D', 'C', 'C', 'C']",12,"['C', 'C', 'C', 'C', 'D', 'C', 'C', 'C']",12,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶è ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø
4,/kaggle/input/denoiser-for-benchmark/raw/train...,30992661,train,noisy,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡ßÄ‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂‡ßç...,5.760,-32.322632,‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡ßÄ ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡ßÄ‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂‡ßç...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...,...,"['S', 'C', 'C', 'C', 'C', 'I', 'S', 'C', 'C']",33,"['C', 'C', 'C', 'I', 'S', 'C', 'I', 'S', 'C', ...",40,"['C', 'C', 'C', 'I', 'S', 'C', 'I', 'S', 'C', ...",40,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡ßÄ‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂‡ßç...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38382,/kaggle/input/denoiser-for-benchmark/raw/slr53...,056711c865,openslr,noisy,‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶∏‡¶Æ‡¶æ‡¶¨‡ßá‡¶∂,3.100,-37.201962,‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶∏‡¶Æ‡¶æ‡¶¨‡ßá‡¶∂,‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶∏‡¶Æ‡¶æ‡¶¨‡ßá‡¶∂,‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶∏‡¶Æ‡¶æ‡¶¨‡ßá‡¶∂,...,"['C', 'C']",0,"['C', 'C']",0,"['C', 'C']",0,‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶∏‡¶Æ‡¶æ‡¶¨‡ßá‡¶∂,‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶∏‡¶Æ‡¶æ‡¶¨‡ßá‡¶∂,‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶∏‡¶Æ‡¶æ‡¶¨‡ßá‡¶∂,‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶∏‡¶Æ‡¶æ‡¶¨‡ßá‡¶∂
38383,/kaggle/input/denoiser-for-benchmark/raw/slr53...,a3230c3684,openslr,noisy,‡¶è‡¶Æ‡¶® ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø,3.300,-36.750912,‡¶è‡¶Æ‡¶® ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø,‡¶è‡¶Æ‡¶® ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø,‡¶è‡¶Æ‡¶® ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø,...,"['C', 'C', 'C']",0,"['C', 'C', 'C']",0,"['C', 'C', 'C']",0,‡¶è‡¶Æ‡¶® ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø,‡¶è‡¶Æ‡¶® ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø,‡¶è‡¶Æ‡¶® ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø,‡¶è‡¶Æ‡¶® ‡¶∏‡¶Ç‡¶∏‡¶¶ ‡¶∏‡¶¶‡¶∏‡ßç‡¶Ø
38384,/kaggle/input/denoiser-for-benchmark/raw/slr53...,7f87c122cb,openslr,noisy,‡¶¨‡ßç‡¶≤‡ßá‡¶° ‡¶ó‡ßá‡¶≤,2.700,-26.636936,‡¶≤‡ßç‡¶¨‡ßá‡¶° ‡¶ï‡ßá‡¶≤,‡¶¨‡ßç‡¶≤‡ßá‡¶° ‡¶ó‡ßá‡¶≤,‡¶¨‡ßç‡¶≤‡ßá‡¶° ‡¶ó‡ßá‡¶≤,...,"['S', 'S']",100,"['C', 'C']",0,"['C', 'C']",0,‡¶ó‡ßç‡¶∞‡ßá‡¶° ‡¶ï‡ßá‡¶≤,‡¶¨‡ßç‡¶≤‡ßá‡¶° ‡¶ó‡ßá‡¶≤,‡¶¨‡ßç‡¶≤‡ßá‡¶° ‡¶ó‡ßá‡¶≤,‡¶¨‡ßç‡¶≤‡ßá‡¶° ‡¶ó‡ßá‡¶≤
38385,/kaggle/input/denoiser-for-benchmark/raw/slr53...,a6463ecbb0,openslr,noisy,‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá‡¶Æ‡ßá‡ßü‡ßá‡¶¶‡ßá‡¶∞,4.500,-18.375225,‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá ‡¶Æ‡ßá‡ßü‡ßá‡¶¶‡ßá‡¶∞,‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá ‡¶Æ‡ßá‡ßü‡ßá‡¶¶‡ßá‡¶∞,‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá‡¶Æ‡ßá‡ßü‡ßá‡¶¶‡ßá‡¶∞,...,"['C', 'I', 'S']",66,"['C', 'C']",0,"['C', 'C']",0,‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá ‡¶Æ‡ßá‡ßü‡ßá‡¶¶‡ßá‡¶∞,‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá ‡¶Æ‡ßá‡ßü‡ßá‡¶¶‡ßá‡¶∞,‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá‡¶Æ‡ßá‡ßü‡ßá‡¶¶‡ßá‡¶∞,‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶õ‡ßá‡¶≤‡ßá‡¶Æ‡ßá‡ßü‡ßá‡¶¶‡ßá‡¶∞


In [12]:
df_prev[df_prev.source == 'train'] 

Unnamed: 0,path,id,source,audio,sentence,duration,snr,wav2vec2,2gram,3gram,...,wav2vec2_label,wav2vec2_wer,3gram_label,3gram_wer,4gram_label,4gram_wer,wav2vec2_sym,2gram_sym,3gram_sym,4gram_sym
0,/kaggle/input/denoiser-for-benchmark/raw/train...,30991599,train,noisy,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,5.544,-44.277168,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡¶∞‡ßç‡¶Æ‡ßÄ‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,...,"['C', 'C', 'C', 'C', 'D', 'S', 'C', 'C', 'C', ...",15,"['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', ...",0,"['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', ...",0,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡¶∞‡ßç‡¶Æ‡ßÄ‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...,‡¶õ‡¶æ‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ö‡¶ø‡¶®‡¶ø ‡¶¨‡¶æ ‡¶ó‡ßÅ‡ßú ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶õ‡¶æ‡¶Å‡¶ö‡ßá ‡¶´‡ßá‡¶≤‡ßá ‡¶∏‡¶®‡ßç‡¶¶‡ßá...
1,/kaggle/input/denoiser-for-benchmark/raw/train...,30991687,train,noisy,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,4.032,-46.880779,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,...,"['C', 'C', 'C', 'C', 'C', 'C', 'C']",0,"['C', 'C', 'C', 'C', 'C', 'C', 'C']",0,"['C', 'C', 'C', 'C', 'C', 'C', 'C']",0,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï,‡¶§‡¶ø‡¶®‡¶ø ‡¶ü‡ßç‡¶∞‡ßá‡¶° ‡¶á‡¶â‡¶®‡¶ø‡ßü‡¶® ‡¶Ü‡¶®‡ßç‡¶¶‡ßã‡¶≤‡¶®‡ßá‡¶∞ ‡¶Ö‡¶®‡ßç‡¶Ø‡¶§‡¶Æ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶®‡¶æ‡ßü‡¶ï
2,/kaggle/input/denoiser-for-benchmark/raw/train...,30991851,train,noisy,‡¶ó‡¶∞‡ßÄ‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑‡¶ï‡ßá ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá‡¶á ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,4.320,-56.581566,‡¶ó‡ßú‡¶ø ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡ßá‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡¶ø ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç‡¶õ‡¶§...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,...,"['S', 'S', 'S', 'C', 'S', 'C', 'C', 'C', 'C', ...",36,"['I', 'S', 'S', 'C', 'C', 'S', 'C', 'C', 'C', ...",33,"['I', 'S', 'S', 'C', 'C', 'S', 'C', 'C', 'C', ...",33,‡¶ó‡ßú‡¶ø ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡ßá‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡¶ø ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç‡¶õ‡¶§...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...,‡¶ó‡¶∞‡¶ø‡¶¨ ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶ï‡¶ø ‡¶∏‡ßá ‡¶Æ‡¶æ‡¶®‡ßÅ‡¶∑ ‡¶¨‡¶≤‡ßá ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡ßá ‡¶®‡¶æ ‡¶è‡¶¨‡¶Ç ‡¶§‡ßÅ‡¶ö‡ßç...
3,/kaggle/input/denoiser-for-benchmark/raw/train...,30992090,train,noisy,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶è ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,4.284,-29.025093,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶è ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,...,"['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C']",0,"['C', 'C', 'C', 'C', 'D', 'C', 'C', 'C']",12,"['C', 'C', 'C', 'C', 'D', 'C', 'C', 'C']",12,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶è ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø,‡¶¨‡ßç‡¶Ø‡¶æ‡¶ü‡ßá‡¶∞ ‡¶¨‡ßá‡¶∂ ‡¶¶‡ßÇ‡¶∞ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡ßÅ‡¶á‡¶Ç ‡¶ï‡¶∞‡¶§‡ßá‡¶® ‡¶§‡¶ø‡¶®‡¶ø
4,/kaggle/input/denoiser-for-benchmark/raw/train...,30992661,train,noisy,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡ßÄ‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂‡ßç...,5.760,-32.322632,‡¶Ö‡¶ß‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡ßÄ ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡ßÄ‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂‡ßç...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...,...,"['S', 'C', 'C', 'C', 'C', 'I', 'S', 'C', 'C']",33,"['C', 'C', 'C', 'I', 'S', 'C', 'I', 'S', 'C', ...",40,"['C', 'C', 'C', 'I', 'S', 'C', 'I', 'S', 'C', ...",40,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡ßÄ‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂‡ßç...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...,‡¶Ö‡¶¶‡ßç‡¶Ø‡¶æ‡¶¨‡¶ß‡¶ø ‡¶è ‡¶∏‡¶Ç‡¶ó‡ßç‡¶∞‡¶π‡¶ü‡¶ø ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶∂‡ßç‡¶∞‡ßá‡¶£‡¶ø‡¶∞ ‡¶ï‡ßç‡¶∞‡¶ø‡¶ï‡ßá‡¶ü‡ßá ‡¶¨‡¶ø‡¶∂...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9931,/kaggle/input/denoiser-for-benchmark/raw/train...,31459116,train,noisy,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,5.580,-24.213869,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶†‡¶®‡¶æ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,...,"['C', 'C', 'S', 'C', 'C']",20,"['C', 'C', 'I', 'I', 'S', 'C', 'C']",42,"['C', 'C', 'I', 'I', 'S', 'C', 'C']",42,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶ü‡¶®‡¶æ‡¶∞‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®,‡¶§‡¶ø‡¶®‡¶ø ‡¶¨‡¶ó‡ßÅ‡ßú‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶† ‡¶®‡¶æ ‡¶™‡¶æ‡ßú‡¶æ‡ßü ‡¶ú‡¶®‡ßç‡¶Æ‡¶ó‡ßç‡¶∞‡¶π‡¶£ ‡¶ï‡¶∞‡ßá‡¶®
9932,/kaggle/input/denoiser-for-benchmark/raw/train...,31459130,train,noisy,‡¶§‡¶¨‡ßá ‡¶ú‡¶®‡ßç‡¶°‡¶ø‡¶∏ ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶Å‡¶ö‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü ‡¶Ü‡¶õ‡ßá,6.120,-28.318727,‡¶§‡¶¨‡ßá ‡¶õ‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,...,"['C', 'S', 'C', 'S', 'C', 'C', 'C', 'I', 'S', ...",50,"['C', 'I', 'S', 'C', 'S', 'C', 'C', 'C', 'C', ...",40,"['C', 'I', 'S', 'C', 'S', 'C', 'C', 'C', 'C', ...",40,‡¶§‡¶¨‡ßá ‡¶∞‡ßã‡¶Æ‡¶æ‡¶®‡ßç‡¶ü‡¶ø‡¶ï ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞ ‡¶®...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...,‡¶§‡¶¨‡ßá ‡¶∏‡¶Æ‡¶æ‡¶® ‡¶ü‡¶ø‡¶ú ‡¶•‡ßá‡¶ï‡ßá ‡¶¨‡ßá‡¶∂‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶¶‡ßá‡¶∞ ‡¶ï‡¶ø‡¶õ‡ßÅ ‡¶ï‡¶∞‡¶£‡ßÄ‡ßü...
9933,/kaggle/input/denoiser-for-benchmark/raw/train...,31459161,train,noisy,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶ï‡¶æ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶Ö‡¶ï...,6.552,-32.994953,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶è...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,...,"['C', 'C', 'S', 'S', 'C', 'C', 'C', 'S', 'C', ...",36,"['C', 'C', 'S', 'C', 'C', 'C', 'C', 'S', 'C', ...",27,"['C', 'C', 'S', 'C', 'C', 'C', 'C', 'S', 'C', ...",27,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...,‡¶§‡¶ø‡¶®‡¶ø ‡¶Ö‡¶∏‡¶Æ ‡¶â‡¶™‡¶§‡ßç‡¶Ø‡¶æ‡¶ï‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ‡¶™‡ßÅ‡¶∞‡¶∏‡ßç‡¶ï‡¶æ‡¶∞ ‡¶ì ‡¶∏‡¶æ‡¶π‡¶ø‡¶§‡ßç‡¶Ø ...
9934,/kaggle/input/denoiser-for-benchmark/raw/train...,31637719,train,noisy,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤‡ßã,3.744,-13.999677,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,...,"['S', 'C', 'C', 'S', 'S']",60,"['C', 'C', 'C', 'C', 'S']",20,"['C', 'C', 'C', 'C', 'S']",20,‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶Æ‡ßÅ‡¶ö‡¶ø ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤,‡¶è‡¶ü‡¶ø ‡¶™‡ßÇ‡¶∞‡ßç‡¶¨‡ßá ‡¶Ö‡¶®‡ßá‡¶ï ‡¶â‡¶Å‡¶ö‡ßÅ ‡¶õ‡¶ø‡¶≤


In [20]:
df_raw[df_raw.sentence.duplicated() == True]

Unnamed: 0,path,wav2vec2,arpa_4gram,sentence,wav2vec2_label,arpa_label,wer_wav2vec2,wer_arpa
377,slr_009d6e8214.flac,‡¶≠‡ßã‡¶ü ‡¶π‡¶¨‡ßá ‡¶ö‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶´‡ßá‡¶¨‡ßç‡¶∞‡ßÅ‡ßü‡¶æ‡¶∞‡¶ø,‡¶≠‡ßã‡¶ü ‡¶π‡¶¨‡ßá ‡¶ö‡¶¨‡ßç‡¶¨‡¶ø‡¶∂ ‡¶´‡ßá‡¶¨‡ßç‡¶∞‡ßÅ‡ßü‡¶æ‡¶∞‡¶ø,‡¶≠‡ßã‡¶ü ‡¶π‡¶¨‡ßá ‡¶´‡ßá‡¶¨‡ßç‡¶∞‡ßÅ‡ßü‡¶æ‡¶∞‡¶ø,"['C', 'C', 'I', 'C']","['C', 'C', 'I', 'C']",25,25
790,slr_0168dba8fc.flac,‡¶æ‡¶∞‡ßç‡¶ï‡¶ø‡¶® ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶∏‡ßç‡¶ü‡ßç‡¶∞ ‡¶ö,‡¶Æ‡¶æ‡¶∞‡ßç‡¶ï‡¶ø‡¶® ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶∑‡ßç‡¶ü‡ßç‡¶∞,‡¶Æ‡¶æ‡¶∞‡ßç‡¶ï‡¶ø‡¶® ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶∑‡ßç‡¶ü‡ßç‡¶∞‡ßá‡¶∞ ‡¶ö‡ßá‡ßü‡ßá,"['S', 'S', 'S']","['C', 'D', 'S']",100,66
806,slr_016c334007.flac,‡¶∏‡ßã‡¶≤‡¶∏‡ßç‡¶Ø ‡¶Ö‡¶®‡¶® ‡¶¨‡ßà‡¶∂‡¶æ‡¶≤‡ßá,‡¶∏‡ßã‡¶≤‡¶∏ ‡¶Ö‡¶®‡¶® ‡¶¨‡¶á ‡¶∂‡¶æ‡¶≤‡ßá,‡¶∏‡¶æ‡¶≤‡ßá,"['I', 'I', 'S']","['I', 'I', 'I', 'S']",100,100
823,slr_01750a3785.flac,‡¶ï‡¶¨‡¶ø ‡¶ì ‡¶™‡ßç‡¶∞‡¶æ‡¶¨‡¶®‡ßç‡¶ß‡¶ø‡¶ï ‡¶∞‡¶æ‡¶ú‡¶ì ‡¶Ü‡¶≤‡¶æ ‡¶â‡¶¶‡ßç‡¶¶‡ßÄ‡¶®,‡¶ï‡¶¨‡¶ø ‡¶ì ‡¶™‡ßç‡¶∞‡¶æ‡¶¨‡¶®‡ßç‡¶ß‡¶ø‡¶ï ‡¶∞‡¶æ‡¶ú‡ßÅ ‡¶Ü‡¶≤‡¶æ ‡¶â‡¶¶‡ßç‡¶¶‡¶ø‡¶®,‡¶ï‡¶¨‡¶ø ‡¶ì ‡¶™‡ßç‡¶∞‡¶æ‡¶¨‡¶®‡ßç‡¶ß‡¶ø‡¶ï ‡¶∞‡¶æ‡¶ú‡ßÅ ‡¶Ü‡¶≤‡¶æ‡¶â‡¶¶‡ßç‡¶¶‡¶ø‡¶®,"['C', 'C', 'C', 'I', 'S', 'S']","['C', 'C', 'C', 'C', 'I', 'S']",50,33
890,slr_018db2e3b4.flac,‡¶¶‡¶ï‡ßç‡¶∑‡¶ø‡¶£ ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶Æ‡¶π‡¶æ‡¶∏‡¶æ‡¶ó‡¶∞‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶æ‡ßü ‡¶ö‡¶æ‡¶∞‡ßç‡¶∂ ‡¶¨‡¶∞‡ßç‡¶ó ‡¶ï‡¶ø‡¶≤‡ßã‡¶Æ‡¶ø‡¶ü...,‡¶¶‡¶ï‡ßç‡¶∑‡¶ø‡¶£ ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶Æ‡¶π‡¶æ‡¶∏‡¶æ‡¶ó‡¶∞‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶æ‡ßü ‡¶ö‡¶æ‡¶∞‡ßç‡¶Æ ‡¶¨‡¶∞‡ßç‡¶ó ‡¶ï‡¶ø‡¶≤‡ßã‡¶Æ‡¶ø‡¶ü...,‡¶¶‡¶ï‡ßç‡¶∑‡¶ø‡¶£ ‡¶≠‡¶æ‡¶∞‡¶§ ‡¶Æ‡¶π‡¶æ‡¶∏‡¶æ‡¶ó‡¶∞‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶æ‡ßü ‡¶ö‡¶æ‡¶∞‡¶∂‡ßã ‡¶¨‡¶∞‡ßç‡¶ó‡¶ï‡¶ø‡¶≤‡ßã‡¶Æ‡¶ø‡¶ü‡¶æ...,"['C', 'C', 'C', 'C', 'I', 'S', 'S', 'C', 'C', ...","['C', 'C', 'C', 'C', 'I', 'S', 'S', 'C', 'C', ...",40,33
...,...,...,...,...,...,...,...,...
197829,common_voice_bn_31365386.mp3,‡¶ï‡¶æ‡¶∞‡¶£ ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶∑‡ßç‡¶ü‡ßç‡¶∞‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶ï‡¶æ‡¶∞‡¶ü‡¶ø ‡¶∞‡¶æ‡¶ú‡ßç‡¶Ø ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶ú...,‡¶ï‡¶æ‡¶∞‡¶£ ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶∑‡ßç‡¶ü‡ßç‡¶∞‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶ï‡¶æ‡¶∞‡¶ü‡¶ø ‡¶∞‡¶æ‡¶ú‡ßç‡¶Ø ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶ú...,‡¶ï‡¶æ‡¶∞‡¶£ ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶∑‡ßç‡¶ü‡ßç‡¶∞‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶§‡ßá‡¶∞‡¶ü‡¶ø ‡¶∞‡¶æ‡¶ú‡ßç‡¶Ø ‡¶Ø‡ßÅ‡¶ï‡ßç‡¶§‡¶∞‡¶æ‡¶ú...,"['C', 'C', 'C', 'S', 'C', 'C', 'C', 'C', 'C']","['C', 'C', 'C', 'S', 'C', 'C', 'C', 'C', 'C']",11,11
197831,common_voice_bn_31378206.mp3,‡¶è‡¶á ‡¶ú‡ßá‡¶≤‡¶æ‡¶ü‡¶ø ‡¶Ü‡ßü‡¶§‡¶®‡ßá‡¶∞ ‡¶¶‡¶ø‡¶ï ‡¶•‡ßá‡¶ï‡ßá ‡¶™‡¶∂‡ßç‡¶ö‡¶ø‡¶Æ‡¶∞‡¶ô‡ßç‡¶ó‡ßá‡¶∞ ‡¶¨‡ßÉ‡¶π‡¶§‡ßç‡¶§‡¶Æ...,‡¶è‡¶á ‡¶ú‡ßá‡¶≤‡¶æ‡¶ü‡¶ø ‡¶Ü‡ßü‡¶§‡¶®‡ßá‡¶∞ ‡¶¶‡¶ø‡¶ï ‡¶•‡ßá‡¶ï‡ßá ‡¶™‡¶∂‡ßç‡¶ö‡¶ø‡¶Æ‡¶¨‡¶ô‡ßç‡¶ó‡ßá‡¶∞ ‡¶¨‡ßÉ‡¶π‡¶§‡ßç‡¶§‡¶Æ...,‡¶è‡¶á ‡¶ú‡ßá‡¶≤‡¶æ‡¶ü‡¶ø ‡¶Ü‡ßü‡¶§‡¶®‡ßá‡¶∞ ‡¶¶‡¶ø‡¶ï ‡¶•‡ßá‡¶ï‡ßá ‡¶™‡¶∂‡ßç‡¶ö‡¶ø‡¶Æ‡¶¨‡¶ô‡ßç‡¶ó‡ßá‡¶∞ ‡¶¨‡ßÉ‡¶π‡¶§‡ßç‡¶§‡¶Æ...,"['C', 'C', 'C', 'C', 'C', 'S', 'C', 'C', 'C', ...","['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', ...",14,0
197832,common_voice_bn_31403271.mp3,‡¶è‡¶á ‡¶∏‡¶æ‡¶Æ‡¶∞‡¶æ‡¶ú‡ßç‡¶Ø‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶•‡¶æ‡¶® ‡¶ó‡¶§ ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡ßÉ‡¶∂‡ßç‡¶ü‡¶ø‡¶Ø‡ßÅ‡¶¶‡ßç‡¶ß‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß...,‡¶è‡¶á ‡¶∏‡¶æ‡¶Æ‡ßç‡¶∞‡¶æ‡¶ú‡ßç‡¶Ø‡ßá‡¶∞ ‡¶™‡ßç‡¶∞‡¶ß‡¶æ‡¶® ‡¶ó‡¶§ ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡ßÉ‡¶∂‡ßç‡¶ü‡¶ø‡¶Ø‡ßÅ‡¶¶‡ßç‡¶ß‡ßá‡¶∞ ‡¶Æ‡¶æ...,‡¶è‡¶á ‡¶∏‡¶æ‡¶Æ‡ßç‡¶∞‡¶æ‡¶ú‡ßç‡¶Ø‡ßá‡¶∞ ‡¶™‡¶§‡¶® ‡¶ò‡¶ü‡ßá ‡¶™‡ßç‡¶∞‡¶•‡¶Æ ‡¶¨‡¶ø‡¶∂‡ßç‡¶¨‡¶Ø‡ßÅ‡¶¶‡ßç‡¶ß‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç...,"['C', 'S', 'S', 'S', 'C', 'S', 'C', 'S', 'S', ...","['C', 'C', 'S', 'S', 'C', 'S', 'C', 'S', 'C', ...",70,29
197834,common_voice_bn_31448771.mp3,‡¶ú‡ßç‡¶Ø‡¶æ‡¶ô‡ßç‡¶ó ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó ‡¶Ö‡¶∏‡ßç‡¶¨‡ßÄ‡¶ï‡¶æ‡¶∞ ‡¶ï‡¶∞‡ßá,‡¶ú‡ßç‡¶Ø‡¶æ‡¶ô‡ßç‡¶ó ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó ‡¶Ö‡¶∏‡ßç‡¶¨‡ßÄ‡¶ï‡¶æ‡¶∞ ‡¶ï‡¶∞‡ßá,‡¶ú‡ßç‡¶Ø‡¶æ‡¶ô‡ßç‡¶ó‡ßã ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó ‡¶Ö‡¶∏‡ßç‡¶¨‡ßÄ‡¶ï‡¶æ‡¶∞ ‡¶ï‡¶∞‡ßá,"['S', 'C', 'C', 'C']","['S', 'C', 'C', 'C']",25,25


In [27]:
df_sen = df_prev.sentence.tolist()

In [29]:
df_raw = df_raw[~df_raw['sentence'].isin(df_sen)]

In [31]:
df_masked = df_masked[~df_masked['sentence'].isin(df_sen)]

In [33]:
df_masked.to_csv('data/train/masked.csv',index=False)

In [34]:
df_raw.to_csv('data/train/raw.csv',index=False)

In [6]:
df_raw = pd.read_csv('data/train/raw.csv')
df_masked = pd.read_csv('data/train/masked.csv')

In [7]:
df_masked 

Unnamed: 0,wav2vec2,masked_wav2vec2,error_word,correct_word,sentence
0,‡¶π‡¶æ‡¶∏‡¶ø ‡¶¨‡¶ø‡¶∑‡¶≤ ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶π‡¶æ‡¶∏‡¶ø [MASK] ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá,‡¶¨‡¶ø‡¶∑‡¶≤,‡¶¨‡¶ø‡¶∑‡ßü,‡¶π‡¶æ‡¶∏‡¶ø‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü ‡¶π‡ßü‡ßá‡¶á ‡¶Ü‡¶õ‡ßá
1,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ ‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,‡¶∂‡¶æ‡¶ï ‡¶¶‡ßá‡¶∂ [MASK],‡¶ó‡ßÅ‡¶∞‡ßÅ‡¶§‡ßá,,‡¶∏‡¶æ‡¶∞‡ßç‡¶ï ‡¶¶‡ßá‡¶∂‡¶ó‡ßÅ‡¶≤‡ßã‡¶§‡ßá
2,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,[MASK] ‡¶ú‡¶æ‡¶¶‡ßá,‡¶∂‡¶π‡¶ú‡¶ø,‡¶∏‡¶π‡¶ú‡ßá‡¶á,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü
3,‡¶∂‡¶π‡¶ú‡¶ø ‡¶ú‡¶æ‡¶¶‡ßá,‡¶∂‡¶π‡¶ú‡¶ø [MASK],‡¶ú‡¶æ‡¶¶‡ßá,‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ,‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶Ø‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡ßü
4,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ‡¶Æ,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§ ‡¶ï‡¶∞‡ßá‡¶õ‡ßá [MASK],‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ‡¶Æ,‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ,‡¶Ö‡¶®‡ßç‡¶§‡¶∞‡ßç‡¶≠‡ßÅ‡¶ï‡ßç‡¶§‡¶ì ‡¶ï‡¶∞‡ßá‡¶õ‡ßá ‡¶∞‡¶æ‡¶∂‡¶ø‡ßü‡¶æ
...,...,...,...,...,...
248769,‡¶∞‡ßá‡¶°‡¶ø‡¶ì ‡¶¨‡¶ø‡¶ï‡¶ø‡¶∞‡¶£‡ßá‡¶∞ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶Ö‡¶Ç‡¶∂ ‡¶Ü‡¶™‡ßá‡¶ï‡ßç‡¶∑‡¶ø‡¶ï ‡¶ú‡ßá‡¶ü ‡¶•‡ßá‡¶ï‡ßá ‡¶Ü‡¶∏‡ßá‡¶Ø...,‡¶∞‡ßá‡¶°‡¶ø‡¶ì ‡¶¨‡¶ø‡¶ï‡¶ø‡¶∞‡¶£‡ßá‡¶∞ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶Ö‡¶Ç‡¶∂ ‡¶Ü‡¶™‡ßá‡¶ï‡ßç‡¶∑‡¶ø‡¶ï ‡¶ú‡ßá‡¶ü ‡¶•‡ßá‡¶ï‡ßá ‡¶Ü‡¶∏‡ßá‡¶Ø...,‡¶™‡¶¶‡¶∞‡ßç‡¶∂‡¶®,‡¶™‡¶∂‡ßç‡¶∞‡¶®,‡¶∞‡ßá‡¶°‡¶ø‡¶ì ‡¶¨‡¶ø‡¶ï‡¶ø‡¶∞‡¶£‡ßá‡¶∞ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶Ö‡¶Ç‡¶∂ ‡¶Ü‡¶™‡ßá‡¶ï‡ßç‡¶∑‡¶ø‡¶ï ‡¶ú‡ßá‡¶ü ‡¶•‡ßá‡¶ï‡ßá ‡¶Ü‡¶∏‡ßá ...
248770,‡¶Ü‡¶¨‡ßç‡¶¶‡ßÅ‡¶≤ ‡¶Æ‡¶æ‡¶≤‡ßá‡¶ï ‡¶Æ‡ßü‡¶Æ‡¶®‡¶∏‡¶ø‡¶ô‡¶π ‡¶ú‡ßá‡¶≤‡¶æ‡¶∞ ‡¶Æ‡ßÅ‡¶ï‡ßç‡¶§‡¶ø‡¶Ø‡ßÅ‡¶¶‡ßç‡¶ß‡ßá‡¶∞ ‡¶∏‡¶Ç‡¶ó‡¶†...,‡¶Ü‡¶¨‡ßç‡¶¶‡ßÅ‡¶≤ ‡¶Æ‡¶æ‡¶≤‡ßá‡¶ï [MASK] ‡¶ú‡ßá‡¶≤‡¶æ‡¶∞ ‡¶Æ‡ßÅ‡¶ï‡ßç‡¶§‡¶ø‡¶Ø‡ßÅ‡¶¶‡ßç‡¶ß‡ßá‡¶∞ ‡¶∏‡¶Ç‡¶ó‡¶†‡¶ï ...,‡¶Æ‡ßü‡¶Æ‡¶®‡¶∏‡¶ø‡¶ô‡¶π,‡¶Æ‡ßü‡¶Æ‡¶®‡¶∏‡¶ø‡¶Ç‡¶π,‡¶Ü‡¶¶‡ßç‡¶¨‡ßÅ‡¶≤ ‡¶Æ‡¶æ‡¶≤‡ßá‡¶ï ‡¶Æ‡ßü‡¶Æ‡¶®‡¶∏‡¶ø‡¶Ç‡¶π ‡¶ú‡ßá‡¶≤‡¶æ‡¶∞ ‡¶Æ‡ßÅ‡¶ï‡ßç‡¶§‡¶ø‡¶Ø‡ßÅ‡¶¶‡ßç‡¶ß‡ßá‡¶∞ ‡¶∏‡¶Ç‡¶ó‡¶†...
248771,‡¶§‡¶æ‡¶∞ ‡¶ú‡ßü‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶Æ‡¶§‡ßã ‡¶ï‡ßã‡¶® ‡¶§‡ßÉ‡¶®‡¶Æ‡ßÇ‡¶≤ ‡¶ï‡¶Ç‡¶ó...,‡¶§‡¶æ‡¶∞ ‡¶ú‡ßü‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶Æ‡¶§‡ßã ‡¶ï‡ßã‡¶® ‡¶§‡ßÉ‡¶®‡¶Æ‡ßÇ‡¶≤ ‡¶ï‡¶Ç‡¶ó...,‡¶™‡¶æ‡¶∞‡ßç‡¶•‡¶ø,‡¶™‡¶•‡ßç‡¶∞‡ßÄ,‡¶§‡¶æ‡¶∞ ‡¶ú‡ßü‡ßá‡¶∞ ‡¶Æ‡¶æ‡¶ß‡ßç‡¶Ø‡¶Æ‡ßá ‡¶™‡ßç‡¶∞‡¶•‡¶Æ‡¶¨‡¶æ‡¶∞‡ßá‡¶∞ ‡¶Æ‡¶§ ‡¶ï‡ßã‡¶®‡ßã ‡¶§‡ßÉ‡¶£‡¶Æ‡ßÇ‡¶≤ ‡¶ï‡¶Ç‡¶ó...
248772,‡¶è‡¶ñ‡¶æ‡¶®‡ßá ‡¶≤‡ßç‡¶¨‡¶Ø‡¶æ‡¶ü ‡¶¨‡ßç‡¶Ø‡¶æ‡¶Ç‡ßç‡¶ï ‡¶™‡¶∞‡¶ø‡¶∑‡ßá‡¶¨‡¶æ ‡¶¨‡¶∞‡ßç‡¶§‡¶Æ‡¶æ‡¶®,‡¶è‡¶ñ‡¶æ‡¶®‡ßá [MASK] ‡¶¨‡ßç‡¶Ø‡¶æ‡¶Ç‡ßç‡¶ï ‡¶™‡¶∞‡¶ø‡¶∑‡ßá‡¶¨‡¶æ ‡¶¨‡¶∞‡ßç‡¶§‡¶Æ‡¶æ‡¶®,‡¶≤‡ßç‡¶¨‡¶Ø‡¶æ‡¶ü,‡¶≤‡ßç‡¶¨‡¶æ‡¶°,‡¶è‡¶ñ‡¶æ‡¶®‡ßá ‡¶≤‡ßç‡¶¨‡¶æ‡¶° ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ô‡ßç‡¶ï ‡¶™‡¶∞‡¶ø‡¶∑‡ßá‡¶¨‡¶æ ‡¶¨‡¶∞‡ßç‡¶§‡¶Æ‡¶æ‡¶®


# Custom_data_prothom_alo

In [8]:
df_prothom_alo = pd.read_csv('bangla_text_data.csv')

df_prothom_alo= df_prothom_alo.dropna(how='any')

In [9]:
def get_len(sen):
    sen = sen.split()
    l = len(sen)
    return l

In [10]:
#df_prothom_alo = df_prothom_alo.sample(300000)

In [11]:
df_prothom_alo['lena'] = df_prothom_alo['sentence'].apply(lambda x: get_len(str(x)) )

In [12]:
df_prothom_alo.lena.value_counts()

6      790899
5      776612
7      758972
4      708804
8      687480
        ...  
91          1
170         1
217         1
140         1
107         1
Name: lena, Length: 144, dtype: int64

In [13]:
df_prothom_alo = df_prothom_alo[(df_prothom_alo.lena >= 3) & (df_prothom_alo.lena <= 15)]

In [14]:
#df_prothom_alo = df_prothom_alo.sample(1000)

In [15]:
df_prothom_alo.lena.value_counts()

6     790899
5     776612
7     758972
4     708804
8     687480
9     601139
3     592372
10    509464
11    418943
12    341326
13    276272
14    216316
15    172859
Name: lena, dtype: int64

In [16]:
#df_prothom_alo.to_csv('custom_data_for_bert.csv',index=False)

In [17]:
#df_prothom_alo = pd.read_csv('custom_data_for_bert.csv')

In [18]:
df_prothom_alo

Unnamed: 0,sentence,lena
0,‡¶™‡¶∂‡ßç‡¶ö‡¶ø‡¶Æ‡¶¨‡¶ô‡ßç‡¶ó‡ßá‡¶∞ ‡¶¨‡¶∞‡¶ñ‡¶æ‡¶∏‡ßç‡¶§ ‡¶π‡¶ì‡ßü‡¶æ ‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶™‡¶æ‡¶∞‡ßç‡¶• ‡¶ö‡¶ü‡ßç‡¶ü‡ßã‡¶™...,9
1,‡¶Ü‡¶∞ ‡¶ú‡¶æ‡¶Æ‡¶ø‡¶®‡ßá‡¶∞ ‡¶Ü‡¶¨‡ßá‡¶¶‡¶® ‡¶ï‡¶∞‡¶≤‡ßá‡¶® ‡¶®‡¶æ ‡¶Ö‡¶∞‡ßç‡¶™‡¶ø‡¶§‡¶æ,6
3,‡¶∏‡ßá‡¶ñ‡¶æ‡¶®‡ßá ‡¶™‡¶æ‡¶∞‡ßç‡¶•‡¶∞ ‡¶Ü‡¶á‡¶®‡¶ú‡ßÄ‡¶¨‡ßÄ ‡¶ú‡¶æ‡¶Æ‡¶ø‡¶®‡ßá‡¶∞ ‡¶Ü‡¶¨‡ßá‡¶¶‡¶® ‡¶ú‡¶æ‡¶®‡¶æ‡¶≤‡ßá‡¶ì ‡¶Ö‡¶∞...,13
4,‡¶Ö‡¶∞‡ßç‡¶™‡¶ø‡¶§‡¶æ ‡¶Æ‡ßÅ‡¶ñ‡¶æ‡¶∞‡ßç‡¶ú‡¶ø ‡¶ï‡¶æ‡¶∞‡¶æ‡¶ó‡¶æ‡¶∞‡ßá‡¶á,3
7,‡¶§‡¶æ‡¶Å‡¶∞ ‡¶ú‡ßÄ‡¶¨‡¶® ‡¶∏‡¶Ç‡¶∂‡ßü‡ßá‡¶∞ ‡¶Ü‡¶∂‡¶ô‡ßç‡¶ï‡¶æ ‡¶∞‡ßü‡ßá‡¶õ‡ßá,5
...,...,...
8418466,‡¶π‡¶§‡ßç‡¶Ø‡¶æ ‡¶ì ‡¶ó‡¶£‡¶π‡¶§‡ßç‡¶Ø‡¶æ‡¶∏‡¶π ‡¶Æ‡¶æ‡¶®‡¶¨‡¶§‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ö‡¶™‡¶∞‡¶æ‡¶ß‡ßá‡¶∞ ‡¶™‡¶æ‡¶Å‡¶ö‡¶ü‡¶ø ...,9
8418467,‡¶™‡¶æ‡¶Å‡¶ö‡¶ü‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó‡¶á ‡¶ü‡ßç‡¶∞‡¶æ‡¶á‡¶¨‡ßç‡¶Ø‡ßÅ‡¶®‡¶æ‡¶≤‡ßá ‡¶™‡ßç‡¶∞‡¶Æ‡¶æ‡¶£‡¶ø‡¶§ ‡¶π‡ßü‡ßá‡¶õ‡ßá,5
8418468,‡¶ö‡¶æ‡¶∞‡¶ü‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó‡ßá ‡¶§‡¶æ‡¶Å‡¶ï‡ßá ‡¶Æ‡ßÉ‡¶§‡ßç‡¶Ø‡ßÅ‡¶¶‡¶£‡ßç‡¶° ‡¶¶‡ßá‡¶ì‡ßü‡¶æ ‡¶π‡ßü,6
8418469,‡¶è‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶ó‡ßç‡¶∞‡ßÅ‡¶™‡¶ï‡ßá ‡¶ó‡¶£‡¶π‡¶§‡ßç‡¶Ø‡¶æ‡¶∞ ‡¶¶‡¶æ‡ßü‡ßá‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü‡¶ü...,8


In [19]:
character = ['‡¶Ö','‡¶Ü','‡¶á','‡¶à','‡¶â','‡¶ä','‡¶è','‡¶ê','‡¶ì','‡¶î','‡¶ã','‡¶É','‡¶Ç','‡ßé'
             '‡¶ï','‡¶ñ','‡¶ó','‡¶ò','‡¶ô','‡¶ö', '‡¶õ','‡¶Ø','‡¶ú', '‡¶ù','‡¶ü','‡¶†','‡¶°','‡¶∞','‡ßú','‡ßù',
             '‡¶£','‡¶§','‡¶•','‡¶¶','‡¶ß','‡¶®','‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶∏','‡¶∂','‡¶∑','‡ßü','‡¶π']

In [20]:
JuktakkhorList = [' ‡¶ï‡ßç‡¶ü ' , ' ‡¶ï‡ßç‡¶ï ' , ' ‡¶ï‡ßç‡¶§ ' , ' ‡¶ï‡ßç‡¶Ø ' , ' ‡¶ï‡ßç‡¶∞ ' , ' ‡¶ï‡ßç‡¶≤ ' , ' ‡¶ï‡ßç‡¶∑ ' , ' ‡¶ï‡ßç‡¶∑‡ßç‡¶£ ' , ' ‡¶ï‡ßç‡¶∑‡ßç‡¶Æ ' , ' ‡¶ï‡ßç‡¶∑‡ßç‡¶Ø ' , ' ‡¶ï‡ßç‡¶∏ ' , ' ‡¶ñ‡ßç‡¶∞ ' , ' ‡¶ó‡ßç‡¶ß ' , ' ‡¶ó‡ßç‡¶ß‡ßç‡¶Ø ' , ' ‡¶ó‡ßç‡¶® ' , ' ‡¶ó‡ßç‡¶®‡ßç‡¶Ø ' , ' ‡¶ó‡ßç‡¶¨ ' , ' ‡¶ó‡ßç‡¶∞ ' , ' ‡¶ó‡ßç‡¶∞‡ßç‡¶Ø ' , ' ‡¶ó‡ßç‡¶≤ ' , ' ‡¶ò‡ßç‡¶® ' , ' ‡¶ò‡ßç‡¶∞ ' , ' ‡¶ô‡ßç‡¶ï‡ßç‡¶Ø ' , ' ‡¶ô‡ßç‡¶ó‡ßç‡¶Ø ' , ' ‡¶ö‡ßç‡¶ö ' , ' ‡¶ö‡ßç‡¶õ‡ßç‡¶¨ ' , ' ‡¶ö‡ßç‡¶Ø ' , ' ‡¶ú‡ßç‡¶ú ' , ' ‡¶ú‡ßç‡¶ú‡ßç‡¶¨ ' , ' ‡¶ü‡ßç‡¶ü ' , ' ‡¶ú‡ßç‡¶¨ ' , ' ‡¶ú‡ßç‡¶Ø ' , ' ‡¶ú‡ßç‡¶∞ ' , ' ‡¶ü‡ßç‡¶Ø ' , ' ‡¶ü‡ßç‡¶∞ ' , ' ‡¶°‡ßç‡¶° ' , ' ‡¶°‡ßç‡¶∞ ' , ' ‡¶£‡ßç‡¶ü ' , ' ‡¶£‡ßç‡¶† ' , ' ‡¶£‡ßç‡¶° ' , ' ‡¶£‡ßç‡¶£ ' , ' ‡¶£‡ßç‡¶Ø ' , ' ‡ßé‡¶ï ' , ' ‡ßé‡¶ñ ' , ' ‡¶§‡ßç‡¶§ ' , ' ‡¶§‡ßç‡¶§‡ßç‡¶¨ ' , ' ‡¶§‡ßç‡¶§‡ßç‡¶Ø ' , ' ‡¶§‡ßç‡¶® ' , ' ‡ßé‡¶™ ' , ' ‡¶§‡ßç‡¶¨ ' , ' ‡¶§‡ßç‡¶Æ ' , ' ‡¶§‡ßç‡¶Æ‡ßç‡¶Ø ' , ' ‡¶§‡ßç‡¶Ø ' , ' ‡¶§‡ßç‡¶∞ ' , ' ‡¶§‡ßç‡¶∞‡ßç‡¶Ø ' , ' ‡ßé‡¶∏ ' , ' ‡¶¶‡ßç‡¶ò ' , ' ‡¶¶‡ßç‡¶¶ ' , ' ‡¶¶‡ßç‡¶ß ' , ' ‡¶¶‡ßç‡¶¨ ' , ' ‡¶¶‡ßç‡¶≠ ' , ' ‡¶¶‡ßç‡¶≠‡ßç‡¶∞ ' , ' ‡¶¶‡ßç‡¶Æ ' , ' ‡¶¶‡ßç‡¶Ø ' , ' ‡¶¶‡ßç‡¶∞ ' , ' ‡¶¶‡ßç‡¶∞‡ßç‡¶Ø ' , ' ‡¶ß‡ßç‡¶¨ ' , ' ‡¶ß‡ßç‡¶Ø ' , ' ‡¶ß‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶ü ' , ' ‡¶®‡ßç‡¶ü‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶† ' , ' ‡¶®‡ßç‡¶° ' , ' ‡¶®‡ßç‡¶°‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶§ ' , ' ‡¶®‡ßç‡¶§‡ßç‡¶¨ ' , ' ‡¶®‡ßç‡¶§‡ßç‡¶Ø ' , ' ‡¶®‡ßç‡¶§‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßç‡¶Ø ' , ' ‡¶®‡ßç‡¶• ' , ' ‡¶®‡ßç‡¶¶ ' , ' ‡¶®‡ßç‡¶¶‡ßç‡¶¨ ' , ' ‡¶®‡ßç‡¶¶‡ßç‡¶∞ ' , ' ‡¶®‡ßç‡¶ß ' , ' ‡¶®‡ßç‡¶® ' , ' ‡¶®‡ßç‡¶Ø ' , ' ‡¶™‡ßç‡¶ü ' , ' ‡¶™‡ßç‡¶§ ' , ' ‡¶™‡ßç‡¶® ' , ' ‡¶™‡ßç‡¶™ ' , ' ‡¶™‡ßç‡¶Ø ' , ' ‡¶¨‡ßç‡¶¶ ' , ' ‡¶¨‡ßç‡¶ß ' , ' ‡¶¨‡ßç‡¶¨ ' , ' ‡¶¨‡ßç‡¶∞ ' , ' ‡¶≠‡ßç‡¶Ø ' , ' ‡¶≠‡ßç‡¶∞ ' , ' ‡¶Æ‡ßç‡¶™‡ßç‡¶∞ ' , ' ‡¶Æ‡ßç‡¶¨ ' , ' ‡¶Æ‡ßç‡¶Æ ' , ' ‡¶Æ‡ßç‡¶Ø ' , ' ‡¶Æ‡ßç‡¶∞ ' , ' ‡¶Ø‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶ï ' , ' ‡¶∞‡ßç‡¶ó‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶ò‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶ú‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶•‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶¨‡ßç‡¶Ø ' , ' ‡¶∞‡ßç‡¶ñ ' , ' ‡¶∞‡ßç‡¶ó ' , ' ‡¶∞‡ßç‡¶ò ' , ' ‡¶∞‡ßç‡¶ö ' , ' ‡¶∞‡ßç‡¶õ ' , ' ‡¶∞‡ßç‡¶ú ' , ' ‡¶∞‡ßç‡¶ù ' , ' ‡¶∞‡ßç‡¶ü ' , ' ‡¶∞‡ßç‡¶° ' , ' ‡¶∞‡ßç‡¶£ ' , ' ‡¶∞‡ßç‡¶§ ' , ' ‡¶∞‡ßç‡¶• ' , ' ‡¶∞‡ßç‡¶¶ ' , ' ‡¶∞‡ßç‡¶¶‡ßç‡¶¨ ' , ' ‡¶∞‡ßç‡¶¶‡ßç‡¶∞ ' , ' ‡¶∞‡ßç‡¶ß ' , ' ‡¶∞‡ßç‡¶ß‡ßç‡¶¨ ' , ' ‡¶≤‡ßç‡¶ü ' , ' ‡¶≤‡ßç‡¶° ' , ' ‡¶≤‡ßç‡¶™ ' , ' ‡¶≤‡ßç‡¶Æ ' , ' ‡¶≤‡ßç‡¶Ø ' , ' ‡¶≤‡ßç‡¶≤ ' , ' ‡¶∂‡ßç‡¶õ ' , ' ‡¶∂‡ßç‡¶® ' , ' ‡¶∂‡ßç‡¶¨ ' , ' ‡¶∂‡ßç‡¶∞ ' , ' ‡¶∂‡ßç‡¶≤ ' , ' ‡¶∑‡ßç‡¶ï ' , ' ‡¶∑‡ßç‡¶ï‡ßç‡¶∞ ' , ' ‡¶∑‡ßç‡¶ü ' , ' ‡¶∑‡ßç‡¶ü‡ßç‡¶Ø ' , ' ‡¶∑‡ßç‡¶ü‡ßç‡¶∞ ' , ' ‡¶∑‡ßç‡¶† ' , ' ‡¶∑‡ßç‡¶™ ' , ' ‡¶∏‡ßç‡¶ü ' , ' ‡¶∏‡ßç‡¶ü‡ßç‡¶∞ ' , ' ‡¶∏‡ßç‡¶§‡ßç‡¶∞ ' , ' ‡¶∏‡ßç‡¶§ ' , ' ‡¶∏‡ßç‡¶•‡ßç‡¶Ø ' , ' ‡¶∏‡ßç‡¶Ø ' , ' ‡¶∏‡ßç‡¶∞ ' , ' ‡¶∏‡ßç‡¶≤ ' , ' ‡¶π‡ßç‡¶∞ ' , ' ‡¶π‡ßç‡¶≤ ' , ' ‡¶ó‡ßç‡¶Æ ' , ' ‡¶Æ‡ßç‡¶≠ ' , ' ‡¶∏‡ßç‡¶Æ ' , ' ‡¶ô‡ßç‡¶ò ' , ' ‡¶ô‡ßç‡¶ï‡ßç‡¶∑ ' , ' ‡¶ô‡ßç‡¶ñ ' , ' ‡¶ô‡ßç‡¶ï ' , ' ‡¶ô‡ßç‚Äå‡¶ï‡ßç‡¶§ ' , ' ‡¶û‡ßç‡¶ö ' , ' ‡¶û‡ßç‡¶õ ' , ' ‡¶û‡ßç‡¶ú ' , ' ‡¶π‡ßç‡¶® ' , ' ‡¶ô‡ßç‡¶ó ' , ' ‡¶ú‡ßç‡¶û ' , ' ‡¶ó‡ßç‡¶Ø ' , ' ‡¶ö‡ßç‡¶õ '] 

In [21]:
ReplaceDict = {
      '‡¶ï' : ['‡¶≤','‡¶Ø'],
      '‡¶ñ' : ['‡¶ï‡¶ó','‡¶ï‡¶ú','‡¶≤‡¶π','‡¶ù'],
      '‡¶ó' : ['‡¶´','‡¶π'],
      '‡¶ò' : ['‡¶´‡¶ó','‡¶π‡¶ú'],
      '‡¶ô' : ['‡¶¨','‡¶Æ'],
      '‡¶ö' : ['‡¶≠','‡¶ö‡¶ú','‡¶ö‡¶ó','‡¶≠‡¶π'],
      '‡¶õ' : ['‡¶≠','‡¶ö‡¶ú','‡¶ö‡¶ó','‡¶≠‡¶π'],
      '‡¶ú' : ['‡¶ï','‡¶π'],
      '‡¶ù' : ['‡¶ï‡¶ú','‡¶π‡¶ó'],
      '‡¶ü' : ['‡¶∞'],
      '‡¶†' : ['‡¶§‡¶ú','‡¶§‡¶ó','‡¶∞‡¶π'],
      '‡¶°' : ['‡¶∏','‡¶´'],
      '‡¶¢' : ['‡¶¶‡¶ú','‡¶¶‡ßç‡¶ó','‡¶∂','‡¶´‡¶π'],
      '‡¶£' : ['‡¶¨','‡¶Æ'],
      '‡¶§' : ['‡¶∞'],
      '‡¶•' : ['‡¶§‡¶ú', '‡¶§‡¶ó', '‡¶∞‡¶π'],
      '‡¶¶' : ['‡¶∏', '‡¶´'],
      '‡¶ß' : ['‡¶¶‡¶ú', '‡¶¶‡ßç‡¶ó', '‡¶∂' ,'‡¶´‡¶π'],
      '‡¶®' : ['‡¶¨','‡¶Æ'],
      '‡¶™' : ['‡¶ì',' ‡ßã'],
      '‡¶´' : ['‡¶¶','‡¶ó'],
      '‡¶¨' : ['‡¶≠','‡¶®'],
      '‡¶≠' : ['‚Äç‡¶¨','‡¶ö'],
      '‡¶Æ' : ['‡¶®'],
      '‡¶Ø' : ['‡¶π','‡¶ï'],
      '‡¶∞' : ['‡¶è',' ‡ßá','‡¶§'],
      '‡¶≤' : ['‡¶ï'],
      '‡¶∂' : ['‡¶∏‡¶ú','‡¶∏‡¶ó','‡¶Ü‡¶π','‡¶¢'],
      '‡¶∑' : ['‡¶∏‡¶ú','‡¶∏‡¶ó','‡¶Ü‡¶π','‡¶¢'],
      '‡¶∏' : ['‡¶Ü',' ‡¶æ','‡¶¶'],
      '‡¶π' : ['‡¶ó','‡¶Ø'],
      '‡ßü' : ['‡¶§','‡¶â','‡ßÅ'],
      '‡ßú' : ['‡¶è',' ‡ßá','‡¶§'],
      '‡ßù' : ['‡¶è',' ‡ßá','‡¶§'],
      '‡ßé' : ['‡¶∞'],
      '‡¶Ç' : ['‡¶¨','‡¶Æ'],
      '‡¶É' : ['‡¶ó','‡¶Ø'],
      '‡¶Ö' : ['‡¶™',' ‡¶ø','‡¶á'],
      '‡¶Ü' : ['‡¶∏'],
      '‡¶á' : ['‡¶â','‡¶Ö'],
      '‡¶à' : ['‡¶â','‡¶Ö'],
      '‡¶â' : ['‡¶á',' ‡¶ø'],
      '‡¶ä' : ['‡¶á',' ‡¶ø'],
      '‡¶ã' : [''],
      '‡¶è' : ['‡¶ì','‡¶∞'],
      '‡¶ê' : [''],
      '‡¶ì' : ['‡¶™','‡¶á'],
      '‡¶î' : [''],
      '‡¶æ' : ['‡¶∏'],
      '‡¶ø' : ['‡¶â','‡¶Ö'],
      '‡ßã' : ['‡¶™',' ‡¶ø'],
      '‡ßå' : [''],
      '‡ßá' : ['‡ßã','‡¶∞'],
      '‡ßà' : ['']
    }

In [22]:
SameClusterDict = {
    '‡¶Ö' : ['‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡¶Ü' : ['‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ‡¶Å','‡¶Ü‡¶É','‡¶ì‡ßü‡¶æ','‡¶è'],
    '‡¶á':['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],
    '‡¶à':['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],
    '‡¶â' : ['‡¶â''‡¶â‡¶Å','‡¶ä','‡ßÅ'],
    '‡¶ä' : ['‡¶â''‡¶â‡¶Å','‡¶ä','‡ßÅ'],
    '‡¶ã' : ['‡¶∞‡¶ø','‡¶π‡ßç‡¶∞‡ßÄ','‡ßú‡¶ø','‡ßù‡¶ø','‡¶¨‡ßç‡¶∞‡¶á','‡ßÉ','‡¶π‡ßç‡¶∞'],
    '‡¶è' : ['‡¶è','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶Ü‡¶É'],
    '‡¶ê': ['‡¶Ö‡¶á','‡¶à','‡¶ì‡¶á' , '‡¶á', '‡¶Ö‡ßç‡¶Ø‡¶æ‡¶á'],
    '‡¶ì': ['‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡¶î': [ '‡¶ì','‡¶Ü','‡¶Ö‡¶â','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡¶ï': ['‡¶ï','‡¶ñ','‡¶ó', '‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ï‡ßã‡¶Å',' ‡¶ï‡ßã','‡¶ñ','‡¶ñ‡ßã','‡¶ñ‡¶ì','‡¶ï‡ßç‡¶∑‡¶ì','‡¶ï‡¶π‡ßã‡¶Å','‡¶ï‡¶π‡¶ì','‡¶ó‡ßã‡¶Å','‡¶ó‡ßü' ],
    '‡¶ñ': ['‡¶ï','‡¶ñ','‡¶ó', '‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ï‡ßã‡¶Å',' ‡¶ï‡ßã','‡¶ñ','‡¶ñ‡ßã','‡¶ñ‡¶ì','‡¶ï‡ßç‡¶∑‡¶ì','‡¶ï‡¶π‡ßã‡¶Å','‡¶ï‡¶π‡¶ì','‡¶ó‡ßã‡¶Å','‡¶ó‡ßü' ],
    '‡¶ó': ['‡¶ï','‡¶ñ','‡¶ó', '‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ï‡ßã‡¶Å',' ‡¶ï‡ßã','‡¶ñ','‡¶ñ‡ßã','‡¶ñ‡¶ì','‡¶ï‡ßç‡¶∑‡¶ì','‡¶ï‡¶π‡ßã‡¶Å','‡¶ï‡¶π‡¶ì','‡¶ó‡ßã‡¶Å','‡¶ó‡ßü' ],
    '‡¶ò ': ['‡¶ñ','‡¶ñ‡ßã','‡¶ñ‡¶ì','‡¶ï‡ßç‡¶∑‡¶ì','‡¶ï‡¶π‡ßã‡¶Å','‡¶ï‡¶π‡¶ì','‡¶ó‡ßã‡¶Å','‡¶ó‡ßü' ,'‡¶ò‡ßã ','‡¶ò‡¶ì'],
    '‡¶ô': ['‚óå‡¶Ç','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶Ö‡¶Ç'],
    '‡¶ö': ['‡¶ö','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ö‡ßã‡¶Å','‡¶ö‡¶ì','‡¶õ‡ßã‡¶Å','‡¶õ‡ßã','‡¶õ‡¶ì','‡¶õ'],
    '‡¶õ': ['‡¶ö','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶ö‡ßã‡¶Å','‡¶ö‡¶ì','‡¶õ‡ßã‡¶Å','‡¶õ‡ßã','‡¶õ‡¶ì','‡¶õ'],
    '‡¶ú': ['‡¶ú','‡¶Ø','‡¶ú‡ßã','‡¶Ø‡ßã','‡¶ù','‡¶ù‡¶ì'],
    '‡¶ù': ['‡¶ú','‡¶Ø','‡¶ú‡ßã','‡¶Ø‡ßã','‡¶ù','‡¶ù‡¶ì'],
    '‡¶û': ['‡¶®','‡¶ô','‡¶£','‡¶û','‡¶á','‡¶à','‡¶è','‡ßá','‡ßü','‡¶®‡¶ì' ],
    '‡¶ü' : ['‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü'],
    '‡¶†' : ['‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü'],
    '‡¶°' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶¢': ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É','‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶£': ['‡¶®','‡¶ô','‡¶£','‡¶û','‡¶á','‡¶à','‡¶è','‡ßá','‡ßü','‡¶®‡¶ì' ],
    '‡¶§' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶•': ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶¶' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶ß' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶®': ['‡¶®','‡¶ô','‡¶£','‡¶û','‡¶á','‡¶à','‡¶è','‡ßá','‡ßü','‡¶®‡¶ì' ],
    '‡¶™' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶´' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶¨' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶≠' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶Æ' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶Ø' : ['‡¶™','‡¶´','‡¶¨','‡¶≠','‡¶Æ','‡¶¨‡ßç‡¶Ø', '‡¶≠‡¶Å', '‡¶¨‡¶ì' ,'‡¶¨‡ßã','‡¶¨‡ßã‡¶Å','‡¶¨‡ßç‡¶Ø‡ßü','‡¶™‡¶Ö' ,'‡¶™‡ßã' ,'‡¶™‡ßü' ,'‡¶™‡ßã‡¶Å','‡¶≠‡¶Å' ,'‡¶≠‡¶Ö' ,'‡¶≠‡ßü','‡¶≠‡ßã‡¶Å','‡¶Æ‡ßü' ,'‡¶Æ‡ßã‡¶É'],
    '‡¶∞' : ['‡¶∞','‡ßú','‡ßù','‡¶∞‡¶ì','‡¶∞‡ßã‡¶Å','‡¶∞‡ßã','‡¶∞‡ßü','‡¶ã', '‡¶π‡¶∞'  ],
    '‡¶≤' : ['‡¶≤','‡¶≤‡ßü','‡¶≤‡ßã'],
    '‡¶∂' : ['‡¶∂','‡¶∑','‡¶∏','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶∂‡ßã','‡¶∏‡¶Ö','‡¶∂‡ßü'],
    '‡¶∑' : ['‡¶∂','‡¶∑','‡¶∏','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶∂‡ßã','‡¶∏‡¶Ö','‡¶∂‡ßü'],
    '‡¶∏' : ['‡¶∂','‡¶∑','‡¶∏','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶∂‡ßã','‡¶∏‡¶Ö','‡¶∂‡ßü'],
    '‚Äç‡¶π': ['‡¶π','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡ßú' : ['‡¶∞','‡ßú','‡ßù','‡¶∞‡¶ì','‡¶∞‡ßã‡¶Å','‡¶∞‡ßã','‡¶∞‡ßü','‡¶ã', '‡¶π‡¶∞'  ],
    '‚Äç‡ßù': ['‡¶∞','‡ßú','‡ßù','‡¶∞‡¶ì','‡¶∞‡ßã‡¶Å','‡¶∞‡ßã','‡¶∞‡ßü','‡¶ã', '‡¶π‡¶∞'  ],
    '‚Äç‡ßü ': ['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],

    '‡ßé' : ['‡¶¶','‡¶¶‡ßã','‡¶¶‡ßç‡¶Ø','‡¶°‡¶É' ,'‡¶¶‡ßç‡¶Ø‡ßã','‡¶¢','‡¶ß','‡¶•','‡¶§','‡¶§‡¶Å','‡¶§‡ßã‡¶Å','‡¶ü‡ßü','‡¶§‡ßã','‡¶ü','‡¶¢‡ßã','‡¶¶‡¶É' ],
    '‡¶Ç' : ['‚óå‡¶Ç','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É','‡¶Ö‡¶Ç'],
    '‡¶É': ['‡¶π','‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‚Äç‡¶Å' : [''],

    '‚Äç‡¶ø': ['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],
    '‡ßÄ' : ['‡¶á','‡¶à','‡¶è','‡ßá','‡ßü'],
    '‡ßá':['‡¶è','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶Ü‡¶É'],
    '‡ßà'  :['‡¶è','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶Ü‡¶É'],
    '‡ßã' : ['‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡ßá‡ßó':['‡¶ì','‡¶Ü','‡¶Ö‡ßç‡¶Ø‡¶æ','‡¶æ','‡¶ì‡¶Å','‡¶ì‡¶É'],
    '‡ßÉ' : ['‡¶∞‡¶ø','‡¶π‡ßç‡¶∞‡ßÄ','‡ßú‡¶ø','‡ßù‡¶ø','‡¶¨‡ßç‡¶∞‡¶á','‡ßÉ','‡¶π‡ßç‡¶∞'],
     '‚óå‡ßÇ'  : ['‡¶â''‡¶â‡¶Å','‡¶ä','‡ßÅ'],
     ' ‡ßÅ'  : ['‡¶â''‡¶â‡¶Å','‡¶ä','‡ßÅ']
}

In [28]:
def apply_error(string,error):
    random.seed(time.time()+200)
    errors = ['Cluster','Replace','Juktakkhor','Character','Deletion','Noerror']
    if error == "Cluster":
        if string in list(SameClusterDict.keys()):
            string = random.choice(SameClusterDict[string])
    if error == "Replace":
        if string in list(ReplaceDict.keys()):
            string = random.choice(ReplaceDict[string])
    if error == "Juktakkhor":
        string = random.choice(JuktakkhorList)
    if error == "Character":
        string = random.choice(character)
    if error == "Deletion":
        string = ""
    if error == "insertion":
        temp_error = random.choice(errors)
        string = string + apply_error(string,temp_error)
    if error == "Noerror":
        pass
    return string

In [29]:
def error_generator(word):
    errors = ['Cluster','Replace','Juktakkhor','Character','Deletion','Insertion','Noerror']
    string = []
    string[:0] = word
    random.seed(time.time())
    unlucky_pos = random.randint(0,int(len(word)*0.4))
    noise_index = random.sample(range(len(word)),unlucky_pos)
    for n in range(len(string)):
        if n in noise_index:
            error_t = random.random()
            random.seed(time.time()+100)
            error = random.choice(errors)
            string[n] = apply_error(string[n],error)
    return "".join(string)

In [30]:
sentence = df_prothom_alo.sentence.tolist()

In [31]:
df_custom_error_dataset = pd.DataFrame()
temp_dict = {}
i=0
for sen in tqdm(sentence):
    if len(sen) > 3:
        sen = sen.split()
        random.seed(time.time()+356)
        random_index = random.randint(0,len(sen)-1)
        masked_word = sen[random_index]
        if len(masked_word) > 3:
            error_word = error_generator(masked_word)
    masked_sen = sen[:]
    masked_sen[random_index] = '[MASK]'
    temp_dict[i] = {'sentence': " ".join(sen), "masked_sen": " ".join(masked_sen), "error_word":error_word,"correct_word":masked_word,'similarity' : lav_distance(error_word,masked_word)}
    i=i+1
    #df_custom_error_dataset = df_custom_error_dataset.append(temp_dict,ignore_index=True)

  0%|          | 0/6851458 [00:00<?, ?it/s]

In [32]:
df_custom_error_dataset = df_custom_error_dataset.from_dict(temp_dict,"index")

In [36]:
df_custom_error_dataset

Unnamed: 0,sentence,masked_sen,error_word,correct_word,similarity
0,‡¶™‡¶∂‡ßç‡¶ö‡¶ø‡¶Æ‡¶¨‡¶ô‡ßç‡¶ó‡ßá‡¶∞ ‡¶¨‡¶∞‡¶ñ‡¶æ‡¶∏‡ßç‡¶§ ‡¶π‡¶ì‡ßü‡¶æ ‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶™‡¶æ‡¶∞‡ßç‡¶• ‡¶ö‡¶ü‡ßç‡¶ü‡ßã‡¶™...,[MASK] ‡¶¨‡¶∞‡¶ñ‡¶æ‡¶∏‡ßç‡¶§ ‡¶π‡¶ì‡ßü‡¶æ ‡¶Æ‡¶®‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶™‡¶æ‡¶∞‡ßç‡¶• ‡¶ö‡¶ü‡ßç‡¶ü‡ßã‡¶™‡¶æ‡¶ß‡ßç‡¶Ø‡¶æ‡ßü...,‡¶™‡¶∂‡ßç‡¶ö‡¶ø‡¶Æ‡¶¨‡¶ô‡ßç‡¶ó‡ßá‡¶∞,‡¶™‡¶∂‡ßç‡¶ö‡¶ø‡¶Æ‡¶¨‡¶ô‡ßç‡¶ó‡ßá‡¶∞,0
1,‡¶Ü‡¶∞ ‡¶ú‡¶æ‡¶Æ‡¶ø‡¶®‡ßá‡¶∞ ‡¶Ü‡¶¨‡ßá‡¶¶‡¶® ‡¶ï‡¶∞‡¶≤‡ßá‡¶® ‡¶®‡¶æ ‡¶Ö‡¶∞‡ßç‡¶™‡¶ø‡¶§‡¶æ,‡¶Ü‡¶∞ ‡¶ú‡¶æ‡¶Æ‡¶ø‡¶®‡ßá‡¶∞ [MASK] ‡¶ï‡¶∞‡¶≤‡ßá‡¶® ‡¶®‡¶æ ‡¶Ö‡¶∞‡ßç‡¶™‡¶ø‡¶§‡¶æ,‡¶∏‡¶¨‡ßá‡¶¶‡¶®,‡¶Ü‡¶¨‡ßá‡¶¶‡¶®,20
2,‡¶∏‡ßá‡¶ñ‡¶æ‡¶®‡ßá ‡¶™‡¶æ‡¶∞‡ßç‡¶•‡¶∞ ‡¶Ü‡¶á‡¶®‡¶ú‡ßÄ‡¶¨‡ßÄ ‡¶ú‡¶æ‡¶Æ‡¶ø‡¶®‡ßá‡¶∞ ‡¶Ü‡¶¨‡ßá‡¶¶‡¶® ‡¶ú‡¶æ‡¶®‡¶æ‡¶≤‡ßá‡¶ì ‡¶Ö‡¶∞...,‡¶∏‡ßá‡¶ñ‡¶æ‡¶®‡ßá ‡¶™‡¶æ‡¶∞‡ßç‡¶•‡¶∞ ‡¶Ü‡¶á‡¶®‡¶ú‡ßÄ‡¶¨‡ßÄ ‡¶ú‡¶æ‡¶Æ‡¶ø‡¶®‡ßá‡¶∞ ‡¶Ü‡¶¨‡ßá‡¶¶‡¶® ‡¶ú‡¶æ‡¶®‡¶æ‡¶≤‡ßá‡¶ì ‡¶Ö‡¶∞...,‡¶∏‡¶¨‡ßá‡¶¶‡¶®,‡¶®‡¶æ,100
3,‡¶Ö‡¶∞‡ßç‡¶™‡¶ø‡¶§‡¶æ ‡¶Æ‡ßÅ‡¶ñ‡¶æ‡¶∞‡ßç‡¶ú‡¶ø ‡¶ï‡¶æ‡¶∞‡¶æ‡¶ó‡¶æ‡¶∞‡ßá‡¶á,‡¶Ö‡¶∞‡ßç‡¶™‡¶ø‡¶§‡¶æ ‡¶Æ‡ßÅ‡¶ñ‡¶æ‡¶∞‡ßç‡¶ú‡¶ø [MASK],‡¶ï‡¶æ‡¶∞‡¶æ‡¶ó ‡¶ï‡ßç‡¶∑‡ßç‡¶Æ ‡¶∞‡¶è‡¶á,‡¶ï‡¶æ‡¶∞‡¶æ‡¶ó‡¶æ‡¶∞‡ßá‡¶á,53
4,‡¶§‡¶æ‡¶Å‡¶∞ ‡¶ú‡ßÄ‡¶¨‡¶® ‡¶∏‡¶Ç‡¶∂‡ßü‡ßá‡¶∞ ‡¶Ü‡¶∂‡¶ô‡ßç‡¶ï‡¶æ ‡¶∞‡ßü‡ßá‡¶õ‡ßá,‡¶§‡¶æ‡¶Å‡¶∞ [MASK] ‡¶∏‡¶Ç‡¶∂‡ßü‡ßá‡¶∞ ‡¶Ü‡¶∂‡¶ô‡ßç‡¶ï‡¶æ ‡¶∞‡ßü‡ßá‡¶õ‡ßá,‡¶ú‡ßÄ ‡¶®‡ßç‡¶ß ‡¶®,‡¶ú‡ßÄ‡¶¨‡¶®,62
...,...,...,...,...,...
6851453,‡¶π‡¶§‡ßç‡¶Ø‡¶æ ‡¶ì ‡¶ó‡¶£‡¶π‡¶§‡ßç‡¶Ø‡¶æ‡¶∏‡¶π ‡¶Æ‡¶æ‡¶®‡¶¨‡¶§‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ö‡¶™‡¶∞‡¶æ‡¶ß‡ßá‡¶∞ ‡¶™‡¶æ‡¶Å‡¶ö‡¶ü‡¶ø ...,‡¶π‡¶§‡ßç‡¶Ø‡¶æ ‡¶ì ‡¶ó‡¶£‡¶π‡¶§‡ßç‡¶Ø‡¶æ‡¶∏‡¶π ‡¶Æ‡¶æ‡¶®‡¶¨‡¶§‡¶æ‡¶¨‡¶ø‡¶∞‡ßã‡¶ß‡ßÄ ‡¶Ö‡¶™‡¶∞‡¶æ‡¶ß‡ßá‡¶∞ ‡¶™‡¶æ‡¶Å‡¶ö‡¶ü‡¶ø ...,‡¶Ü‡¶≤‡ßã‡¶Ø‡ßá,‡¶Ü‡¶®‡¶æ,80
6851454,‡¶™‡¶æ‡¶Å‡¶ö‡¶ü‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó‡¶á ‡¶ü‡ßç‡¶∞‡¶æ‡¶á‡¶¨‡ßç‡¶Ø‡ßÅ‡¶®‡¶æ‡¶≤‡ßá ‡¶™‡ßç‡¶∞‡¶Æ‡¶æ‡¶£‡¶ø‡¶§ ‡¶π‡ßü‡ßá‡¶õ‡ßá,[MASK] ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó‡¶á ‡¶ü‡ßç‡¶∞‡¶æ‡¶á‡¶¨‡ßç‡¶Ø‡ßÅ‡¶®‡¶æ‡¶≤‡ßá ‡¶™‡ßç‡¶∞‡¶Æ‡¶æ‡¶£‡¶ø‡¶§ ‡¶π‡ßü‡ßá‡¶õ‡ßá,‡¶™‡¶Å‡¶ö‡¶ü‡¶ø,‡¶™‡¶æ‡¶Å‡¶ö‡¶ü‡¶ø,16
6851455,‡¶ö‡¶æ‡¶∞‡¶ü‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó‡ßá ‡¶§‡¶æ‡¶Å‡¶ï‡ßá ‡¶Æ‡ßÉ‡¶§‡ßç‡¶Ø‡ßÅ‡¶¶‡¶£‡ßç‡¶° ‡¶¶‡ßá‡¶ì‡ßü‡¶æ ‡¶π‡ßü,‡¶ö‡¶æ‡¶∞‡¶ü‡¶ø ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó‡ßá ‡¶§‡¶æ‡¶Å‡¶ï‡ßá [MASK] ‡¶¶‡ßá‡¶ì‡ßü‡¶æ ‡¶π‡ßü,‡¶Æ‡ßÉ‡¶§‡ßç‡¶Ø‡ßÅ‡¶¶‡¶£‡ßç‡¶∏,‡¶Æ‡ßÉ‡¶§‡ßç‡¶Ø‡ßÅ‡¶¶‡¶£‡ßç‡¶°,10
6851456,‡¶è‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶ó‡ßç‡¶∞‡ßÅ‡¶™‡¶ï‡ßá ‡¶ó‡¶£‡¶π‡¶§‡ßç‡¶Ø‡¶æ‡¶∞ ‡¶¶‡¶æ‡ßü‡ßá‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü‡¶ü...,‡¶è‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶∞‡¶æ‡¶ú‡¶®‡ßà‡¶§‡¶ø‡¶ï ‡¶ó‡ßç‡¶∞‡ßÅ‡¶™‡¶ï‡ßá ‡¶ó‡¶£‡¶π‡¶§‡ßç‡¶Ø‡¶æ‡¶∞ ‡¶¶‡¶æ‡ßü‡ßá‡¶∞ ‡¶¨‡¶ø‡¶∑‡ßü‡¶ü...,‡¶∞ ‡¶Ø‡ßç‡¶Ø ‡ßá‡¶õ‡ßá,‡¶∞‡ßü‡ßá‡¶õ‡ßá,55


In [37]:
df_custom_error_dataset.rename(columns = {'similarity':'lavenshtein_distance'}, inplace = True)

In [42]:
df_custom_error_dataset = df_custom_error_dataset[(df_custom_error_dataset.lavenshtein_distance > 10 ) & (df_custom_error_dataset.lavenshtein_distance < 90 )]

In [43]:
df_custom_error_dataset.to_csv('data/train/custom_error_dataset.csv',index=False)