# Install Necessary Libraries and Fetch the Corpus

In [1]:
%%capture

# Install libraries
!pip install -q gdown
!pip install --upgrade transformers[torch]
!pip install datasets

# Fetch the folder
!gdown "https://drive.google.com/drive/folders/1oIT7DZhd4uXTpjgBeRGSP-Fs-1Ux3m-b?usp=sharing" --folder

# Import Necessary Libraries

In [2]:
from tokenizers import Tokenizer, models, trainers, processors, pre_tokenizers, decoders
from transformers import AutoTokenizer
import pandas as pd
from tqdm import tqdm
import random
import numpy as np
import torch

# Load the Custom Corpus

In [3]:
test = pd.read_csv("/content/BanglaParaphraseBUETNLP/test.csv")
train = pd.read_csv("/content/BanglaParaphraseBUETNLP/train.csv")
valid = pd.read_csv("/content/BanglaParaphraseBUETNLP/valid.csv")

df = pd.concat([test, train, valid], ignore_index=True)
df.reset_index(drop=True, inplace=True)

In [4]:
source_texts = df['source'].tolist()
target_texts = df['target'].tolist()
all_texts = source_texts + target_texts
# all_texts = all_texts[:500000]
all_texts[:3]

['কিছুদিন আগে প্যারিস থেকে ঘুরে এসেছি।',
 'ভাড়া করে ফেললেন কার্নেগি হলের মতো অত্যন্ত অভিজাত অডিটোরিয়াম, যেখানে হাজার হাজার মানুষ একসাথে বসে পারফর্মেন্স দেখতে পারে।',
 'সম্পূর্ণ নিয়ন্ত্রণ হারিয়ে জাহাজ পড়লো ঘোর সমুদ্রে।']

In [5]:
# all_characters = []
# for sent in tqdm(all_texts):
#     for char in sent:
#         if char not in all_characters:
#             all_characters.append(char)

In [6]:
# all_characters_count = {}

# for char in all_characters:
#     all_characters_count[char] = 0

# for sent in tqdm(all_texts):
#     for char in sent:
#         all_characters_count[char] += 1

In [7]:
# df_char = pd.DataFrame({
#     "character": list(all_characters_count.keys()),
#     "count": list(all_characters_count.values())
# })

# df_char
# # df_char.loc[df_char['count'] > 100]

# List of Bangla Characters

In [8]:
# sorted(df_char.loc[df_char['count'] > 500]['character'].to_list())

all_considered_characters = [
    ' ',  'ঁ',  'ং',  'ঃ',  'অ',  'আ',  'ই',  'ঈ',  'উ',  'ঊ',  'ঋ',  'এ',  'ঐ',  'ও',  'ঔ',
    'ক',  'খ',  'গ',  'ঘ',  'ঙ',  'চ',  'ছ',  'জ',  'ঝ',  'ঞ',  'ট',  'ঠ',  'ড',  'ঢ',  'ণ',  'ত',
    'থ',  'দ',  'ধ',  'ন',  'প',  'ফ',  'ব',  'ভ',  'ম',  'য',  'র',  'ল',  'শ',  'ষ',  'স',  'হ',
    'ড়',   'ঢ়',   'য়',  '়',  'া',  'ি',  'ী',  'ু',  'ূ',  'ৃ',  'ে',  'ৈ',  'ো',  'ৌ',  '্',  'ৎ',
    '০',  '১',  '২',  '৩',  '৪',  '৫',  '৬',  '৭',  '৮',  '৯']

len(all_considered_characters)

73

# Character-level Preprocessing

In [9]:
all_cleaned_sentences = []

for sent in tqdm(all_texts):
    cleaned_sent = ""
    for char in sent:
        if char in all_considered_characters:
            cleaned_sent += str(char)
    all_cleaned_sentences.append(cleaned_sent)

100%|██████████| 933260/933260 [01:30<00:00, 10364.60it/s]


In [10]:
len(all_texts) == len(all_cleaned_sentences), len(all_cleaned_sentences)

(True, 933260)

# Existing Tokenizer from HuggingFace

In [11]:
pretrained_tokenizer_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name)

example_sent = all_cleaned_sentences[random.randint(0, len(all_cleaned_sentences)-1)]

print(example_sent)
print(tokenizer.tokenize(example_sent))
print(tokenizer.encode(example_sent))
print(tokenizer.decode(tokenizer.encode(example_sent)))

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

ধরুন আপনি একটা নতুন জামা কিনেছেন
['ধ', '##র', '##ন', 'আ', '##প', '##ন', '##ি', 'এ', '##ক', '##ট', '##া', 'ন', '##ত', '##ন', 'জ', '##া', '##ম', '##া', 'ক', '##ি', '##ন', '##ে', '##ছ', '##ে', '##ন']
[101, 1365, 29908, 29902, 1348, 29903, 29902, 29915, 1351, 29889, 29895, 29914, 1366, 29898, 29902, 1358, 29914, 29906, 29914, 1353, 29915, 29902, 29917, 29893, 29917, 29902, 102]
[CLS] ধরন আপনি একটা নতন জামা কিনেছেন [SEP]


# Finetune an Existing Tokenizer on Custom Corpus

In [12]:
pretrained_tokenizer_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name)

# Customize training parameters
vocab_size = 30000
min_frequency = 5

# Fine-tune the tokenizer on your custom dataset
tokenizer_finetuned = tokenizer.train_new_from_iterator(np.array(all_cleaned_sentences).reshape(-1, 1), vocab_size=vocab_size)

In [13]:
print(example_sent)
print(tokenizer_finetuned.tokenize(example_sent))
print(tokenizer_finetuned.encode(example_sent))
print(tokenizer_finetuned.decode(tokenizer_finetuned.encode(example_sent)))

ধরুন আপনি একটা নতুন জামা কিনেছেন
['ধরন', 'আপনি', 'একটা', 'নতন', 'জামা', 'কিনেছেন']
[2, 3569, 644, 391, 526, 6236, 27913, 3]
[CLS] ধরন আপনি একটা নতন জামা কিনেছেন [SEP]


# Make the Finetuned Tokenizer Publicly Avaliable Through HuggingFace Hub

In [15]:
%%capture
!apt install git-lfs

In [14]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [17]:
pretrained_tokenizer_name = "bert-base-uncased-finetuned-bangla-tokenizer"
tokenizer_finetuned.push_to_hub(pretrained_tokenizer_name)

CommitInfo(commit_url='https://huggingface.co/mehedihasanbijoy/bert-base-uncased-finetuned-bangla-tokenizer/commit/3f5edbbe272660b212aec2ce4436f5d6539e5aaf', commit_message='Upload tokenizer', commit_description='', oid='3f5edbbe272660b212aec2ce4436f5d6539e5aaf', pr_url=None, pr_revision=None, pr_num=None)

# Check the Pretrained Tokenizer by Loading from HuggingFace Hub

In [19]:
pretrained_tokenizer_name = "mehedihasanbijoy/bert-base-uncased-finetuned-bangla-tokenizer"
tokenizer = AutoTokenizer.from_pretrained(pretrained_tokenizer_name)

example_sent = all_cleaned_sentences[random.randint(0, len(all_cleaned_sentences)-1)]

print(example_sent)
print(tokenizer.tokenize(example_sent))
print(tokenizer.encode(example_sent))
print(tokenizer.decode(tokenizer.encode(example_sent)))

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/529k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

এরপর অন্যান্য দাসীরা বাকি দাসীদের কোলে নিয়ে নাচের তালে একই আনন্দ উপভোগ করতে শুরু করেছিল
['এরপর', 'অনযানয', 'দাসী', '##রা', 'বাকি', 'দাসী', '##দের', 'কোলে', 'নিযে', 'নাচের', 'তালে', 'একই', 'আননদ', 'উপভোগ', 'করতে', 'শর', 'করেছিল']
[2, 647, 854, 23427, 150, 1685, 23427, 151, 7830, 257, 12376, 5833, 737, 3131, 3829, 259, 268, 750, 3]
[CLS] এরপর অনযানয দাসীরা বাকি দাসীদের কোলে নিযে নাচের তালে একই আননদ উপভোগ করতে শর করেছিল [SEP]
