**install required dependencies**

In [1]:
pip install nepalitokenizers

Collecting nepalitokenizers
  Obtaining dependency information for nepalitokenizers from https://files.pythonhosted.org/packages/e3/ee/9b52ba391a9b3b74760adb93c855a17b1df0f4a11e5eb3c88e6378a6f97d/nepalitokenizers-0.0.2-py3-none-any.whl.metadata
  Downloading nepalitokenizers-0.0.2-py3-none-any.whl.metadata (8.8 kB)
Downloading nepalitokenizers-0.0.2-py3-none-any.whl (678 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m678.2/678.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nepalitokenizers
Successfully installed nepalitokenizers-0.0.2
Note: you may need to restart the kernel to use updated packages.


**Import required libraries**

In [2]:
from nepalitokenizers import WordPiece
import string
import numpy as np
import pandas as pd
import random
import tqdm
from tqdm import tnrange
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import spacy

**check for cuda cores GPU**

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

**Loading processed dataset for sample model**

In [4]:
path = "/kaggle/input/machine/Book1.csv"
df = pd.read_csv(path)

In [5]:
df.head()


Unnamed: 0,nepali,english
0,म पनि जान्छु है त अहिले लाई ।,I will also go now.
1,थाहा छैन ।,don't know
2,म खाना पकाउँदै छु ।,I am cooking.
3,तिमी हराएर हो नि भाई ।,You are lost brother.
4,मैले बाँसुरी ।,I blew the flute.


In [6]:
df.describe()

Unnamed: 0,nepali,english
count,171,171
unique,171,169
top,म पनि जान्छु है त अहिले लाई ।,It's fun.
freq,1,2


In [7]:
df

Unnamed: 0,nepali,english
0,म पनि जान्छु है त अहिले लाई ।,I will also go now.
1,थाहा छैन ।,don't know
2,म खाना पकाउँदै छु ।,I am cooking.
3,तिमी हराएर हो नि भाई ।,You are lost brother.
4,मैले बाँसुरी ।,I blew the flute.
...,...,...
166,राणाशाहीले नेपाललाई बिर्ता र मौजामा बाँडे ।,Ranashahi divided Nepal into Birta and Mauza.
167,के छ खबर साथीहरु?,What's up friends?
168,एक हातले तालि बज्दैन।,One hand does not clap.
169,अाज एक्कासि रुघा लाग्न थाल्यो ।,Today I suddenly started getting a cold.


In [8]:
df['nepali'] = df['nepali'].apply(lambda x :'<SOS>' + x + '<EOS>')

In [9]:
df

Unnamed: 0,nepali,english
0,<SOS>म पनि जान्छु है त अहिले लाई ।<EOS>,I will also go now.
1,<SOS>थाहा छैन ।<EOS>,don't know
2,<SOS>म खाना पकाउँदै छु ।<EOS>,I am cooking.
3,<SOS>तिमी हराएर हो नि भाई ।<EOS>,You are lost brother.
4,<SOS>मैले बाँसुरी ।<EOS>,I blew the flute.
...,...,...
166,<SOS>राणाशाहीले नेपाललाई बिर्ता र मौजामा बाँडे...,Ranashahi divided Nepal into Birta and Mauza.
167,<SOS>के छ खबर साथीहरु?<EOS>,What's up friends?
168,<SOS>एक हातले तालि बज्दैन।<EOS>,One hand does not clap.
169,<SOS>अाज एक्कासि रुघा लाग्न थाल्यो ।<EOS>,Today I suddenly started getting a cold.


In [10]:
df['english'] = df['english'].apply(lambda x :'<SOS>' + x + '<EOS>')

In [11]:
df

Unnamed: 0,nepali,english
0,<SOS>म पनि जान्छु है त अहिले लाई ।<EOS>,<SOS>I will also go now.<EOS>
1,<SOS>थाहा छैन ।<EOS>,<SOS>don't know<EOS>
2,<SOS>म खाना पकाउँदै छु ।<EOS>,<SOS>I am cooking.<EOS>
3,<SOS>तिमी हराएर हो नि भाई ।<EOS>,<SOS>You are lost brother.<EOS>
4,<SOS>मैले बाँसुरी ।<EOS>,<SOS>I blew the flute.<EOS>
...,...,...
166,<SOS>राणाशाहीले नेपाललाई बिर्ता र मौजामा बाँडे...,<SOS>Ranashahi divided Nepal into Birta and Ma...
167,<SOS>के छ खबर साथीहरु?<EOS>,<SOS>What's up friends?<EOS>
168,<SOS>एक हातले तालि बज्दैन।<EOS>,<SOS>One hand does not clap.<EOS>
169,<SOS>अाज एक्कासि रुघा लाग्न थाल्यो ।<EOS>,<SOS>Today I suddenly started getting a cold.<...


In [12]:
all_eng_words=set()
for eng in df['english']:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

all_nepali_words=set()
for hin in df['nepali']:
    for word in hin.split():
        if word not in all_nepali_words:
            all_nepali_words.add(word)

            
print('no of english words: ', len(all_eng_words))
print('no of nepali words: ', len(all_nepali_words))            

no of english words:  561
no of nepali words:  630


In [13]:
# not necessary for now
# getting maximum sentence length of english sentences
length_list = []
for l in df.english:
    length_list.append(len(l.split(' ')))

max_input_length = np.max(length_list)
print('max_output_length: ', max_input_length)

max_output_length:  17


In [14]:
# getting maximum sentence length of nepali sentences
length_list = []
for l in df.nepali:
    length_list.append(len(l.split(' ')))

max_input_length = np.max(length_list)
print('max_input_length: ', max_input_length)

max_input_length:  13


In [15]:
df

Unnamed: 0,nepali,english
0,<SOS>म पनि जान्छु है त अहिले लाई ।<EOS>,<SOS>I will also go now.<EOS>
1,<SOS>थाहा छैन ।<EOS>,<SOS>don't know<EOS>
2,<SOS>म खाना पकाउँदै छु ।<EOS>,<SOS>I am cooking.<EOS>
3,<SOS>तिमी हराएर हो नि भाई ।<EOS>,<SOS>You are lost brother.<EOS>
4,<SOS>मैले बाँसुरी ।<EOS>,<SOS>I blew the flute.<EOS>
...,...,...
166,<SOS>राणाशाहीले नेपाललाई बिर्ता र मौजामा बाँडे...,<SOS>Ranashahi divided Nepal into Birta and Ma...
167,<SOS>के छ खबर साथीहरु?<EOS>,<SOS>What's up friends?<EOS>
168,<SOS>एक हातले तालि बज्दैन।<EOS>,<SOS>One hand does not clap.<EOS>
169,<SOS>अाज एक्कासि रुघा लाग्न थाल्यो ।<EOS>,<SOS>Today I suddenly started getting a cold.<...


In [16]:
# Step 4: Initialize the WordPiece tokenizer
tokenizer_wp = WordPiece()

# Step 5: Define a function to tokenize a text
def tokenize_text(text):
    encoded_text = tokenizer_wp.encode(text)
    return (encoded_text.tokens, encoded_text.ids)

# Step 6: Apply the tokenize_text function to the Nepali text column of the DataFrame
# This will create a new DataFrame column where each row contains a tuple of (tokens, token_ids)
df['nepali_tokenization'] = df['nepali'].apply(tokenize_text)

# Step 7: Split the tuples into separate columns
df['nepali_tokens'], df['token_ids'] = zip(*df['nepali_tokenization'])

# Step 8: Output the DataFrame with the tokenized text and token IDs
print(df[['nepali', 'nepali_tokens', 'token_ids']])


                                                nepali  \
0              <SOS>म पनि जान्छु है त अहिले लाई ।<EOS>   
1                                 <SOS>थाहा छैन ।<EOS>   
2                        <SOS>म खाना पकाउँदै छु ।<EOS>   
3                     <SOS>तिमी हराएर हो नि भाई ।<EOS>   
4                             <SOS>मैले बाँसुरी ।<EOS>   
..                                                 ...   
166  <SOS>राणाशाहीले नेपाललाई बिर्ता र मौजामा बाँडे...   
167                        <SOS>के छ खबर साथीहरु?<EOS>   
168                    <SOS>एक हातले तालि बज्दैन।<EOS>   
169          <SOS>अाज एक्कासि रुघा लाग्न थाल्यो ।<EOS>   
170                  <SOS>अब केहि दिनमा गर्नेछु ।<EOS>   

                                         nepali_tokens  \
0    [[CLS], <, so, ##s, >, म, पनि, जान्छु, है, त, ...   
1    [[CLS], <, so, ##s, >, थाहा, छैन, ।, ##<, e, #...   
2    [[CLS], <, so, ##s, >, म, खाना, पका, ##उँदै, छ...   
3    [[CLS], <, so, ##s, >, तिमी, हराएर, हो, नि, भा...   
4    [[CLS], 

In [17]:
# tokenizer for english
nlp = spacy.load("en_core_web_sm")

# Step 5: Tokenize the English text column (assuming the column name is 'english')
# This will create a new column where each row contains a spaCy Doc object with tokenized text
def tokenize_text(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    token_ids = [token.i for token in doc]
    return tokens, token_ids

# Step 6: Apply the tokenize_text function to the English text column of the DataFrame
# Assuming the column containing English text is named 'english'
df['english_tokens'], df['token_ids'] = zip(*df['english'].apply(tokenize_text))

# Step 7: Output the DataFrame with the tokenized text and token IDs
print(df[['english', 'english_tokens', 'token_ids']])

                                               english  \
0                        <SOS>I will also go now.<EOS>   
1                                 <SOS>don't know<EOS>   
2                              <SOS>I am cooking.<EOS>   
3                      <SOS>You are lost brother.<EOS>   
4                          <SOS>I blew the flute.<EOS>   
..                                                 ...   
166  <SOS>Ranashahi divided Nepal into Birta and Ma...   
167                       <SOS>What's up friends?<EOS>   
168                  <SOS>One hand does not clap.<EOS>   
169  <SOS>Today I suddenly started getting a cold.<...   
170              <SOS>I will do it in a few days.<EOS>   

                                        english_tokens  \
0          [<, SOS, >, I, will, also, go, now.<EOS, >]   
1                  [<, SOS, >, don't, know, <, EOS, >]   
2                  [<, SOS, >, I, am, cooking.<EOS, >]   
3         [<, SOS, >, You, are, lost, brother.<EOS, >]   
4            