In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file below the Kaggle input directory, printing one full path per line.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/eng-hin/hin_valid.csv
/kaggle/input/eng-hin/hin_test.csv
/kaggle/input/eng-hin/hin_train.csv


In [2]:
# import cell
import torch
from torch.nn.utils.rnn import pad_sequence

In [3]:
# Reserved marker characters: start-of-sequence, end-of-sequence, padding, unknown.
SOS_token, EOS_token, PAD_token, UNK_token = "@", "#", "^", "$"

# Vocabulary indices of the reserved markers, listed in the same order as above.
SOS_idx, EOS_idx, PAD_idx, UNK_idx = 0, 1, 2, 3

class Script:
    """Character-level vocabulary for one script.

    Keeps a two-way mapping between characters and integer indices, plus a
    count of how many times each character has been added. The four reserved
    tokens (SOS, EOS, PAD, UNK) are registered up front, so characters seen
    in the data receive indices starting after them.

    Attributes:
        name: Label given to this vocabulary by the caller.
        char2index: dict mapping character -> index.
        index2char: dict mapping index -> character.
        char2count: dict mapping character -> number of times it was added.
        n_chars: Number of distinct entries, reserved tokens included.
    """

    def __init__(self, name):
        """Create a vocabulary that contains only the reserved tokens."""
        self.name = name
        self.char2index = {SOS_token: SOS_idx, EOS_token: EOS_idx, PAD_token: PAD_idx, UNK_token: UNK_idx}
        self.char2count = {}
        self.index2char = {SOS_idx: SOS_token, EOS_idx: EOS_token, PAD_idx: PAD_token, UNK_idx: UNK_token}
        self.n_chars = len(self.char2index)  # Count SOS, EOS, PAD and UNK

    def addWord(self, word):
        """Register every character of ``word`` (any iterable of characters)."""
        for char in word:
            self.addChar(char)

    def addChar(self, char):
        """Register one character, assigning the next free index if it is new.

        A character that is already known (including a reserved token that
        happens to occur in the data) keeps its existing index; only its
        count is increased.
        """
        if char not in self.char2index:
            self.char2index[char] = self.n_chars
            self.index2char[self.n_chars] = char
            self.n_chars += 1
        # Reserved tokens are present in char2index but start without a count
        # entry, so a plain `+= 1` would raise KeyError for them.
        self.char2count[char] = self.char2count.get(char, 0) + 1

In [4]:
def prepareVocab(data, in_scr="lat", out_scr="dev"):
    """Build the input and output character vocabularies from word pairs.

    Args:
        data: Object whose ``.values`` yields rows; element 0 of each row is
            added to the input vocabulary and element 1 to the output one.
        in_scr: Name passed to the input ``Script``.
        out_scr: Name passed to the output ``Script``.

    Returns:
        Tuple ``(input_vocab, output_vocab)`` of populated ``Script`` objects.
    """
    src_vocab, tgt_vocab = Script(in_scr), Script(out_scr)

    for row in data.values:
        src_vocab.addWord(row[0])
        tgt_vocab.addWord(row[1])

    return src_vocab, tgt_vocab

In [5]:
def tensorFromWord(vocab, word, sos=True, eos=True):
    """Encode a word as a 1-D integer tensor of vocabulary indices.

    Args:
        vocab: Object exposing a ``char2index`` mapping.
        word: Iterable of characters to encode.
        sos: If true, put the SOS index in front of the characters.
        eos: If true, put the EOS index after the characters.

    Returns:
        ``torch.int`` tensor holding the indices; characters missing from
        the vocabulary are encoded with the UNK index.
    """
    lookup = vocab.char2index
    indices = [lookup[SOS_token]] if sos else []
    # Unknown characters fall back to the UNK entry.
    indices.extend(lookup[ch] if ch in lookup else lookup[UNK_token] for ch in word)
    if eos:
        indices.append(lookup[EOS_token])
    return torch.tensor(indices, dtype=torch.int)

In [6]:
def processData(data, in_vocab, out_vocab, sos=True, eos=True):
    """Encode every word pair in ``data`` and pad each side to a common length.

    Args:
        data: Object whose ``.values`` yields rows; element 0 is the input
            word and element 1 the output word.
        in_vocab: Vocabulary used to encode input words.
        out_vocab: Vocabulary used to encode output words.
        sos: Forwarded to ``tensorFromWord`` (prepend the SOS index).
        eos: Forwarded to ``tensorFromWord`` (append the EOS index).

    Returns:
        Tuple ``(in_tensor_pad, out_tensor_pad)``. Each is the result of
        ``pad_sequence`` with ``batch_first=False``, filled with ``PAD_idx``.
    """
    src_tensors, tgt_tensors = [], []
    for row in data.values:
        src_tensors.append(tensorFromWord(in_vocab, row[0], sos, eos))
        tgt_tensors.append(tensorFromWord(out_vocab, row[1], sos, eos))

    # Each side is padded independently, so the two results may differ in length.
    src_padded = pad_sequence(src_tensors, batch_first=False, padding_value=PAD_idx)
    tgt_padded = pad_sequence(tgt_tensors, batch_first=False, padding_value=PAD_idx)
    return src_padded, tgt_padded

In [7]:
# load dataset
# Every cell is a word, so read all columns as plain strings and switch off
# pandas' default NA parsing: otherwise words such as "nan", "null" or "NA"
# become float NaN and break the character-by-character vocabulary code.
train_data = pd.read_csv('/kaggle/input/eng-hin/hin_train.csv', sep=',', header=None,
                         dtype=str, keep_default_na=False)
test_data = pd.read_csv('/kaggle/input/eng-hin/hin_test.csv', sep=',', header=None,
                        dtype=str, keep_default_na=False)
valid_data = pd.read_csv('/kaggle/input/eng-hin/hin_valid.csv', sep=',', header=None,
                         dtype=str, keep_default_na=False)

In [8]:
# Build both character vocabularies from the training split, naming the scripts explicitly.
x_vocab, y_vocab = prepareVocab(train_data, in_scr="lat", out_scr="dev")

In [9]:
# Encode all three splits with the vocabularies built from the training data.
x_train, y_train = processData(train_data, x_vocab, y_vocab, sos=True, eos=True)
x_test, y_test = processData(test_data, x_vocab, y_vocab, sos=True, eos=True)
x_valid, y_valid = processData(valid_data, x_vocab, y_vocab, sos=True, eos=True)

In [10]:
# Sizes of the padded input tensors for train, test and validation, one per line.
print(x_train.shape, x_test.shape, x_valid.shape, sep="\n")

torch.Size([26, 51200])
torch.Size([28, 4096])
torch.Size([24, 4096])


In [11]:
# Sizes of the padded output tensors for train, test and validation, one per line.
print(y_train.shape, y_test.shape, y_valid.shape, sep="\n")

torch.Size([22, 51200])
torch.Size([22, 4096])
torch.Size([22, 4096])


In [12]:
# print first 5 words from train dataset
for i in range(5):
    # Column i holds the encoded characters of the i-th word pair.
    x_tensor = x_train[:, i]
    y_tensor = y_train[:, i]

    # Keep UNK and real characters; SOS, EOS and PAD all have indices below UNK_idx.
    # Converting to a list once gives plain Python ints usable as dict keys.
    x_word = "".join(x_vocab.index2char[idx] for idx in x_tensor.tolist() if idx >= UNK_idx)
    y_word = "".join(y_vocab.index2char[idx] for idx in y_tensor.tolist() if idx >= UNK_idx)

    print(x_word, y_word)

shastragaar शस्त्रागार
bindhya बिन्द्या
kirankant किरणकांत
yagyopaveet यज्ञोपवीत
ratania रटानिया


In [13]:
# print 5 words from test dataset
for i in range(5):
    # Look at columns 210..214 of the test tensors.
    x_tensor = x_test[:, i + 210]
    y_tensor = y_test[:, i + 210]

    # Decode back to text, skipping every index below UNK_idx (SOS, EOS, PAD).
    x_chars = [x_vocab.index2char[k] for k in x_tensor.tolist() if k >= UNK_idx]
    y_chars = [y_vocab.index2char[k] for k in y_tensor.tolist() if k >= UNK_idx]
    x_word = "".join(x_chars)
    y_word = "".join(y_chars)

    print(x_word, y_word)

chimate चिमाते
kikiyana किकियाना
rocket र$केट
maxwell मैक्सवेल
hippo हिप्पो
