In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
import os

# Presumably the mini-batch size for training; it is not referenced in the
# cells visible here -- TODO confirm where it is consumed.
BATCH_SIZE = 4

# Select the CUDA device when PyTorch reports one, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
DATA_DIR = "aksharantar_sampled/hin/"

# os.listdir() returns entries in arbitrary order, so sort the listing to make
# the index -> split mapping below reproducible across machines and runs.
# NOTE(review): this assumes the sorted listing is [test, train, validation];
# confirm against the actual file names in DATA_DIR.
directories = sorted(os.listdir(DATA_DIR))


def _read_pairs(path):
    """Read one comma-separated split file into a list of two-item lists.

    Each non-empty line contributes ``[first_field, second_field]``. The
    trailing newline is stripped before splitting, so a final line without a
    newline keeps its last character and extra columns do not corrupt the
    second field. Rows are lists (not tuples) because later cells replace
    their items in place.

    Raises:
        IndexError: if a non-empty line has fewer than two fields.
    """
    pairs = []
    # The context manager closes the file even if parsing fails.
    with open(path, encoding='utf-8') as handle:
        for line in handle:
            stripped = line.rstrip("\n")
            if not stripped:
                # Tolerate blank lines (e.g. an empty line at end of file).
                continue
            fields = stripped.split(",")
            pairs.append([fields[0], fields[1]])
    return pairs


test_data = _read_pairs(os.path.join(DATA_DIR, directories[0]))
train_data = _read_pairs(os.path.join(DATA_DIR, directories[1]))
val_data = _read_pairs(os.path.join(DATA_DIR, directories[2]))

print(len(test_data))
print(len(train_data))
print(len(val_data))

def hindi_chars(hindi_word):
    """Return the characters of ``hindi_word`` after NFD decomposition,
    omitting every code point whose Unicode category is ``'Mn'``.

    NOTE(review): ``'Mn'`` (non-spacing mark) also covers some Devanagari
    dependent signs such as the virama (U+094D), so those are dropped as
    well -- confirm this is intended.
    """
    decomposed = unicodedata.normalize('NFD', hindi_word)

    kept = []
    for symbol in decomposed:
        if unicodedata.category(symbol) == 'Mn':
            continue
        kept.append(symbol)
    return kept

def english_chars(english_word):
    """Split ``english_word`` into a list holding each of its characters in order."""
    return list(english_word)

def one_hot(x, n):
    """Build a one-hot vector.

    Args:
        x: index (or any valid NumPy index) of the position(s) to set to 1.
        n: length of the vector, forwarded to ``np.zeros``.

    Returns:
        A float64 NumPy array of zeros with a 1 written at ``x``.
    """
    vector = np.zeros(n, dtype=np.float64)
    vector[x] = 1
    return vector

def generate_char_to_idx():
    """Build the base character -> index vocabulary.

    Indices are assigned consecutively from 0: first the lowercase ASCII
    letters, then the Devanagari code points U+0900..U+0965.

    Returns:
        dict mapping each single-character string to its integer index.
    """
    # (first, last) code points, both inclusive, in the order they get indices.
    code_point_ranges = (
        (0x0061, 0x007A),  # ASCII characters (lowercase only)
        (0x0900, 0x0965),  # Devanagari characters
    )

    alphabet = [
        chr(code_point)
        for first, last in code_point_ranges
        for code_point in range(first, last + 1)
    ]

    return {symbol: position for position, symbol in enumerate(alphabet)}

char_to_idx = generate_char_to_idx()
# Register the special tokens on the next three free indices, in this order.
# The range bounds are computed before update() starts inserting, so the
# growing dict does not shift the assigned indices.
char_to_idx.update(
    zip(('/start', '/end', '/pad'), range(len(char_to_idx), len(char_to_idx) + 3))
)

def word_to_idx(data):
    """Replace every word in ``data`` with its sequence of vocabulary indices.

    Each row of ``data`` is a two-item mutable sequence. Both items are
    converted character by character through the module-level ``char_to_idx``
    and wrapped with the ``'/start'`` and ``'/end'`` indices. Rows are
    rewritten in place and the same ``data`` object is returned.

    Raises:
        KeyError: if a character (or a special token) is not in ``char_to_idx``.
    """
    for row, (eng_word, hin_word) in enumerate(data):
        eng_body = [char_to_idx[symbol] for symbol in eng_word]
        hin_body = [char_to_idx[symbol] for symbol in hin_word]

        start, end = char_to_idx['/start'], char_to_idx['/end']

        data[row][0] = [start] + eng_body + [end]
        data[row][1] = [start] + hin_body + [end]

    return data

# Encode all three splits. word_to_idx rewrites the rows in place, so running
# this cell a second time on the same kernel would try to encode integers and
# fail with KeyError; reload the data first.
train_data, test_data, val_data = map(word_to_idx, (train_data, test_data, val_data))



4096
51200
4096


In [10]:
max_seq_length = 0

def set_max_seq(data):
    """Raise the module-level ``max_seq_length`` to the longest sequence in ``data``.

    ``data`` is an iterable of ``(seq_x, seq_y)`` pairs. The global is only
    ever increased, so calling this on several datasets leaves it at the
    overall maximum. Returns ``None``.
    """
    global max_seq_length

    for pair in data:
        first, second = pair
        longest = max(len(first), len(second))
        if longest > max_seq_length:
            max_seq_length = longest

def pad(data):
    """Right-pad every sequence in ``data`` in place up to ``max_seq_length``.

    Args:
        data: iterable of ``(seq_x, seq_y)`` pairs of mutable lists of indices.

    Returns:
        None. The lists inside ``data`` are extended in place with
        ``char_to_idx['/pad']``; sequences already at or beyond
        ``max_seq_length`` are left unchanged.
    """
    # No ``global`` statement is needed: ``max_seq_length`` and ``char_to_idx``
    # are only read here. (The previous declaration named a misspelled,
    # non-existent variable.)
    for seq_x, seq_y in data:
        for sequence in (seq_x, seq_y):
            missing = max_seq_length - len(sequence)
            if missing > 0:
                # Look the pad index up only when padding is actually required.
                sequence.extend([char_to_idx['/pad']] * missing)

# First pass: find the longest sequence over all three splits. This must
# finish before any padding so every split is padded to the same length.
for _split in (train_data, test_data, val_data):
    set_max_seq(_split)

# Second pass: pad every split up to that common length.
for _split in (train_data, test_data, val_data):
    pad(_split)


In [11]:
# Display the first item of the first training pair after encoding and padding.
# NOTE(review): the saved output below shows the '/pad' index (130) *before*
# the final tokens, which a pad() that appends at the end cannot produce --
# looks like stale output; re-run from a fresh kernel to refresh it.
train_data[0][0]

[128,
 18,
 7,
 0,
 18,
 19,
 17,
 0,
 6,
 0,
 0,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 130,
 17,
 129]

In [9]:
# Print the full character -> index vocabulary, including the '/start',
# '/end' and '/pad' entries added after generate_char_to_idx().
print(char_to_idx)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, 'ऀ': 26, 'ँ': 27, 'ं': 28, 'ः': 29, 'ऄ': 30, 'अ': 31, 'आ': 32, 'इ': 33, 'ई': 34, 'उ': 35, 'ऊ': 36, 'ऋ': 37, 'ऌ': 38, 'ऍ': 39, 'ऎ': 40, 'ए': 41, 'ऐ': 42, 'ऑ': 43, 'ऒ': 44, 'ओ': 45, 'औ': 46, 'क': 47, 'ख': 48, 'ग': 49, 'घ': 50, 'ङ': 51, 'च': 52, 'छ': 53, 'ज': 54, 'झ': 55, 'ञ': 56, 'ट': 57, 'ठ': 58, 'ड': 59, 'ढ': 60, 'ण': 61, 'त': 62, 'थ': 63, 'द': 64, 'ध': 65, 'न': 66, 'ऩ': 67, 'प': 68, 'फ': 69, 'ब': 70, 'भ': 71, 'म': 72, 'य': 73, 'र': 74, 'ऱ': 75, 'ल': 76, 'ळ': 77, 'ऴ': 78, 'व': 79, 'श': 80, 'ष': 81, 'स': 82, 'ह': 83, 'ऺ': 84, 'ऻ': 85, '़': 86, 'ऽ': 87, 'ा': 88, 'ि': 89, 'ी': 90, 'ु': 91, 'ू': 92, 'ृ': 93, 'ॄ': 94, 'ॅ': 95, 'ॆ': 96, 'े': 97, 'ै': 98, 'ॉ': 99, 'ॊ': 100, 'ो': 101, 'ौ': 102, '्': 103, 'ॎ': 104, 'ॏ': 105, 'ॐ': 106, '॑': 107, '॒': 108, '॓': 109, '॔': 110,