<a href="https://colab.research.google.com/github/narendra974/insidedeeplearning/blob/main/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
zip_file_url = "https://download.pytorch.org/tutorial/data.zip"

import requests, zipfile, io
r = requests.get(zip_file_url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

In [2]:
name_language_data = {}

import unicodedata
import string

all_letters = string.ascii_letters + ".,;"
n_letters = len(all_letters)
alphabet = { }
for i in range(n_letters):
  alphabet[all_letters[i]] = i

def unicodeToAscii(s):
  return ''.join(
      c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn'
      and c in all_letters
  ) 

for zip_path in z.namelist():
  if "data/names/" in zip_path and zip_path.endswith(".txt"):
    lang = zip_path[len("data/names/"):-len("txt")]
    with z.open(zip_path) as myfile:
      lang_names = [unicodeToAscii(line).lower() for line in str(myfile.read(), encoding = 'utf-8').strip().split("\n")]
    name_language_data[lang] = lang_names
    print(lang, ":", len(lang_names))

Arabic. : 2000
Chinese. : 268
Czech. : 519
Dutch. : 297
English. : 3668
French. : 277
German. : 724
Greek. : 203
Irish. : 232
Italian. : 709
Japanese. : 991
Korean. : 94
Polish. : 139
Portuguese. : 74
Russian. : 9408
Scottish. : 100
Spanish. : 298
Vietnamese. : 73


In [9]:
from torch.utils.data import Dataset, DataLoader
import torch

class LanguageNameDataset(Dataset):
  
  def __init__(self, lang_name_dict, vocabulary):
    self.label_names = [x for x in lang_name_dict.keys()]
    self.data = []
    self.labels = []
    self.vocabulary = vocabulary
    for y, language in enumerate(self.label_names):
      for sample in lang_name_dict[language]:
        self.data.append(sample)
        self.labels.append(y)

  def __len__(self):
    return len(self.data)

  def string2InputVec(self, input_string):
    T = len(input_string)
    name_vec = torch.zeros((T), dtype=torch.long)
    for pos, character in enumerate(input_string):
      name_vec[pos] = self.vocabulary[character]
    return name_vec
  
  def __getitem__(self, idx):
    name = self.data[idx]
    label = self.data[idx]

    label_vec = torch.tensor([label], dtype=torch.long)

    return self.string2InputVec(name), label

In [11]:
dataset = LanguageNameDataset(name_language_data, alphabet)

train_data, test_data = torch.utils.data.random_split(dataset, (len(dataset)-300, 300))
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1, shuffle=True)


In [12]:
import torch.nn as nn

with torch.no_grad():
  input_sequence = torch.tensor([0, 1, 1, 0, 2], dtype=torch.long)
  embd = nn.Embedding(3, 2)  # 3 represents the vocab size in the example {0, 1, 2} represents vocabulary. 
  x_seq = embd(input_sequence)
  print(input_sequence.shape, x_seq.shape)
  print(x_seq)

torch.Size([5]) torch.Size([5, 2])
tensor([[ 0.5331,  0.4130],
        [ 0.9056,  0.2064],
        [ 0.9056,  0.2064],
        [ 0.5331,  0.4130],
        [-0.8712, -0.4594]])
