<a href="https://colab.research.google.com/github/manishiitg/ML_Experiments/blob/master/nlp/RNN_Indian_Name_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Generate Indian Names Randomly via Character RNNs**


Purpose of this NN is to use character RNNs to be able to generate randome names.

The NN will train on indian names data from kaggle.

Network is designed to predict the next character based a sequence of previous characters


In [0]:
# Let's make sure the kaggle.json file is present.
!ls -lha kaggle.json
# Next, install the Kaggle API client.
!pip install -q kaggle
# The Kaggle API client expects this file to be in ~/.kaggle,
# so move it there.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# This permissions change avoids a warning on Kaggle tool startup.
!chmod 600 ~/.kaggle/kaggle.json

-rw-r--r-- 1 root root 66 Nov 26 12:20 kaggle.json


In [0]:
!kaggle datasets download -d chaitanyapatil7/indian-names

indian-names.zip: Skipping, found more recently modified local copy (use --force to force download)


In [0]:
!unzip indian-names.zip

Archive:  indian-names.zip
replace Indian-Female-Names.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Indian-Female-Names.csv  
replace Indian-Male-Names.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Indian-Male-Names.csv   


Till now we have downloaded and setup our data

In [0]:
import torch
from torch import nn

import torch.nn.functional as F

import numpy as np

from sklearn.metrics import accuracy_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [0]:
import unicodedata
import string

all_letters = string.ascii_letters + " .,;'-"

def findFiles(path): return glob.glob(path)
# Read a file and split into lines

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    lines = [line.split(",")[0].strip() for line in lines]
    return [unicodeToAscii(line) for line in lines]

# Build the category_lines dictionary, a list of lines per category

lines1 = readLines("Indian-Female-Names.csv")
lines2 = readLines("Indian-Male-Names.csv")

print("female names", len(lines1))
print("male names", len(lines2))

lines = lines1 + lines2

print("total names", len(lines))

female names 15383
male names 14846
total names 30229


Read the names from the csv files 

In [0]:
data = []
for l in lines:
   data += list(l)

chars = list(set(data))

# start_char = "<"
# end_char = ">"

# chars.append(start_char)
# chars.append(end_char)

data_size, vocab_size = len(data), len(chars)

print('data has %d characters, %d unique.' % (data_size, vocab_size))

n_chars = len(chars)

print(chars)

data has 274545 characters, 29 unique.
['e', 'p', 'd', 's', 'f', 'q', 't', ' ', 'v', 'y', '.', 'u', 'r', 'i', 'h', 'k', 'c', 'j', 'b', 'a', 'n', 'z', 'o', 'w', 'm', 'x', 'l', '-', 'g']


In [0]:
char_to_ix = {ch:i for i, ch in enumerate(chars)}
ix_to_char = {i:ch for i, ch in enumerate(chars)}
print('char_to_ix', char_to_ix)
print('ix_to_char', ix_to_char)

char_to_ix {'e': 0, 'p': 1, 'd': 2, 's': 3, 'f': 4, 'q': 5, 't': 6, ' ': 7, 'v': 8, 'y': 9, '.': 10, 'u': 11, 'r': 12, 'i': 13, 'h': 14, 'k': 15, 'c': 16, 'j': 17, 'b': 18, 'a': 19, 'n': 20, 'z': 21, 'o': 22, 'w': 23, 'm': 24, 'x': 25, 'l': 26, '-': 27, 'g': 28}
ix_to_char {0: 'e', 1: 'p', 2: 'd', 3: 's', 4: 'f', 5: 'q', 6: 't', 7: ' ', 8: 'v', 9: 'y', 10: '.', 11: 'u', 12: 'r', 13: 'i', 14: 'h', 15: 'k', 16: 'c', 17: 'j', 18: 'b', 19: 'a', 20: 'n', 21: 'z', 22: 'o', 23: 'w', 24: 'm', 25: 'x', 26: 'l', 27: '-', 28: 'g'}


In [0]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers, max_seq_length):
        super(Model, self).__init__()

        # Defining some parameters
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        #Defining the layers
        # RNN Layer
        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)   

        # https://discuss.pytorch.org/t/could-someone-explain-batch-first-true-in-lstm/15402

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim * max_seq_length, output_size)

        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x):
        
        batch_size = x.size(0)

        #Initializing hidden state for first input using method defined below
        hidden = self.init_hidden(batch_size)

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden)
        # print(out.shape, 'rnn output snape')
        # Reshaping the outputs such that it can be fit into the fully connected layer
        # basically we are concating all the letters together. 
        # the size would be hidden_dim * max_seq_length
        # since our purpose to predect the next character i.e a single character 
        # our network should output only a single character
        out = out.contiguous().view(batch_size, -1)
        # print(out.shape, " new out shape")
        
        
        out = self.dropout(out)
        out = self.fc(out)

        # print(out.shape, 'single layer output shape')
        
        
        out = self.softmax(out)

        return out, hidden
    
    def init_hidden(self, batch_size):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
         # We'll send the tensor holding the hidden state to the device we specified earlier as well
        return hidden

In [0]:
max_seq_length = 20
# this will maximum length of a name
# this is required to define the network

model = Model(input_size=n_chars, output_size=n_chars, hidden_dim=12, n_layers=1, max_seq_length=max_seq_length)

# We'll also set the model to the device that we defined earlier (default is CPU)
model = model.to(device)

# Define hyperparameters
n_epochs = 1000
lr=0.001

# Define Loss, Optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [0]:
def one_hot_encode_batch(sequence, dict_size, seq_len, batch_size):
    # Creating a multi-dimensional array of zeros with the desired output shape

    # we need to have all seq of same length or else cannot create this array
    # if we don't have batch_size i.e if we process one input at a time. we 
    # don't need to have sequence of same length

    tensor = torch.zeros(batch_size, seq_len, dict_size)

    # len(sequence) and seq_len will be same in case when we are standaring the lenght!

    # features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)
    
    # Replacing the 0 at the relevant character index with a 1 to represent that character
    for i in range(batch_size):
        for li in range(len(sequence[i])):
          tensor[i][li][sequence[i][li]] = 1
    return tensor

In [0]:
# general utility function but not used as such
def one_hot_encode(line, dict_size):
    tensor = torch.zeros(len(line), dict_size)
    for li in range(len(line)):
        tensor[li][line[li]] = 1
    return tensor

In [0]:
def targetTensor(seq):
  tensor = torch.zeros(len(seq))
  for i in range(len(seq)):
    line = seq[i]
    tensor[i] = line
  return tensor.long()

Target tensor is a simple tensor of indexes.
The reason for this because we are use NN Loss, and for we need to provide indexes of character. Its basically is a classification problem with characters a label. 

So our network will output software of possible charaters and we need to compare that with index's of the actual character 

This is important to understand

In [0]:
def train(epoch = -1):
  tloss = 0
  itrloss = 0
  for line_no in range(len(lines)):
    name = lines[line_no]
    name = name.strip()

    if len(name) <= 1:
      continue

    input_seq_idx = []
    target_char_idx = []
    if len(name) >= max_seq_length: 
      # print(name , " bigger than max seq length FYI")
      name = name[0:max_seq_length - 1]
    
    for i in range(1, len(name)):
      # input_seq = name[0:i] if i != 0 else start_char
      input_seq = name[0:i]
      target_char = name[i]

      # input_seq = input_seq + end_char
      input_seq = input_seq

      # print(input_seq," ==== ", target_char)

      input_seq_idx.append([char_to_ix[ch] for ch in input_seq])
      target_char_idx.append(char_to_ix[target_char])

    input_encoded = one_hot_encode_batch(input_seq_idx, n_chars, max_seq_length, len(input_seq_idx))
    target_encoded = targetTensor(target_char_idx)

    input_encoded = input_encoded.to(device)
    target_encoded = target_encoded.to(device)
    # print(input_encoded.shape, "input shape")
    

    optimizer.zero_grad()
    output, hidden = model(input_encoded)
    output = output.to(device)
    # output = output

    # print(output.shape, 'final nn ouput shape')
    # print(target_encoded.shape, "target shape")
    # print(target_encoded, "target")

    # The input is expected to contain scores for each class.
    # input has to be a 2D Tensor of size (minibatch, C).
    # This criterion expects a class index (0 to C-1) as the target for each value of a 1D tensor of size minibatch

    loss = criterion(output, target_encoded)
    loss.backward() # Does backpropagation and calculates gradients
    optimizer.step() # Updates the weights accordingly
    tloss += loss 
    itrloss += loss

    

    if line_no%1000 == 0 and epoch == -1:
      print("Iteration Loss: {}/{}...".format(line_no, itrloss/1000))
      prediction = output.argmax(dim=1)
      itrloss = 0

  print("Epoch: {}/ Loss: {}".format(epoch, tloss / len(lines)))
  return tloss

train()




Iteration Loss: 0/0.002708323299884796...
Iteration Loss: 1000/2.239064931869507...
Iteration Loss: 2000/2.2679617404937744...
Iteration Loss: 3000/2.25607967376709...
Iteration Loss: 4000/2.2140843868255615...
Iteration Loss: 5000/2.248345375061035...
Iteration Loss: 6000/2.2255237102508545...
Iteration Loss: 7000/2.2739601135253906...
Iteration Loss: 8000/2.292475700378418...
Iteration Loss: 9000/2.263462781906128...
Iteration Loss: 10000/2.2794179916381836...
Iteration Loss: 11000/2.2275888919830322...
Iteration Loss: 12000/2.2459630966186523...
Iteration Loss: 13000/2.2625675201416016...
Iteration Loss: 14000/2.215899705886841...
Iteration Loss: 15000/2.234445095062256...
Iteration Loss: 16000/2.498283863067627...
Iteration Loss: 17000/2.4593005180358887...
Iteration Loss: 18000/2.4239258766174316...
Iteration Loss: 19000/2.378148317337036...
Iteration Loss: 20000/2.383873224258423...
Iteration Loss: 21000/2.3793511390686035...
Iteration Loss: 22000/2.4287781715393066...
Iteration 

tensor(70119.9766, device='cuda:0', grad_fn=<AddBackward0>)

In [0]:
import random

def eval():
  x = random.randrange(100,500)
  # print("index" , x)
  eval_lines = lines[x:x+200]

  input_seq_idx = []
  target_char_idx = []
  for line_no in range(len(eval_lines)):
      name = lines[line_no]
      name = name.strip()

      if len(name) == 0:
        continue

      if len(name) >= max_seq_length: 
        name = name[0:max_seq_length - 1]
      
      for i in range(1, len(name)):
        input_seq = name[0:i]
        target_char = name[i]

        # input_seq = input_seq + end_char
        input_seq = input_seq

        input_seq_idx.append([char_to_ix[ch] for ch in input_seq])
        target_char_idx.append(char_to_ix[target_char])

  input_encoded = one_hot_encode_batch(input_seq_idx, n_chars, max_seq_length, len(input_seq_idx))
  target_encoded = targetTensor(target_char_idx)

  input_encoded = input_encoded.to(device)
  target_encoded = target_encoded.to(device)

  # print(input_encoded.shape, "input shape")
  # print(target_encoded.shape, "target shape")

  with torch.no_grad():

    output, _ = model(input_encoded)

    output = output.to(device)

    prediction = output.argmax(dim=1).cpu()
    acc = accuracy_score(target_encoded.cpu(), prediction)
    print("Accuracy {}".format(acc))


eval()

Accuracy 0.23433048433048434


In [0]:
epochs = 20
for epoch in range(epochs):
  train(epoch)
  eval()

Epoch: 0/ Loss: 2.3081984519958496
Accuracy 0.22435897435897437
Epoch: 1/ Loss: 2.309570789337158
Accuracy 0.22293447293447294
Epoch: 2/ Loss: 2.308520555496216
Accuracy 0.22863247863247863
Epoch: 3/ Loss: 2.3068556785583496
Accuracy 0.23717948717948717
Epoch: 4/ Loss: 2.306570053100586
Accuracy 0.2571225071225071
Epoch: 5/ Loss: 2.3057808876037598
Accuracy 0.2336182336182336
Epoch: 6/ Loss: 2.3072760105133057
Accuracy 0.23575498575498577
Epoch: 7/ Loss: 2.307806968688965
Accuracy 0.23005698005698005
Epoch: 8/ Loss: 2.306295394897461
Accuracy 0.2336182336182336
Epoch: 9/ Loss: 2.308224678039551
Accuracy 0.22863247863247863
Epoch: 10/ Loss: 2.30735182762146
Accuracy 0.21652421652421652
Epoch: 11/ Loss: 2.308230400085449
Accuracy 0.22934472934472935
Epoch: 12/ Loss: 2.3063833713531494
Accuracy 0.23717948717948717
Epoch: 13/ Loss: 2.3082544803619385
Accuracy 0.21866096866096865
Epoch: 14/ Loss: 2.308337926864624
Accuracy 0.21723646723646722
Epoch: 15/ Loss: 2.3048954010009766
Accuracy 0.2

In [0]:
def generate(input_seq_sample):
  # input_seq_sample = input_seq_sample + end_char
  input_seq_sample = input_seq_sample
  # print(input_seq_sample)
  input_seq_idx = []
  input_seq_idx.append([char_to_ix[ch] for ch in input_seq_sample])
  input_encoded = one_hot_encode_batch(input_seq_idx, n_chars, max_seq_length, len(input_seq_idx))

  input_encoded = input_encoded.to(device)

  output, _ = model(input_encoded)

  prediction = output.argmax(dim=1).cpu().numpy().data

  text = [ix_to_char[idx] for idx in prediction ]
  # print(text)

  return "".join(text)


def generate_name(input_seq_sample, name_length):
  print("generate name with starting seq '{}' and of size '{}'".format(input_seq_sample, name_length))

  for i in range(name_length):
    ret = generate(input_seq_sample)
    # print(input_seq_sample, " :::: ", ret)
    input_seq_sample += ret

  print("Name Generated {}".format(input_seq_sample))

  return input_seq_sample

def run_random_generation():
  init_seq_size = random.randrange(0,5)
  init_seq_index = random.randrange(0, len(chars) - init_seq_size)
  init_seq = chars[init_seq_index: init_seq_index + init_seq_size]

  input_seq_sample = "".join(init_seq)
  name_length = random.randrange(1, max_seq_length-len(input_seq_sample))

  generate_name(input_seq_sample,name_length)

  

for i in range(10):
  run_random_generation()

generate_name("manish", 10)
generate_name("arun", 10)
generate_name("mahima", 10)

generate_name("dee", 3)
generate_name("man", 3)
generate_name("mahi", 2)
generate_name("z", 5)

generate name with starting seq '' and of size '11'
Name Generated ashak kumar
generate name with starting seq '' and of size '11'
Name Generated ashash kuma
generate name with starting seq 'ihk' and of size '3'
Name Generated ihkar 
generate name with starting seq 'hk' and of size '16'
Name Generated hkarekumarhararama
generate name with starting seq 'owm' and of size '2'
Name Generated owmat
generate name with starting seq 'sfq' and of size '11'
Name Generated sfqoododoreded
generate name with starting seq ' ' and of size '4'
Name Generated  rush
generate name with starting seq 'l' and of size '6'
Name Generated laxandi
generate name with starting seq 'sfqt' and of size '12'
Name Generated sfqtayanondadada
generate name with starting seq ' vy' and of size '2'
Name Generated  vyip
generate name with starting seq 'manish' and of size '10'
Name Generated manish kumararat
generate name with starting seq 'arun' and of size '10'
Name Generated aruns hhakumar
generate name with starting seq

'zakesh'

In [0]:
# criterion = nn.NLLLoss()

# output = torch.randn(10, 120).float()
# target = torch.FloatTensor(10).uniform_(0, 120).long()

# print(output.shape)
# print(target.shape)
# print(target)

# loss = criterion(output, target)