In [1]:
import os
import random 
from string import ascii_letters
import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

_= torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [2]:
data_dir = r"D:\PROJECTS\RNN_PyTorch\data\data\names"
lang2label={
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate (os.listdir(data_dir))
}

In [3]:
lang2label

{'Arabic': tensor([0]),
 'Chinese': tensor([1]),
 'Czech': tensor([2]),
 'Dutch': tensor([3]),
 'English': tensor([4]),
 'French': tensor([5]),
 'German': tensor([6]),
 'Greek': tensor([7]),
 'Irish': tensor([8]),
 'Italian': tensor([9]),
 'Japanese': tensor([10]),
 'Korean': tensor([11]),
 'Polish': tensor([12]),
 'Portuguese': tensor([13]),
 'Russian': tensor([14]),
 'Scottish': tensor([15]),
 'Spanish': tensor([16]),
 'Vietnamese': tensor([17])}

In [4]:
lang2label['Arabic']

tensor([0])

In [5]:
num_langs= len(lang2label)

In [6]:
unidecode("Ślusàrski")

'Slusarski'

In [7]:
char2idx = {letter: i for i,letter in enumerate(ascii_letters + " .,:;-'")}
num_letters= len(char2idx); num_letters

59

In [8]:
def name2tensor(name):
    tensor= torch.zeros(len(name),1,num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]]=1
    return tensor        

In [9]:
name2tensor("abc")

tensor([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]],

        [[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0.]]])

Dataset Creation

In [10]:
tensor_names = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                tensor_names.append(name2tensor(name))
                target_langs.append(lang2label[lang])
            except KeyError:
                pass
                    

In [11]:
target_langs[:5]

[tensor([0]), tensor([0]), tensor([0]), tensor([0]), tensor([0])]

In [12]:
tensor_names[:1]

[tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [13]:
len(target_langs)


20070

In [14]:
len(tensor_names)

20070

In [15]:
from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    range(len(target_langs)),
    test_size=0.1,
    shuffle=True,
    stratify=target_langs
                                       )

  array = np.asarray(array, order=order, dtype=dtype)
  array = np.asarray(array, order=order, dtype=dtype)


In [16]:
train_dataset =[
    (tensor_names[i],target_langs[i])
    for i in train_idx
]

test_dataset =[
    (tensor_names[i],target_langs[i])
    for i in test_idx
]

In [17]:
len(train_dataset)+len(test_dataset)

20070

In [18]:
print(f"Train: {len(train_dataset)}")
print(f"Test: {len(test_dataset)}")

Train: 18063
Test: 2007


Model

Simple RNN

In [19]:
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size,output_size):
        super(MyRNN,self).__init__()
        self.hidden_size= hidden_size
        self.in2hidden =nn.Linear(input_size+hidden_size, hidden_size)
        self.in2output =nn.Linear(input_size+hidden_size, output_size)
        
    def forward(self, x, hidden_state):
        combined = torch.cat((x, hidden_state),1)
        hidedn = torch.sigmoid(self.in2hidden(combined))
        output=self.in2output(combined)
        return output,hidedn
    
    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1,self.hidden_size))    

In [21]:
hidden_size = 256
learning_rate = 0.001
model = MyRNN(num_letters, hidden_size, num_langs)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


In [24]:
num_epochs = 2
print_interval = 3000

for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i,(name,label) in enumerate(train_dataset):
        hidden_state =model.init_hidden()
        for char in name:
            output, hidden_state = model(char,hidden_state)
        loss = criterion(output,label)
        
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(),1)
        optimizer.step()
        
        if (i+1)% print_interval ==0: 
            print  (
                f"Epoch [{epoch + 1}/{num_epochs}],"
                f"Step[{i + 1}/{len(train_dataset)}],"
                f"Loss: {loss.item():.4f}"
                
            )

Epoch [1/2],Step[3000/18063],Loss: 2.1150
Epoch [1/2],Step[6000/18063],Loss: 1.0445
Epoch [1/2],Step[9000/18063],Loss: 0.0646
Epoch [1/2],Step[12000/18063],Loss: 0.0087
Epoch [1/2],Step[15000/18063],Loss: 0.1009
Epoch [1/2],Step[18000/18063],Loss: 0.7074
Epoch [2/2],Step[3000/18063],Loss: 1.5462
Epoch [2/2],Step[6000/18063],Loss: 5.8079
Epoch [2/2],Step[9000/18063],Loss: 0.0118
Epoch [2/2],Step[12000/18063],Loss: 0.1388
Epoch [2/2],Step[15000/18063],Loss: 1.6468
Epoch [2/2],Step[18000/18063],Loss: 0.0560


In [37]:
num_correct = 0
num_samples = len(test_dataset)

model.eval()
with torch.no_grad():
    for name, label in test_dataset:
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)
           
print(f"Accuracy: {num_correct / num_samples * 100: .4f}%")            

Accuracy:  76.1335%


In [38]:
label2lang = {label.item(): lang for lang, label in lang2label.items()}

def myrnn_predict(name):
    model.eval()
    tensor_name = name2tensor(name)
    with torch.no_grad():
        hidden_state= model.init_hidden()
        for char in tensor_name:
            output, hidden_state = model(char, hidden_state)
        _,pred = torch.max(output, dim=1)
    model.train()
    return label2lang[pred.item()]        

In [43]:
myrnn_predict("Mike")

'Japanese'

In [44]:
myrnn_predict("Qin")

'Chinese'

In [45]:
myrnn_predict("Slaveya")

'Russian'

PyTorch GRUPermalink

In [46]:
class GRUModel(nn.Module):
    def __init__(self, num_layers, hidden_size):
        super(GRUModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size=num_letters, 
            hidden_size=hidden_size, 
            num_layers=num_layers,
        )
        self.fc = nn.Linear(hidden_size, num_langs)
    
    def forward(self, x):
        hidden_state = self.init_hidden()
        output, hidden_state = self.gru(x, hidden_state)
        output = self.fc(output[-1])
        return output
    
    def init_hidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size).to(device)

In [47]:
model = GRUModel(num_layers=2, hidden_size=hidden_size)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [48]:
for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        output = model(name)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
         
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )

Epoch [1/2], Step [3000/18063], Loss: 0.0705
Epoch [1/2], Step [6000/18063], Loss: 2.3005
Epoch [1/2], Step [9000/18063], Loss: 0.1784
Epoch [1/2], Step [12000/18063], Loss: 4.0543
Epoch [1/2], Step [15000/18063], Loss: 0.2517
Epoch [1/2], Step [18000/18063], Loss: 0.0043
Epoch [2/2], Step [3000/18063], Loss: 0.0993
Epoch [2/2], Step [6000/18063], Loss: 0.2225
Epoch [2/2], Step [9000/18063], Loss: 0.1432
Epoch [2/2], Step [12000/18063], Loss: 0.3153
Epoch [2/2], Step [15000/18063], Loss: 0.1858
Epoch [2/2], Step [18000/18063], Loss: 0.1279


In [49]:
num_correct = 0

model.eval()

with torch.no_grad():
    for name, label in test_dataset:
        output = model(name)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 81.4649%


In [57]:
def pytorch_predict(name):
    model.eval()
    tensor_name = name2tensor(name)
    with torch.no_grad():
        output = model(tensor_name)
        _, pred = torch.max(output, dim=1)
    model.train()
    return label2lang[pred.item()]

In [58]:
pytorch_predict("Jake")

'English'

In [59]:
pytorch_predict("Qin")

'Russian'

In [60]:
pytorch_predict("Fernando")

'Italian'

In [61]:
pytorch_predict("Demirkan")

'English'

In [62]:
pytorch_predict("Mike")

'Japanese'