In [None]:
# Character-level RNN that predicts the gender associated with a name.
# Pipeline: prepare data -> encode -> build the network -> train -> evaluate -> deploy

In [None]:
import pandas as pd

# NOTE(review): read_csv() is called with no arguments; pandas requires a file
# path or buffer as the first argument, so the dataset location must be supplied here.
names_data = pd.read_csv()

import random

# Keep a single row per distinct 'Name'. Whether the first or the last duplicate
# survives is chosen at random once for the whole call, not per name.
names_data = names_data.drop_duplicates(subset=['Name'], keep=random.choice(['first','last']))

In [None]:
from sklearn import preprocessing
# Replace the 'Gender' column with integer class ids produced by a LabelEncoder.
le = preprocessing.LabelEncoder()
names_data['Gender'] = le.fit_transform(names_data['Gender'])

# Display labels indexed by encoded class id. LabelEncoder numbers classes in
# sorted label order; NOTE(review): this list assumes the raw labels sort as
# Female < Male — confirm against le.classes_.
genders = ['Female','Male']

In [None]:
import string
# Alphabet for the one-hot encoding: ASCII letters (both cases) plus . , ; '
all_letters = "".join([string.ascii_letters, ".,;'"])
# Number of symbols in the alphabet.
n_letters = len(all_letters)

In [None]:
import torch
def name_to_tensor(name, letters=None):
    """One-hot encode a name, one time step per character.

    Args:
        name: String to encode.
        letters: Alphabet string that defines the one-hot positions. Defaults
            to the module-level ``all_letters``.

    Returns:
        A float tensor of shape ``(len(name), 1, len(letters))``. Row ``i`` has
        a 1 at the position of ``name[i]`` within ``letters``; characters that
        are not in ``letters`` are left as all-zero rows.
    """
    if letters is None:
        letters = all_letters
    name_in_tensor = torch.zeros(len(name), 1, len(letters))
    for i, letter in enumerate(name):
        letter_index = letters.find(letter)
        # str.find returns -1 for unknown characters; indexing with -1 would
        # silently mark the LAST alphabet symbol, so unknown characters are skipped.
        if letter_index >= 0:
            name_in_tensor[i][0][letter_index] = 1
    return name_in_tensor

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-step recurrent cell with a log-softmax classification output.

    Each call to ``forward`` consumes one input vector and the previous hidden
    state, and produces class log-probabilities plus the next hidden state.
    The caller is responsible for looping over the sequence.

    Args:
        inp: Size of each input vector.
        hid: Size of the hidden state.
        out: Number of output classes.
    """
    def __init__(self, inp, hid, out):
        super(RNN, self).__init__()

        self.hidden_size = hid
        # Both layers read the concatenation [input, previous hidden].
        self.i2h = nn.Linear(inp + hid, hid)
        self.i2o = nn.Linear(inp + hid, out)

        self.softmax = nn.LogSoftmax(dim = 1)

    def forward(self, inp, hid):
        """Run one time step.

        Args:
            inp: 2-D input tensor; its columns are concatenated with ``hid``
                along dim 1.
            hid: 2-D hidden-state tensor from the previous step (or from
                ``initHidden`` for the first step).

        Returns:
            ``(output, hidden)``: log-softmax class scores over dim 1, and the
            new hidden state to pass into the next step.
        """
        combined = torch.cat((inp, hid), 1)
        hidden = self.i2h(combined)
        output = self.softmax(self.i2o(combined))

        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros(1,self.hidden_size)
    
# Size of the hidden state handed to the RNN constructor.
n_hidden = 148
# Number of output classes: one per entry in `genders`.
n_genders = len(genders)

In [None]:
# Build the network: one input unit per alphabet symbol, one output per gender class.
rnn = RNN(inp=n_letters, hid=n_hidden, out=n_genders)

In [None]:
# Number of training steps. Must be an int: range() rejects floats such as 1e6.
iterations = 1000000
# Negative log-likelihood loss; expects log-probabilities as model output.
criterion = nn.NLLLoss()
# Step size for the hand-written gradient-descent update.
lr = 0.005

def output_to_gender(output):
    """Map a model output to its gender label.

    Takes the top-1 index of ``output``, reads the entry for the first row,
    and looks it up in the module-level ``genders`` list.
    """
    _, best_index = output.topk(1)
    return genders[best_index[0].item()]

In [None]:
# Training loop: one randomly sampled name per step, updated with plain SGD.
# int() keeps range() valid even if `iterations` was written as a float (e.g. 1e6).
for iteration in range(1, int(iterations) + 1):
    # randomly pick a name
    row = random.randint(0, len(names_data) - 1)

    # NOTE(review): assumes column 0 holds the name and column 1 the encoded
    # gender id — confirm against the dataset's column order.
    name = names_data.iloc[row, 0]
    name_in_tensor = name_to_tensor(name)

    gender = int(names_data.iloc[row, 1])
    gender_in_tensor = torch.LongTensor([gender])

    hidden = rnn.initHidden()
    rnn.zero_grad()

    # unroll rnn: feed one character per time step together with the previous
    # hidden state; only the output after the last character feeds the loss
    for step in range(name_in_tensor.size()[0]):
        output, hidden = rnn(name_in_tensor[step], hidden)

    loss = criterion(output, gender_in_tensor)
    loss.backward()

    # do optim ourselves: p <- p - lr * grad, outside of autograd tracking
    with torch.no_grad():
        for p in rnn.parameters():
            p.add_(p.grad, alpha=-lr)

    # verbose: report progress on every 5000th step only
    if iteration % 5000 == 0:
        pred = output_to_gender(output)
        correct = ':)' if pred == genders[gender] else ':( (%s)' % genders[gender]
        print('iters- %d %d%% (%s) Name - %s Gender - %s %s' %
              (iteration, iteration / iterations * 100, loss.item(), name, pred, correct))

In [None]:
# Collect predicted vs. actual labels for a confusion matrix.
# Names are sampled (with replacement) from the same names_data used for
# training, so this measures fit to the training data, not generalization.

n_conf = 10000
prediction = []
actual = []

# Evaluation only, so gradient tracking is switched off.
with torch.no_grad():
    for _ in range(n_conf):
        i = random.randint(0, len(names_data) - 1)

        # NOTE(review): assumes column 0 holds the name and column 1 the
        # encoded gender id — confirm against the dataset's column order.
        name = names_data.iloc[i, 0]
        name_in_tensor = name_to_tensor(name)

        gender_idx = int(names_data.iloc[i, 1])

        hidden = rnn.initHidden()

        # Feed the name one character at a time; the final output is the prediction.
        for j in range(name_in_tensor.size()[0]):
            output, hidden = rnn(name_in_tensor[j], hidden)

        pred = output_to_gender(output)

        prediction.append(pred)
        actual.append(genders[gender_idx])

In [None]:
# Prerequisite: install the third-party pandas_ml package (it provides ConfusionMatrix).

In [None]:
from pandas_ml import ConfusionMatrix
import numpy as np

In [None]:
# Convert the collected label lists to NumPy arrays so they can be compared element-wise.
np_prediction, np_actual = np.array(prediction), np.array(actual)

In [None]:
# ConfusionMatrix takes the ground truth first and the predictions second.
# Both are reduced to booleans with 'Female' as the positive class; the
# comparison already yields a boolean array, so no np.where is needed.
cm = ConfusionMatrix(np_actual == 'Female',
                     np_prediction == 'Female')

# pyplot is never imported in this notebook, so the figure is left to the
# notebook's inline backend instead of calling plt.show(); the trailing
# semicolon suppresses the returned object's repr.
cm.plot(normalized = True);