In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import torch
import pandas as pd
import numpy as np

from surnames.data import SurnameClassificationDataset, generate_batches

In [3]:
# notable directories
SOURCE_DIR = Path('..')
DATA_DIR = SOURCE_DIR / 'data'

In [4]:
# load data
df = pd.read_csv(DATA_DIR / 'surnames_with_splits.csv')
df.split.unique()

array(['train', 'val', 'test'], dtype=object)

In [5]:
from torch import nn

from surnames.training import Trainer
from surnames.utils import handle_dirs
from surnames.models import SurnameClassifier

# create dataset
dataset = SurnameClassificationDataset.from_dataframe(df)
dataset.set_split('train')

# grab vectorizer
vectorizer = dataset.surname_vectorizer

# model
surname_classifier = SurnameClassifier(
    char_embedding_dim=100, 
    char_vocab_size=len(vectorizer.surname_vocab),
    rnn_hidden_size=50, # this represents the entire sequence
    nb_categories=len(vectorizer.origin_vocab),
    padding_idx=vectorizer.surname_vocab.mask_index
)

loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(surname_classifier.parameters(), lr=0.001)
MODELS_DIR = SOURCE_DIR / 'models'
MODEL_DIR = MODELS_DIR / 'SurnameClassifier'
handle_dirs(MODELS_DIR)
handle_dirs(MODEL_DIR)
device = "cude" if torch.cuda.is_available() else "cpu"

trainer = Trainer(generate_batches, optimizer, surname_classifier, MODEL_DIR, loss_func, device)

# training params
nb_epochs = 20
batch_size = 64
trainer.run(nb_epochs, dataset, batch_size, checkpoint=True)

In [6]:
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
MODELS_DIR = SOURCE_DIR / 'models'
MODEL_DIR = MODELS_DIR / 'special4'
handle_dirs(MODELS_DIR)
handle_dirs(MODEL_DIR)
device = "cude" if torch.cuda.is_available() else "cpu"

# training params
trainer = Trainer(generate_batches, optimizer, model, MODEL_DIR, loss_func, device)
trainer.run(30, dataset, 32, checkpoint=True, verbose=True)

NameError: name 'model' is not defined

In [None]:
def compute_batch_accuracy(y_pred, y_true):
    correct = (y_pred.eq(y_true)).sum()
    return correct.item() / y_pred.shape[0]

def inference(surname, vectorizer, model):
    vector = vectorizer.vectorize_surname(surname, max_len=10)
    tensor = torch.tensor(vector).unsqueeze(0)
    out = model(tensor).squeeze()
    cat_index = torch.argmax(out)
    return vectorizer.origin_vocab.lookup_index(cat_index)

def test(dataset, model):
    dataset.set_split('test')
    accuracy = 0
    nb_bacthes = 0
    for batch_gen in generate_batches(dataset, batch_size=batch_size):
        x_test = batch_gen['x_data']
        y_true = batch_gen['y_target']
        y_pred = model(x_test)
        categories = torch.argmax(y_pred, dim=1)
        accuracy += compute_batch_accuracy(categories, y_true)
        nb_bacthes += 1
    accuracy /= nb_bacthes
    accuracy_str = f'{accuracy:.2%}'
    print(accuracy_str)

test(dataset, model)

In [None]:
lst = [
    'Biden', 
    'Merkel', 
    'Johnson', 
    'Trudeau', 
    'Jinping', 
    'Putin', 
    'Modi', 
    'Gautier', 
    'Macron', 
    'De Croo', 
    'Rutte',
    'Sanchez']
for name in lst:
    prediction = inference(name, dataset.surname_vectorizer, surname_classifier)
    print(f'{name} -> {prediction}')

In [None]:
inference('Trudeau', dataset.surname_vectorizer, surname_classifier)