## O objetivo do notebook é criar pipelines de dados que lidem com generators (situação bastante comum com grandes datasets)

In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [51]:
import pandas as pd

In [64]:
class Estimator(nn.Module):
    """Classifier head: Conv1d -> MaxPool1d -> five dense layers -> channel mean.

    The dense stack acts on the last axis, so it is applied to every
    convolution channel independently; the per-channel class scores are then
    averaged over the channel axis.

    Args:
        num_classes: Number of output scores per example.
        in_channels: Size of axis 1 of the input, as required by ``nn.Conv1d``.
            Defaults to 42, the value that was previously hard-coded.
        input_length: Size of the last axis of the input. Used to derive the
            input width of the first dense layer. The default of 15
            reproduces the previously hard-coded width of 3.

    Raises:
        ValueError: If ``input_length`` is too short to leave at least one
            position after the convolution and the pooling.
    """

    # Fixed convolution / pooling hyper-parameters.
    CONV_CHANNELS = 30
    KERNEL_SIZE = 4
    POOL_SIZE = 4

    def __init__(self,
                 num_classes,
                 in_channels=42,
                 input_length=15):
        super().__init__()
        # Conv1d (stride 1, no padding) leaves L - k + 1 positions; MaxPool1d
        # (stride defaults to its kernel size) then floor-divides by the pool
        # size. The first dense layer must match that pooled length.
        pooled_length = (input_length - self.KERNEL_SIZE + 1) // self.POOL_SIZE
        if pooled_length < 1:
            raise ValueError(
                f"input_length={input_length} is too short for "
                f"kernel_size={self.KERNEL_SIZE} and pool_size={self.POOL_SIZE}"
            )

        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict keys are unchanged.
        self.conv = nn.Conv1d(in_channels=in_channels,
                              out_channels=self.CONV_CHANNELS,
                              kernel_size=self.KERNEL_SIZE)
        self.max_pool = nn.MaxPool1d(kernel_size=self.POOL_SIZE)
        self.dense_1 = nn.Linear(in_features=pooled_length, out_features=1024)
        self.dense_2 = nn.Linear(in_features=1024, out_features=512)
        self.dense_3 = nn.Linear(in_features=512, out_features=256)
        self.dense_4 = nn.Linear(in_features=256, out_features=128)
        self.dense_5 = nn.Linear(in_features=128, out_features=num_classes)

    def forward(self, x):
        """Return class scores for ``x`` of shape (batch, in_channels, input_length)."""
        net = self.max_pool(self.conv(x))
        # The former unsqueeze(1)/squeeze(1) pair around the dense stack was a
        # no-op (nn.Linear only touches the last axis), so it is omitted.
        for dense in (self.dense_1, self.dense_2, self.dense_3, self.dense_4):
            net = F.relu(dense(net))
        net = self.dense_5(net)
        # Average the per-channel scores over the convolution-channel axis.
        return net.mean(1)

In [65]:
class Network(nn.Module):
    """Compose an embedding module with an estimator module.

    Args:
        embedding: Module applied to the raw input first.
        estimator: Module applied to the embedding's output.
    """

    def __init__(self, embedding, estimator):
        super().__init__()
        # Registered in this order so the child modules are named
        # "embed" and "estimator".
        self.embed = embedding
        self.estimator = estimator

    def forward(self, x):
        """Return ``estimator(embed(x))``."""
        return self.estimator(self.embed(x))

In [66]:
def categorical_accuracy(preds, y):
    """
        Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

        Args:
            preds: Tensor of per-class scores; the predicted class is the
                argmax along dim 1.
            y: Tensor of target class indices; its first dimension is used
                as the batch size.

        Returns:
            A float32 tensor of shape (1,) holding the fraction of correct
            predictions.
    """
    predicted_classes = preds.argmax(dim=1)  # index of the highest score
    num_correct = predicted_classes.eq(y).sum()
    # Build the divisor on the same device as the counts: the legacy
    # torch.FloatTensor([...]) constructor always allocates on the CPU.
    batch_size = torch.tensor([y.shape[0]],
                              dtype=torch.float,
                              device=num_correct.device)
    return num_correct / batch_size

In [67]:
def fit(model,
        data,
        optimizer,
        criterion,
        steps=100):
    """Train ``model`` on ``data`` and return the mean batch loss and accuracy.

    Args:
        model: Module called on each batch's inputs.
        data: Iterable yielding ``(batch_x, batch_y)`` pairs. It is iterated
            once per step, so a one-shot iterator only contributes batches
            during the first step.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        criterion: Loss callable invoked as ``criterion(predictions, batch_y)``.
        steps: Number of passes over ``data``. Defaults to 100, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` averaged over the batches that
        were actually processed, or ``(0.0, 0.0)`` if there were none.
    """
    total_loss = 0.0
    total_acc = 0.0
    num_batches = 0

    model.train()
    for _ in range(steps):
        for batch_x, batch_y in data:
            # Clear gradients first so nothing stale leaks into this update.
            optimizer.zero_grad()

            predictions = model(batch_x)
            loss = criterion(predictions, batch_y)
            # Each batch builds a fresh graph that is backpropagated once,
            # so the graph does not need to be retained.
            loss.backward()
            optimizer.step()

            acc = categorical_accuracy(predictions, batch_y)

            total_loss += loss.item()
            total_acc += acc.item()
            num_batches += 1

    if num_batches == 0:
        return 0.0, 0.0
    # Normalise by the real batch count instead of a fixed constant so the
    # result is a mean whatever the size of ``data``.
    return total_loss / num_batches, total_acc / num_batches

In [68]:
def evaluate(model,
             data,
             criterion,
             steps=100):
    """Evaluate ``model`` on ``data`` without tracking gradients.

    Args:
        model: Module called on each batch's inputs (switched to eval mode).
        data: Iterable yielding ``(batch_x, batch_y)`` pairs. It is iterated
            once per step, so a one-shot iterator only contributes batches
            during the first step.
        criterion: Loss callable invoked as ``criterion(predictions, batch_y)``.
        steps: Number of passes over ``data``. Defaults to 100, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(mean_loss, mean_accuracy)`` averaged over the batches that
        were actually processed, or ``(0.0, 0.0)`` if there were none.
    """
    total_loss = 0.0
    total_acc = 0.0
    num_batches = 0

    model.eval()
    with torch.no_grad():
        for _ in range(steps):
            for batch_x, batch_y in data:
                predictions = model(batch_x)

                loss = criterion(predictions, batch_y)
                acc = categorical_accuracy(predictions, batch_y)

                total_loss += loss.item()
                total_acc += acc.item()
                num_batches += 1

    if num_batches == 0:
        return 0.0, 0.0
    # Normalise by the real batch count instead of a fixed constant so the
    # result is a mean whatever the size of ``data``.
    return total_loss / num_batches, total_acc / num_batches

In [69]:
# Load the whole CSV into a single DataFrame; the cells that follow read its
# `query_string` and `output` columns.
dataset = pd.read_csv('tabular_data.csv')

In [72]:
# Largest number of space-separated pieces found in any `query_string` value.
max_len = max(len(query.split(" ")) for query in dataset.query_string)

In [73]:
# Materialise the `query_string` column as a plain Python list.
text = list(dataset['query_string'])

In [74]:
# Distinct label prefixes (the part of `output` before the first '/'), sorted.
lookup_labels = sorted({label.split('/')[0] for label in dataset.output})

In [75]:
# Map every label to its position in the sorted label list.
lookup = {label: index for index, label in enumerate(lookup_labels)}

In [93]:
# Tokenizer configured with char_level=True and to_lower=True (presumably
# character-level tokens on lower-cased text — confirm against the class).
# NOTE(review): `Tokenizer` is not defined or imported in the visible cells.
tokenizer = Tokenizer(char_level=True,
                      to_lower=True)

In [94]:
# Feed the list of query strings to the tokenizer's vocabulary builder.
tokenizer.build_vocab(text)

In [99]:
# With `chunksize`, read_csv returns an iterator of DataFrame chunks (up to
# 256 rows each) instead of one DataFrame; only the two listed columns are read.
# NOTE(review): this reads 'data.csv', whereas the vocabulary and labels were
# built from 'tabular_data.csv' — confirm that the two files are meant to differ.
dataset_generator = pd.read_csv('data.csv',
                                usecols=['query_string', 'output'],
                                chunksize=256)

In [100]:
# Wrap the chunk iterator and the tokenizer in a `Dataset` object.
# NOTE(review): `Dataset` is not defined or imported in the visible cells.
# NOTE(review): this rebinds `dataset`, which earlier cells use as a pandas
# DataFrame; re-running those cells afterwards will see this object instead.
dataset = Dataset(data_generator=dataset_generator,
                  tokenizer=tokenizer)

In [101]:
# Unpack the three results of `split` as train, test, val (in that order).
# NOTE(review): the meaning of batch_size / max_len / input_dim /
# lookup_labels is defined by `Dataset.split`, which is not visible here;
# input_dim=42 equals the embedding's num_embeddings, so it is presumably the
# vocabulary size — confirm.
train, test, val = dataset.split(batch_size=256,
                                 max_len=max_len,
                                 input_dim=42,
                                 lookup_labels=lookup)

In [81]:
# Embedding table with 42 entries, each a 15-dimensional vector.
embedding = nn.Embedding(num_embeddings=42,
                         embedding_dim=15)

In [82]:
# Classifier head producing 47 class scores.
# NOTE(review): 47 is hard-coded; presumably it should equal len(lookup) — confirm.
estimator = Estimator(num_classes=47)

In [83]:
# Full model: the embedding followed by the estimator.
model = Network(embedding=embedding,
                estimator=estimator)

In [84]:
# Adam over all model parameters, using the optimizer's default hyper-parameters.
optimizer = optim.Adam(model.parameters())

In [85]:
# Cross-entropy loss with default settings; `fit` and `evaluate` apply it to
# the model's outputs and the batch targets.
criterion = nn.CrossEntropyLoss()

In [86]:
# Number of iterations of the training/validation loop (`range(epochs)`).
epochs = 100

In [None]:
# Run `epochs` rounds of training followed by validation, printing the
# metrics of each round.
print("Initializing training and validation")
# Lowest validation loss observed so far.
best_valid_loss = float('inf')
for epoch in range(epochs):
    train_loss, train_acc = fit(model, train, optimizer, criterion)

    valid_loss, valid_acc = evaluate(model, val, criterion)

    # Keep the running minimum current; it was previously initialised but
    # never updated, so it stayed at infinity.
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Initializing training and validation
Epoch: 01
	Train Loss: 3.294 | Train Acc: 12.66%
	 Val. Loss: 3.169 |  Val. Acc: 16.38%
Epoch: 02
	Train Loss: 3.075 | Train Acc: 19.86%
	 Val. Loss: 3.005 |  Val. Acc: 20.58%
Epoch: 03
	Train Loss: 2.940 | Train Acc: 22.23%
	 Val. Loss: 2.872 |  Val. Acc: 23.60%
Epoch: 04
	Train Loss: 2.787 | Train Acc: 26.19%
	 Val. Loss: 2.728 |  Val. Acc: 26.99%
Epoch: 05
	Train Loss: 2.723 | Train Acc: 28.13%
	 Val. Loss: 2.697 |  Val. Acc: 28.61%
Epoch: 06
	Train Loss: 2.668 | Train Acc: 29.35%
	 Val. Loss: 2.626 |  Val. Acc: 30.48%
