In [1]:
# (Re)load the autoreload extension and set it to mode 2, so edits to imported
# local modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# Use the most detailed traceback format when a cell raises.
%xmode Verbose

Exception reporting mode: Verbose


In [2]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torchtext import data, datasets
import torchtext

import tqdm
import random

from TwitterPipeline import TwitterPipeline

## some constants

In [3]:
# Single seed shared by every random number generator used in this notebook.
SEED = 762
# Input file for training. NOTE(review): the '.try' file is presumably a small
# trial subset; switch to the commented line below for the full training set.
IN_FILE = 'germeval2018.try.txt'
#IN_FILE = 'germeval2018.training.txt'
IN_FILE_TEST = 'germeval2018.test.txt'
# Number of examples per mini-batch handed to the iterators.
BATCH_SIZE = 16

# Seed Python's own RNG as well: torchtext's dataset splitting and iterator
# shuffling default to the state of the `random` module, so seeding torch
# alone does not make the data pipeline reproducible.
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

## Torchtext
Torchtext is an add-on to PyTorch that provides data handlers and other tools for training models on language data.

### define torchtext.Field instances

Fields are explained in the [torchtext docs](https://torchtext.readthedocs.io/en/latest/data.html#field) and shown at work in a [blog post from Allen Nie](http://anie.me/On-Torchtext/)

In [None]:
# define Fields
# HINT: don't specify a tokenizer here

# assign single fields to map

## Spacy
A tool from the Berlin-based company Explosion AI that has offered models for several languages right from the start. Initially used mainly as a tokenizer, since version 2 it has provided one of the best lemmatizers and POS taggers for German.

[Usage](https://spacy.io/usage/)

[Full Documentation](https://spacy.io/api/) 

In [4]:
# create a spacy pipeline
# HINT: a simple one - maybe even without setting the model to use - is easier

# pre-process training data

## Splitting of data
We need to have 3 separate chunks of data: train, validation and test data

Torchtext has methods to help us with that. See [data.Dataset.split()](https://torchtext.readthedocs.io/en/latest/data.html#torchtext.data.Dataset.split)

In [None]:
# do the splitting with torchtext

In [None]:
# Sanity check: number of examples in each split.
# NOTE(review): trn_ds / val_ds / tst_ds are not defined anywhere in the visible
# notebook; they are expected to be created in the (still empty) splitting cell above.
print(f'train len {len(trn_ds.examples)}')
print(f'val len {len(val_ds.examples)}')
print(f'test len {len(tst_ds.examples)}')

## Vocabulary

In [5]:
# build vocab
# validation + test data should by no means influence the model, so build the vocab just on trn


In [None]:
# Report the vocabulary size of each field.
# NOTE(review): f_text / f_lemma / f_label are not defined in the visible notebook;
# they are expected to come from the field-definition and build-vocab cells above.
print(f'text vocab size {len(f_text.vocab)}')
print(f'lemma vocab size {len(f_lemma.vocab)}')
print(f'label vocab size {len(f_label.vocab)}')

## Iterator for Training loop

In [None]:
# Build one bucketing iterator per split (train / validation / test).
def _text_length(example):
    """Sort key for bucketing: length of the example's `text` attribute."""
    return len(example.text)


# Options shared by all three iterators.
# NOTE(review): device=-1 is kept as-is; its meaning depends on the installed
# torchtext version (older releases used -1 for CPU) -- confirm.
_iterator_options = dict(
    batch_size=BATCH_SIZE,
    device=-1,
    sort_key=_text_length,
    sort_within_batch=False,
    repeat=False,
)

trn_iter, val_iter, tst_iter = data.BucketIterator.splits(
    (trn_ds, val_ds, tst_ds),
    **_iterator_options,
)


## The model

In [7]:
class SimpleRNN(nn.Module):
    """Minimal recurrent binary classifier: embedding -> RNN -> linear layer.

    Args:
        vocab_dim: number of rows in the embedding table (vocabulary size).
        emb_dim: size of each token embedding vector.
        hidden_dim: size of the RNN hidden state.
    """

    def __init__(self, vocab_dim, emb_dim=100, hidden_dim=200):
        # nn.Module has to be initialised before sub-modules are assigned,
        # otherwise their parameters are never registered and
        # model.parameters() cannot be used by the optimizer.
        super().__init__()
        self.embedding = nn.Embedding(vocab_dim, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Compute one raw logit per sentence.

        Args:
            x: integer tensor of token indices, Tensor[sentence len, batch size].

        Returns:
            Tensor[batch size, 1] of un-normalised logits. The trailing
            dimension is kept because train()/evaluate() call `.squeeze(1)`
            on the result to obtain Tensor[batch size].
        """
        # x type is Tensor[sentence len, batch size]. Internally pytorch does not use 1-hot
        embedded = self.embedding(x)      # [sentence len, batch size, emb_dim]
        _, hidden = self.rnn(embedded)    # hidden: [1, batch size, hidden_dim]

        # classify from the final hidden state of the single RNN layer
        return self.fc(hidden.squeeze(0))

## Metric to show model status and progress

In [12]:
def binary_accuracy(preds, y):
    """
    return accuracy per batch as ratio of correct/all

    Args:
        preds: tensor of raw logits (before the sigmoid).
        y: tensor of 0/1 targets, comparable element-wise with `preds`.

    Returns:
        0-dim float tensor: fraction of elements whose rounded sigmoid
        output equals the target.
    """

    # round predictions to the closest integer
    # (torch.sigmoid replaces the deprecated F.sigmoid)
    rounded_preds = torch.round(torch.sigmoid(preds))
    # convert into float for averaging
    pred_is_correct = (rounded_preds == y).float()
    # mean() divides by the total number of elements, so the ratio is also
    # correct for inputs with more than one dimension (len() only counts dim 0)
    return pred_is_correct.mean()

## training function (single epoch)

In [9]:
def train(model, iterator, optimizer, criterion, metric):
    """Run one optimisation pass (one epoch) over `iterator`.

    For each batch the gradients are reset, the model is applied to
    `batch.text` (dimension 1 of the output is squeezed away), loss and
    metric are computed against `batch.label`, and one optimizer step is taken.

    Returns:
        Tuple `(mean loss, mean metric)`, both averaged over `len(iterator)`.
    """
    model.train()

    total_loss, total_score = 0.0, 0.0
    for minibatch in iterator:
        optimizer.zero_grad()

        logits = model(minibatch.text).squeeze(1)
        targets = minibatch.label
        batch_loss = criterion(logits, targets)
        batch_score = metric(logits, targets)

        batch_loss.backward()
        optimizer.step()

        total_loss += batch_loss.item()
        total_score += batch_score.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_score / n_batches


## evaluation (single epoch)

In [10]:
def evaluate(model, iterator, criterion, metric):
    """Score `model` on every batch of `iterator` without updating it.

    The model is put into eval mode and all forward passes run under
    `torch.no_grad()`. Loss and metric are computed from `batch.text` and
    `batch.label` exactly as in training, but no backward pass or optimizer
    step takes place.

    Returns:
        Tuple `(mean loss, mean metric)`, both averaged over `len(iterator)`.
    """
    model.eval()

    total_loss, total_score = 0.0, 0.0
    with torch.no_grad():
        for minibatch in iterator:
            logits = model(minibatch.text).squeeze(1)
            targets = minibatch.label
            batch_loss = criterion(logits, targets)
            batch_score = metric(logits, targets)

            total_loss += batch_loss.item()
            total_score += batch_score.item()

    n_batches = len(iterator)
    return total_loss / n_batches, total_score / n_batches


## model parameters

In [None]:
# Hyper-parameters
EMB_SIZE = 100        # passed to SimpleRNN as emb_dim
HID_SIZE = 200        # passed to SimpleRNN as hidden_dim
NUM_LIN = 3           # NOTE(review): not referenced anywhere in the visible notebook
NUM_EPOCH = 5         # number of passes of the training loop
LEARNING_RATE = 1e-3  # step size handed to the optimizer

# RNN variant SETUP: model, plain SGD optimizer, and a loss that works on raw logits
model = SimpleRNN(
    vocab_dim=len(f_text.vocab),
    emb_dim=EMB_SIZE,
    hidden_dim=HID_SIZE,
)
optimizer = optim.SGD(params=model.parameters(), lr=LEARNING_RATE)
criterion = nn.BCEWithLogitsLoss()

## training loop

In [None]:
# One training pass followed by one validation pass per epoch; report both.
for epoch in range(NUM_EPOCH):
    train_loss, train_acc = train(
        model=model,
        iterator=trn_iter,
        optimizer=optimizer,
        criterion=criterion,
        metric=binary_accuracy,
    )
    valid_loss, valid_acc = evaluate(
        model=model,
        iterator=val_iter,
        criterion=criterion,
        metric=binary_accuracy,
    )

    epoch_report = f'EPOCH: {epoch:02} - TRN_LOSS: {train_loss:.3f} - TRN_ACC: {train_acc*100:.2f}% - VAL_LOSS: {valid_loss:.3f} - VAL_ACC: {valid_acc*100:.2f}%'
    print(epoch_report)



In [None]:
# Final score on the held-out test iterator.
test_loss, test_acc = evaluate(
    model=model,
    iterator=tst_iter,
    criterion=criterion,
    metric=binary_accuracy,
)
test_report = f'TEST_LOSS: {test_loss:.3f}, TEST_ACC: {test_acc*100:.2f}%'
print(test_report)