# Classifying Text 

In this little tutorial we use PyTorch, TorchText and Byte Pair Encoding to quickly build a text classifier.

In [None]:
!pip3 install bpemb pandas torchtext torch

In [None]:
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from bpemb import BPEmb
from torchtext import data

## 1. Load the data


First, we need to download the data:

In [None]:
!wget https://www.htw-dresden.de/~guhr/dist/sample/germeval2018.training.txt
!wget https://www.htw-dresden.de/~guhr/dist/sample/germeval2018.test.txt

Now we can load the data, using pandas:

In [None]:
# Both files are read as tab-separated UTF-8 text whose first row holds the column names.
train_df = pd.read_csv("germeval2018.training.txt", sep="\t", header=0, encoding="utf-8")
test_df = pd.read_csv("germeval2018.test.txt", sep="\t", header=0, encoding="utf-8")

In [None]:
# Peek at the first rows of the raw training data.
train_df.head()

In [None]:
# Drop the unused `label2` column.
# Re-assigning with errors='ignore' (instead of inplace=True) keeps this cell safe to
# re-run: a second execution does not raise KeyError because the column is already gone.
test_df = test_df.drop(columns=['label2'], errors='ignore')
train_df = train_df.drop(columns=['label2'], errors='ignore')

## 2. Data Preprocessing

Now we can preprocess our dataset. In this step we remove all special chars and binarize our labels:

In [None]:
def clean_text(text):
    """Normalise a pandas Series of raw texts before tokenisation.

    Steps, in order: lowercase, remove '#' characters, replace URLs with the
    placeholder "URL", remove '@' characters, replace every character outside
    the allowed set (ASCII letters, digits, German umlauts and ß, parentheses,
    '!' and '?') with a space, and collapse runs of whitespace into one space.

    Args:
        text: pandas Series of strings.

    Returns:
        A new pandas Series holding the cleaned strings.
    """
    # regex=True is passed explicitly: since pandas 2.0 ``Series.str.replace``
    # treats the pattern as a literal string by default, which would silently
    # turn the regex-based steps below into no-ops.
    text = text.str.lower()
    text = text.str.replace(r"\#", "", regex=True)  # drop the '#', keep the hashtag word
    text = text.str.replace(r"http\S+", "URL", regex=True)  # URL -> placeholder
    text = text.str.replace(r"@", "", regex=True)  # drop the '@', keep the user name
    text = text.str.replace(r"[^A-Za-z0-9öäüÖÄÜß()!?]", " ", regex=True)
    text = text.str.replace(r"\s{2,}", " ", regex=True)  # raw string: "\s" is an invalid escape otherwise
    return text

def convert_label(label):
    """Binarise a label: 1 for "OFFENSE", 0 for everything else."""
    if label == "OFFENSE":
        return 1
    return 0

In [None]:
# Clean the texts and binarise the labels of both splits.
for frame in (train_df, test_df):
    frame["text"] = clean_text(frame["text"])
    # Guard against re-running this cell: convert_label maps everything that is not the
    # string "OFFENSE" to 0, so applying it to already converted 0/1 labels would
    # silently wipe out every positive label.
    if not pd.api.types.is_numeric_dtype(frame["label"]):
        frame["label"] = frame["label"].map(convert_label)

In [None]:
# This is how our data set looks now: no URLs and no @ signs :)
train_df.head()

In [None]:
# Fix the random seeds so that results are reproducible: this makes sure that you get
# the same result every time you train your model.
# Seeding torch alone is not enough -- Python's `random` module and NumPy keep their own
# generator state (batch shuffling presumably draws from `random`), so seed them as well.
# The two cuDNN flags trade speed for determinism; relax them for your final training
# run to improve performance.
SEED = 42

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

### data magic

The following class helps us to convert the pandas DataFrame into a PyTorch data set. You can skip that.

In [None]:
# source : https://gist.github.com/lextoumbourou/8f90313cbc3598ffbabeeaa1741a11c8
# to use DataFrame as a Data source

class DataFrameDataset(data.Dataset):
    """torchtext Dataset built from a pandas DataFrame with `text` and `label` columns."""

    def __init__(self, df, fields, is_test=False, **kwargs):
        """Turn every DataFrame row into a torchtext Example.

        Args:
            df: DataFrame providing a `text` and a `label` column.
            fields: list of (name, Field) pairs, ordered as (text, label).
            is_test: accepted for backward compatibility; currently unused because
                the label is read for every split.
            **kwargs: forwarded to ``data.Dataset.__init__``.
        """
        # Iterate over the two columns directly: no per-row Series is built (as
        # iterrows would do) and the frame is no longer printed on every construction.
        examples = [
            data.Example.fromlist([text, label], fields)
            for text, label in zip(df["text"], df["label"])
        ]

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        """Sort examples by the length of their text."""
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        """Build one data set per given DataFrame.

        Args:
            fields: list of (name, Field) pairs shared by all splits.
            train_df: training DataFrame, or None to skip it.
            val_df: optional validation DataFrame.
            test_df: optional test DataFrame.
            **kwargs: forwarded to the constructor of every split.

        Returns:
            Tuple with the data sets that were built, in the order
            (train, validation, test); splits whose DataFrame is None are left out.
        """
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df, data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df, data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df, data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)

## 3. Loading the pretrained word vectors

For this tutorial we are using byte pair encoding. The great [BPEmb](https://pypi.org/project/bpemb/) library helps us to encode the text and provides pretrained models for a lot of languages.

In [None]:
from collections import Counter
from torchtext import vocab

# Load the pretrained German byte pair model (vs=10000 selects the vocabulary size).
bpemb_de = BPEmb(lang="de", vs=10000)
# The Vocab constructor below takes a token counter, so wrap the BPE tokens in one.
bpemb_de_counter = Counter(bpemb_de.words)
# Token -> id mapping that follows BPEmb's own token order.
bpemb_de_stoi = {word:i for i, word in enumerate(bpemb_de.words)}

bpemb_vocab = vocab.Vocab(counter = bpemb_de_counter)
# Attach the pretrained vectors, looked up through BPEmb's token -> id mapping.
bpemb_vocab.set_vectors(stoi = bpemb_de_stoi, vectors = torch.tensor(bpemb_de.vectors), dim = bpemb_de.dim)

bpemb_vocab.stoi = bpemb_de_stoi # the Vocab assigns its own token ids, so we reset them to BPEmb's ids


The byte pair encoding turns words into tokens. Every token has an id and a corresponding vector that we can feed to our neural network.

In [None]:
# Encode one sample sentence twice: as sub-word tokens and as integer ids.
sample_sentence = "das ist ein test"

tokens = bpemb_de.encode_with_bos_eos(sample_sentence)
print(tokens)

token_ids = bpemb_de.encode_ids_with_bos_eos(sample_sentence)
print(token_ids)


In [None]:
# And this is what the vector for the "_das" token looks like
# (99 is presumably its id -- compare with the ids printed above):
bpemb_de.vectors[99]
#[bpemb_de.vectors[id] for id in token_ids] # vectors for all tokens

## 4. Load Train and Valid Data Sets

First, we define how the TEXT and the LABELs will be encoded. That's what the `Field` objects do. With these fields and the class we defined above we can create a data set.

In [None]:
# TEXT: tokenised with the BPEmb encoder, wrapped in <s> ... </s>, batch dimension first.
# "<unk>" doubles as the padding token.
TEXT = data.Field(
    tokenize=bpemb_de.encode,
    init_token='<s>',
    eos_token='</s>',
    pad_token="<unk>",
    use_vocab=True,
    batch_first=True,
    sequential=True,
)
TEXT.vocab = bpemb_vocab  # -> assign our byte pair encoding vocabulary

# LABEL: already numeric, so no vocabulary is used; stored as float.
LABEL = data.LabelField(dtype=torch.float, use_vocab=False)

fields = [('text', TEXT), ('label', LABEL)]
# Note: the test split is used as the validation set here.
train_ds, val_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=test_df)

In [None]:
# Let's look at the first example
print(vars(train_ds[0]))

### Batch Iterator

With this data set we can now create an iterator that prepares the batches for us.

In [None]:
BATCH_SIZE = 64

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per data set, all configured with the same batch size, shuffling and device.
train_iterator, valid_iterator = data.Iterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE,
    shuffle = True,    
    device = device)

In [None]:
# This is what a batch looks like. Do you know why our texts are still ids?

batch = next(iter(train_iterator))

print(batch.label)
print(batch.text)

## 5. Define the Model

Now it's finally time to define our model:

In [None]:
class SimpleModel(nn.Module):
    """Model skeleton: a frozen embedding layer initialised with pretrained vectors.

    Args:
        weights: 2-D float tensor of shape (vocab_size, embedding_dim) holding the
            pretrained vectors.
        embedding_length: kept for backward compatibility only. The embedding size is
            taken from ``weights`` itself, so a value that disagrees with the vectors
            can no longer leave the layer with a wrong ``embedding_dim``.
    """

    def __init__(self, weights, embedding_length = 100):
        super(SimpleModel, self).__init__()

        # Load the pretrained vectors into our embedding layer. from_pretrained derives
        # the vocabulary size and the vector size from `weights` (no throw-away random
        # initialisation); freeze=True keeps the vectors fixed during training.
        self.word_embeddings = nn.Embedding.from_pretrained(weights, freeze=True)

    def forward(self, input_sentences):
        """Look up the embedding vector of every token id.

        Args:
            input_sentences: integer tensor of token ids.

        Returns:
            Tensor with the shape of ``input_sentences`` plus a trailing embedding
            dimension.
        """
        embedded = self.word_embeddings(input_sentences) # <-- here we turn our ids into actual vectors

        # Since our sentences do not have an equal length, we can't simply feed them
        # into a feed forward network. How can we solve that?

        return embedded

## 6. Train the model

First we define a set of helper functions to make our life a bit easier.

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch, e.g. 0.8 (NOT 8) for 8 out of 10.

    `preds` are raw model outputs: they are passed through a sigmoid and rounded
    to the closest integer before being compared with the targets `y`.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = (predicted_labels == y).float()  # float, so that the division yields a fraction
    return hits.sum() / len(hits)

In [None]:
# we moved the training of a single batch into a method for convenience
def train(model, iterator):
    """Run one training pass of `model` over all batches of `iterator`.

    Uses the notebook-level `optimizer`, `criterion` and `binary_accuracy`.

    Returns:
        Tuple (loss, accuracy), each averaged over the number of batches.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iterator:
        optimizer.zero_grad()

        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        loss_total += batch_loss.item()
        acc_total += batch_acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [None]:
# ...same with the eval code
def evaluate(model, iterator):
    """Return the accuracy of `model` on `iterator`, averaged over the number of batches.

    The model is switched to eval mode and no gradients are tracked.
    """
    model.eval()
    acc_total = 0

    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            acc_total += binary_accuracy(logits, batch.label).item()

    return acc_total / len(iterator)

### Now we can create an instance of our model, with the pretrained byte pair vectors.

In [None]:
# Build the model from the pretrained BPEmb vectors and move it to the selected device.
model = SimpleModel(torch.tensor(bpemb_de.vectors))
model.to(device)

learning_rate = 0.001

# Loss for binary classification that takes raw logits as input.
criterion = nn.BCEWithLogitsLoss()

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
num_epochs = 10
# Per-epoch history of the training loss, training accuracy and validation accuracy.
loss=[]
acc=[]
val_acc=[]

for epoch in range(num_epochs):
    
    train_loss, train_acc = train(model, train_iterator)
    valid_acc = evaluate(model, valid_iterator)
    
    print(f'{epoch} Train Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Val. Acc: {valid_acc*100:.2f}%')    
    
    loss.append(train_loss)
    acc.append(train_acc)
    val_acc.append(valid_acc)
    


In [None]:
import matplotlib.pyplot as plt

# Plot the accuracy curves of training and validation. The title and the y-axis label
# name the plotted quantity (accuracy); the loss is not drawn.
epochs = range(1, num_epochs + 1)

fig, ax = plt.subplots()
ax.plot(epochs, acc, 'b', label='Training acc')
ax.plot(epochs, val_acc, 'r', label='validation acc')
# ax.plot(epochs, loss, 'g', label='Training loss')  # optional; note that the loss is on a different scale
ax.set_title('Training and Validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()

# Tasks

1. Implement a feed forward neural network classifier

2. Try to improve the results. What happens when you use:
    * you use more layers
    * more neurons
    * a bigger vocabulary size
    
3. Try different models:
    * Use LSTMs 
    * Did you know that you can use a CNN to classify text?