In [1]:
!pip install nltk
!pip install torch
!pip install fasttext

Collecting fasttext
[?25l  Downloading https://files.pythonhosted.org/packages/f8/85/e2b368ab6d3528827b147fdb814f8189acc981a4bc2f99ab894650e05c40/fasttext-0.9.2.tar.gz (68kB)
[K     |████████████████████████████████| 71kB 5.8MB/s 
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3097886 sha256=6264d276a1997c126543d61eb2c084a7c610d9383264b0d001a809b0f808f774
  Stored in directory: /root/.cache/pip/wheels/98/ba/7f/b154944a1cf5a8cee91c154b75231136cc3a3321ab0e30f592
Successfully built fasttext
Installing collected packages: fasttext
Successfully installed fasttext-0.9.2


In [2]:
!pip install prettytable



In [3]:
import torch
import torch.nn as nn


In [4]:
import nltk

# Fetch the Punkt tokenizer data that nltk.word_tokenize relies on.
nltk.download("punkt")
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
''' 
Preprocess Data
'''

import re
import nltk
import string
import fasttext
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize

# Fallback label for tags that are not listed in tags.txt.
OTHER_TAG = "other"
# Label given to punctuation tokens.
PUNCT_TAG = "punct"

# Read the known tag names (one per line) into a set, dropping trailing whitespace.
with open('./utils/tags.txt', encoding="utf-8", errors='ignore') as f:
    tags = {line.rstrip() for line in f}

def remove_labels(text):
    """Return `text` with every `<tag>` / `</tag>` label removed.

    Whitespace directly following a label is removed together with it, and the
    result is stripped of leading/trailing whitespace.
    """
    label_pattern = r'\<\/?[\w-]*\>\s*'
    without_labels = re.sub(label_pattern, "", text)
    return without_labels.strip()

def tag_token(token, tag):
    """Pair `token` with its label.

    A token made up entirely of ASCII punctuation characters (for example ",",
    "..." or "``") is labelled PUNCT_TAG; any other token keeps `tag`.

    The check is done per character: `token in string.punctuation` would be a
    substring test, which treats the empty string as punctuation and misses
    multi-character punctuation tokens.

    Returns:
        A `(token, label)` tuple.
    """
    is_punctuation = bool(token) and all(char in string.punctuation for char in token)
    if is_punctuation:
        return (token, PUNCT_TAG)
    return (token, tag)

def get_tagged_tokens(groups):
    """Tokenize every labelled span and tag each resulting token.

    Args:
        groups: iterable of `(labelled_text, tag)` pairs.

    Returns:
        A flat list of `(token, tag)` tuples. Tags missing from `tags` are
        replaced by OTHER_TAG; the final label is decided by `tag_token`.
    """
    tagged_tokens = []
    for group in groups:
        labelled_text, tag = group[0], group[1]
        label = tag if tag in tags else OTHER_TAG
        for token in word_tokenize(remove_labels(labelled_text)):
            tagged_tokens.append(tag_token(token, label))
    return tagged_tokens

def attach_tags(dataset_path):
    """Attach tags to each token.

    Reads `dataset_path` line by line, extracts every `<tag>...</tag>` span of
    a line and converts the spans into `(token, tag)` pairs via
    `get_tagged_tokens`.

    Returns:
        A list with one entry per line; each entry is that line's list of
        `(token, tag)` tuples.
    """
    # The tag name may not contain angle brackets and the span content is
    # matched lazily, so a tag that occurs twice on a line yields two spans
    # instead of one span stretching from the first opening to the last closing
    # tag. Each match has the format (<tag>...</tag>, tag).
    group_pattern = re.compile(r'(\<([^<>]*)\>.*?\<\/\2\>)')

    dataset = []
    with open(dataset_path, encoding="utf-8", errors='ignore') as f:
        for ref in f:
            groups = group_pattern.findall(ref)
            dataset.append(get_tagged_tokens(groups))
    return dataset

def tokenize_dataset(dataset_path, sep=" "):
    """Removes labels and tokenizes.

    Each line of `dataset_path` has its labels removed, is tokenized with
    `word_tokenize`, and the tokens are joined back together with `sep`.

    Returns:
        A list with one joined token string per input line.
    """
    with open(dataset_path, encoding="utf-8", errors='ignore') as f:
        return [
            sep.join(word_tokenize(remove_labels(ref)))
            for ref in f.readlines()
        ]

def train_word_embedding_model(dataset_path, embedding_dim, use_subwords=False):
    """Train and save an unsupervised fastText embedding model.

    The dataset is stripped of labels, tokenized, and written to
    './dataset/word_embedding_dataset' (one reference per line, tokens
    separated by spaces), which is then used as the fastText training corpus.

    Args:
        dataset_path: path of the labelled dataset.
        embedding_dim: dimensionality of the embedding vectors.
        use_subwords: when False the model is trained with `maxn=0`;
            when True fastText's default `maxn` is used.

    Returns:
        The trained fastText model. It is also saved to
        './models/subword_embedding.bin' or './models/word_embedding.bin'.
    """
    import os  # local import: only needed to create the output directories

    embedding_dataset_path = './dataset/word_embedding_dataset'
    if use_subwords:
        model_path = './models/subword_embedding.bin'
        training_options = {}
    else:
        model_path = './models/word_embedding.bin'
        training_options = {'maxn': 0}

    # Make sure both output locations exist before writing to them.
    for output_path in (embedding_dataset_path, model_path):
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # fasttext tokenizes by whitespaces
    word_embedding_dataset = tokenize_dataset(dataset_path, sep=" ")

    # Write explicitly as UTF-8 (the encoding the dataset was read with), so
    # non-ASCII tokens are not dropped on platforms with another default.
    with open(embedding_dataset_path, 'w', encoding="utf-8", errors='ignore') as f:
        f.write("\n".join(word_embedding_dataset))

    model = fasttext.train_unsupervised(
        embedding_dataset_path, dim=embedding_dim, **training_options
    )
    model.save_model(model_path)
    return model

def map_to_index(keys, idx_start=0):
    """Number `keys` consecutively, starting at `idx_start`.

    Returns:
        `(key_to_idx, keys_arr)`: a dict from key to its index and the keys as
        a list in iteration order. A key occurring more than once keeps the
        index of its last occurrence.
    """
    keys_arr = list(keys)
    key_to_idx = {key: idx for idx, key in enumerate(keys_arr, start=idx_start)}
    return key_to_idx, keys_arr

dataset_path = './dataset/standardized_dataset.txt'
dataset = attach_tags(dataset_path)
EMBEDDING_DIM = 100
word_embedding_model = train_word_embedding_model(dataset_path, embedding_dim = EMBEDDING_DIM)

# Build the label set as a copy, so `tags` (used while tagging tokens) is not
# mutated and re-running this cell stays idempotent.
all_tags = set(tags) | {OTHER_TAG, PUNCT_TAG}
# Sort before indexing: iteration order of a set of strings can differ between
# interpreter runs, which would give the tags different indices on every run.
tag_to_idx, tag_arr = map_to_index(sorted(all_tags))
print(all_tags)

# Get inputs and outputs for model: one list of embedding vectors (X) and one
# list of tag indices (y) per reference.
X, y = [], []
for ref in dataset:
    X.append([word_embedding_model.get_word_vector(token) for token, _ in ref])
    y.append([tag_to_idx[tag] for _, tag in ref])

# Pad every reference to the length of the longest one. Labels are padded with
# len(all_tags), the index the loss is configured to ignore; embeddings are
# padded with zeros rather than with that label index.
max_length = max(map(len, X))
X = pad_sequences(X, maxlen=max_length, padding='post', truncating='pre', value=0.0, dtype='float32')
y = pad_sequences(y, maxlen=max_length, padding='post', truncating='pre', value=float(len(all_tags)), dtype='float32')


{'note', 'author', 'institution', 'journal', 'editor', 'tech', 'other', 'date', 'publisher', 'volume', 'booktitle', 'punct', 'pages', 'location', 'title'}


In [6]:
from torch.nn.utils.rnn import pack_padded_sequence
class LSTM(nn.Module):
    """Thin wrapper around a batch-first `nn.LSTM` that starts from zero states.

    Args:
        input_size: number of features per time step.
        hidden_size: number of features in the hidden state.
        output_size: stored on the instance only; not used by this module.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_layers,
            batch_first=True,
        )

    def forward(self, x):
        """Run `x` (batch first) through the LSTM.

        Returns:
            `(output, (hn, cn))` exactly as produced by `nn.LSTM`.
        """
        # Create the initial hidden and cell states from `x` so they share its
        # device and dtype; plain torch.zeros(...) would always be CPU float32.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)

        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h0, c0))

        return output, (hn, cn)


In [7]:
class Net(nn.Module):
    """Sequence tagger: an LSTM followed by a linear layer applied to every step."""

    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size

        # Recurrent encoder, then a projection from hidden features to classes.
        self.lstm = LSTM(input_size, hidden_size, output_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear layer's output for each LSTM output step of `x`."""
        lstm_output, _ = self.lstm(x)
        return self.fc(lstm_output)


In [8]:
'''
Hyperparameters
'''
SEED = 42 # Seed for torch's random number generator
num_epochs = 700
learning_rate = 0.001

input_size = EMBEDDING_DIM # Number of features
hidden_size = 25 # Number of features in the hidden state
num_layers = 1 # Number of stacked LSTM layers

output_size = len(all_tags) # Number of output classes

# Seed torch before building the model so the initial weights are the same on
# every Restart & Run All.
torch.manual_seed(SEED)
model = Net(input_size, hidden_size, output_size, num_layers)

In [9]:
# Loss Function and Optimiser
# Targets equal to len(all_tags) are excluded from the loss via ignore_index.
criterion = nn.CrossEntropyLoss(ignore_index=len(all_tags))
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [12]:
import numpy as np
from torch.nn.utils.rnn import pad_packed_sequence
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, KFold
from prettytable import PrettyTable
from torch.utils.data import TensorDataset, DataLoader, ConcatDataset

# Hold out 20% of the references (fixed random_state) and convert all four
# splits to tensors.
X_train, X_test, y_train, y_test = (
    torch.tensor(split)
    for split in train_test_split(X, y, test_size=0.2, random_state=42)
)

def categorical_accuracy(outputs, y, pad_index):
    """Accuracy of the arg-max predictions over the non-padded elements.

    Args:
        outputs: scores of shape [n, num_classes]; arg-max is taken over dim 1.
        y: 1-D tensor of n target indices.
        pad_index: target value marking padding; such elements are ignored.

    Returns:
        A float32 tensor of shape [1] holding the fraction of non-padded
        elements predicted correctly, or 0.0 when every element is padding
        (instead of the NaN a 0/0 division would give).
    """
    predictions = outputs.argmax(dim=1)
    non_padded = y != pad_index
    total = int(non_padded.sum())
    if total == 0:
        return outputs.new_zeros(1, dtype=torch.float32)
    correct = (predictions[non_padded] == y[non_padded]).sum()
    # Divide by a Python int so the result stays on the device of `outputs`.
    return (correct.to(torch.float32) / total).reshape(1)

def get_max_outputs(outputs):
    """Return, for each row of `outputs`, the index of its largest value (dim 1)."""
    return outputs.argmax(dim=1)

def print_report(report):
    """Print a table with one row per tag: precision, recall and F-beta.

    `report` is a sequence of per-tag metric arrays; its last entry (support)
    is left out of the table. Rows follow the order of `tag_arr`.
    """
    table = PrettyTable(float_format="1.5f")
    table.field_names = ["Tag", "Precision", "Recall", "FBeta"]
    for idx, tag_name in enumerate(tag_arr):
        scores = [metric[idx] for metric in report][:-1]  # exclude support metric
        table.add_row([tag_name] + scores)
    print(table)

def sum_report(report1, report2):
    """Add two reports element-wise, metric by metric.

    Returns:
        A NumPy array whose i-th row is `report1[i] + report2[i]`.
    """
    return np.array([
        np.add(np.array(metric), np.array(report2[idx]))
        for idx, metric in enumerate(report1)
    ])

def average_report(report, num):
    """Divide every metric array of `report` by `num`; returns a list of arrays."""
    return [np.true_divide(label_report, num) for label_report in report]

def train(train_dataset, concat_dataset, k_folds=10):
    """Train the global `model` and print a running per-tag report per fold.

    For each of the `k_folds` folds the model is trained for `num_epochs`
    epochs on `train_dataset` in minibatches of 256, using the global
    `criterion` and `optimizer`. After each fold, precision/recall/F-beta are
    computed on the last minibatch of the fold and averaged with the reports
    of the previous folds.

    Padded positions (target == len(all_tags)) are excluded from the metrics:
    they are not real tokens, and predictions made there would otherwise be
    counted as false positives.

    Args:
        train_dataset: dataset yielding `(inputs, targets)` pairs.
        concat_dataset: dataset handed to `KFold.split`.
        k_folds: number of folds.
    """
    pad_index = len(all_tags)
    label_ids = list(range(len(all_tags)))

    kfold = KFold(n_splits=k_folds, shuffle=True)
    summed_report = np.array([])
    # NOTE(review): the fold indices (train_ids / test_ids) are never used -
    # every fold trains on the whole of `train_dataset` and the model is not
    # reset between folds, so this is repeated training rather than
    # cross-validation. Confirm the intended protocol before relying on the
    # per-fold reports.
    for fold, (train_ids, test_ids) in enumerate(kfold.split(concat_dataset)):
        train_loader = DataLoader(train_dataset, batch_size=256)
        model.train()
        for epoch in range(num_epochs):
            for i, (X_batch, y_batch) in enumerate(train_loader, 0):
                optimizer.zero_grad()

                # Forward pass
                outputs = model(X_batch)
                outputs = outputs.view(-1, outputs.shape[-1]) # [batch_size * seq_len, output_dim]
                targets = y_batch.view(-1).long() # [batch_size * seq_len]

                # Calculate loss
                loss = criterion(outputs, targets)

                # Backpropagation
                loss.backward()

                # Update the weights
                optimizer.step()

                # Print loss at every 100th epoch
                if epoch % 100 == 0:
                    print("Epoch: %d, loss after minibatch %5d: %1.5f" % (epoch, i+1, loss.item()))

        # Score the last minibatch of this fold on real (non-padded) tokens only.
        non_padded = targets != pad_index
        predictions = get_max_outputs(outputs.detach())
        report = precision_recall_fscore_support(targets[non_padded].cpu().numpy(),
                                                 predictions[non_padded].cpu().numpy(),
                                                 average=None,
                                                 zero_division=0,
                                                 labels=label_ids)
        if len(summed_report) == 0:
            summed_report = np.array(report)
        else:
            summed_report = sum_report(summed_report, report)
        print("Report after %d fold" % (fold))
        averaged_report = average_report(summed_report, fold + 1)
        print_report(averaged_report)

def test(test_set):
    """Evaluate the global `model` on `test_set` and print a per-tag report.

    Predictions are collected over all minibatches (batch size 256) with
    gradients disabled, and precision/recall/F-beta are computed once over the
    whole set, so batches of different size are weighted correctly. Padded
    positions (target == len(all_tags)) are excluded from the metrics.

    Args:
        test_set: dataset yielding `(inputs, targets)` pairs.
    """
    pad_index = len(all_tags)
    test_loader = DataLoader(test_set, batch_size=256)

    all_targets, all_predictions = [], []
    model.eval()
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            outputs = model(X_batch)
            outputs = outputs.view(-1, outputs.shape[-1]) # [batch_size * seq_len, output_dim]
            targets = y_batch.view(-1).long() # [batch_size * seq_len]

            # Keep real tokens only.
            non_padded = targets != pad_index
            all_targets.append(targets[non_padded])
            all_predictions.append(get_max_outputs(outputs)[non_padded])

    if not all_targets:
        print("No test samples to evaluate.")
        return

    report = precision_recall_fscore_support(torch.cat(all_targets).cpu().numpy(),
                                             torch.cat(all_predictions).cpu().numpy(),
                                             average=None,
                                             zero_division=0,
                                             labels=list(range(len(all_tags))))
    print_report(report)
          

# Wrap the tensors as datasets; the concatenation is what the k-fold splitter
# inside train() iterates over.
training_set = TensorDataset(X_train, y_train)
test_set = TensorDataset(X_test, y_test)
concat_dataset = ConcatDataset([training_set, test_set])

train(training_set, concat_dataset)
# The evaluation function defined in this cell is named `test`; there is no
# `evaluate` in the notebook.
test(test_set)

Epoch: 0, loss after minibatch     1: 2.76049
Epoch: 0, loss after minibatch     2: 2.74658
Epoch: 0, loss after minibatch     3: 2.73501
Epoch: 0, loss after minibatch     4: 2.72767
Epoch: 0, loss after minibatch     5: 2.71433
Epoch: 0, loss after minibatch     6: 2.70305
Epoch: 0, loss after minibatch     7: 2.69163
Epoch: 0, loss after minibatch     8: 2.68375
Epoch: 0, loss after minibatch     9: 2.67053
Epoch: 0, loss after minibatch    10: 2.66301


KeyboardInterrupt: ignored