In [None]:
!pip install nltk
!pip install torch
!pip install fasttext
!pip install sklearn-hierarchical-classification
!pip install spacy

In [None]:
!pip install prettytable

In [None]:
!pip install transformers

In [1]:
import torch
import torch.nn as nn

In [2]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yiyin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
''' 
Preprocess Data
'''

import re
import nltk
import string
import fasttext
from keras.preprocessing.sequence import pad_sequences
from nltk import word_tokenize

OTHER_TAG = "other"
PUNCT_TAG = "punct"

with open('./utils/tags.txt', encoding="utf-8", errors='ignore') as f:
    tags = set([str.rstrip(tag) for tag in f.readlines()])

with open('./utils/tags_hierarchy.txt', encoding="utf-8", errors='ignore') as f:
    hierarchy_tags = set([str.rstrip(tag) for tag in f.readlines()])

def remove_labels(text):
    return re.sub(r'\<\/?[\w-]*\>\s*', "", text).strip()

def tag_token(token, tag):
    if token in string.punctuation:
        return (token, PUNCT_TAG)
    return (token, tag)

def get_tagged_tokens(groups):
    tagged_tokens = []
    for group in groups:
        ref, tag = group[0], group[1]
        if tag not in tags:
            tag = OTHER_TAG
        unlabelled = remove_labels(ref)
        tokens = word_tokenize(unlabelled)
        tagged_tokens.extend(list(map(lambda token: tag_token(token, tag), tokens)))
    return tagged_tokens

''' Attach tags to each token '''
def attach_tags(dataset_path):
    dataset = []
    with open(dataset_path, encoding="utf-8", errors='ignore') as f:
        refs = f.readlines()
        for ref in refs:
            groups = re.findall(r'(\<(.*)\>.*\<\/\2\>)', ref) # format (<tag>...</tag>, tag)
            tagged_tokens = get_tagged_tokens(groups)
            dataset.append(tagged_tokens)
    return dataset

''' Removes labels and tokenizes '''
def tokenize_dataset(dataset_path, sep=" "):
    dataset = []
    with open(dataset_path, encoding="utf-8", errors='ignore') as f:
        refs = f.readlines()
        for ref in refs:
            ref = remove_labels(ref) 
            tokenized = sep.join(word_tokenize(ref))
            dataset.append(tokenized)
    return dataset

def map_to_index(keys, idx_start=0):
    key_to_idx, keys_arr, idx = {}, [], idx_start
    for key in keys:
        key_to_idx[key] = idx
        keys_arr.append(key)
        idx += 1
    return key_to_idx, keys_arr

all_tags = tags 
all_tags.add(OTHER_TAG)
all_tags.add(PUNCT_TAG)
tag_to_idx, tag_arr = map_to_index(all_tags)

dataset_path = './dataset/standardized_dataset.txt'
dataset = attach_tags(dataset_path)

In [4]:

EMBEDDING_DIM = 100

def train_word_embedding_model(dataset_path, embedding_dim, use_subwords=False, use_hierarchy=False):
    embedding_dataset_path = './dataset/word_embedding_dataset.txt'
    hierarchy_dataset_path = './dataset/umass-citation/training'

    with open(embedding_dataset_path, 'w', errors='ignore') as f:
        # fasttext tokenizes by whitespaces
        word_embedding_dataset = tokenize_dataset(dataset_path, sep=" ") 
        f.write("\n".join(word_embedding_dataset))
    if use_subwords:
      model_path = './models/subword_embedding.bin'
      model = fasttext.train_unsupervised(embedding_dataset_path, dim = embedding_dim, minn = 3, maxn = 6) 
    elif use_hierarchy:
        model_path = './models/hierarchy_word_embedding.bin'
        model = fasttext.train_unsupervised(embedding_dataset_path, dim=embedding_dim, maxn=0)
    else:
      model_path = './models/word_embedding.bin'
      model = fasttext.train_unsupervised(embedding_dataset_path, dim=embedding_dim, maxn=0)
    model.save_model(model_path)
    return model

''' 1. Word Embeddings: Without Pretrained Word Embeddings '''
# WE_model = train_word_embedding_model(dataset_path, embedding_dim = EMBEDDING_DIM)

''' 2. Subword Embeddings: Without Pretrained Subword Embeddings '''
SWE_model = train_word_embedding_model(dataset_path, embedding_dim = EMBEDDING_DIM, use_subwords = True)

curr_WE_model = SWE_model # Change accordingly

def get_word_vector(token):
    return curr_WE_model.get_word_vector(token)

In [5]:
''' Named Entity Recognition'''
import spacy
import numpy as np
from collections import defaultdict

spacy_nlp = spacy.load("en_core_web_sm")
ner_dict = {
        'ORG': 0,
        "NORP": 1,
        "GPE": 2,
        "PERSON": 3,
        "LANGUAGE": 4,
        "DATE": 5,
        "TIME": 6,
        "PRODUCT": 7,
        "EVENT": 8,
        "ORDINAL": 9
}

# text should be tokenized and joined together by whitespaces
def generate_ner_features(text):
    doc = spacy_nlp(text)
    entities = doc.ents
    default_feature = [0 for _ in range(len(ner_dict.keys()) + 1)]
    default_feature[-1] = 1 
    entity_to_label = defaultdict(lambda: default_feature)
    for entity in entities:
        entity_tokens = entity.text_.split(" ")
        label = entity.label_
        if label in ner_dict:
            features = [0 for _ in range(len(ner_dict.keys()) + 1)]
            features[ner_dict[label]] = 1
            for token in entity_tokens:
                entity_to_label[token] = features
    return entity_to_label


In [6]:
from transformers import BertTokenizer, BertModel

model_version = 'allenai/scibert_scivocab_cased'
do_lower_case = False
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)

def get_tokens_and_segments_tensors(text):
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    segment_ids = [1] * len(indexed_tokens)
    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensor = torch.tensor([segment_ids])
    return tokens_tensor, segments_tensor

def average(embeddings):
    if len(embeddings.size()) == 1: # if only one embedding, just return
        return embeddings
    averaged_embedding = np.array([0 for _ in range(len(embeddings[0]))])
    for embedding in embeddings:
        averaged_embedding = np.add(averaged_embedding, embedding)
    return np.true_divide(averaged_embedding, len(embeddings))

from transformers import BertForPreTraining, BertConfig 

config = BertConfig.from_json_file('./models/bert/fine_tuned_bert/config.json')
bert_model = BertModel(config)
bert_model.eval()

def get_scibert_vector(text):
    tokens_tensor, segments_tensor = get_tokens_and_segments_tensors(text)
    outputs = bert_model(tokens_tensor, segments_tensor)
    hidden_states = outputs.last_hidden_state
    token_embeddings = torch.squeeze(hidden_states[-1], dim=0)
    averaged_embeddings = average(token_embeddings.detach())
    return averaged_embeddings.tolist()


In [7]:
''' Get inputs and outputs for model '''
X, y = [], []
for ref in dataset:
    X_ref, y_ref = [], []
    # joined_ref = " ".join(list(map(lambda x: x[0], X_ref))) # concatenate tokens using whitespace
    # ner_features = generate_ner_features(joined_ref)
    for token, tag in ref:
        features = get_word_vector(token)
        # features = np.hstack([features, np.array(ner_features[token])])
        X_ref.append(features)
        y_ref.append(tag_to_idx[tag])
    X.append(X_ref)
    y.append(y_ref)

max_length = max(map(lambda ref: len(ref), X))
X = pad_sequences(X, maxlen=max_length, padding='post', truncating='pre', value=float(len(all_tags)), dtype='float32')
y = pad_sequences(y, maxlen=max_length, padding='post', truncating='pre', value=float(len(all_tags)), dtype='float32')

In [None]:
# ##create another model trained with the hierarchy dataset
# dataset_hierarch_path = "./dataset/hierarchy_dataset.txt"
# dataset_hierarch = attach_tags(dataset_hierarch_path)
# word_embedding_model = train_word_embedding_model(dataset_path, embedding_dim = EMBEDDING_DIM, use_subwords=False, use_hierarchy=True)
# all_tags = hierarchy_tags 
# all_tags.add(OTHER_TAG)
# all_tags.add(PUNCT_TAG)
# tag_to_idx, tag_arr = map_to_index(hierarchy_tags)

# ''' Get inputs and outputs for model '''
# X_hierarchy, y_hierarchy = [], []
# for ref in dataset:
#     X_ref, y_ref = [], []
#     for token, tag in ref:
#         X_ref.append(word_embedding_model.get_word_vector(token))
#         y_ref.append(tag_to_idx[tag])
#     X_hierarchy.append(X_ref)
#     y_hierarchy.append(y_ref)

# max_length = max(map(lambda ref: len(ref), X_hierarchy))
# X_hierarchy = pad_sequences(X_hierarchy, maxlen=max_length, padding='post', truncating='pre', value=float(len(all_tags)), dtype='float32')
# y_hierarchy = pad_sequences(y_hierarchy, maxlen=max_length, padding='post', truncating='pre', value=float(len(all_tags)), dtype='float32')

In [8]:
from torch.nn.utils.rnn import pack_padded_sequence
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
    
        self.lstm = nn.LSTM(
            input_size = self.input_size,
            hidden_size = self.hidden_size,
            num_layers = self.num_layers,
            batch_first = True
        )

    def forward(self, x):
        # Initialise hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)

        # Initialise internal state
        c0 = torch.zeros(self.num_layers,x.size(0), self.hidden_size)

        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h0, c0))

        return output, (hn, cn)


In [9]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        
        self.lstm = LSTM(input_size, hidden_size, output_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        output, (hn, cn) = self.lstm(x)
        output = self.fc(output)

        return output


In [10]:
'''
Hyperparameters
'''
num_epochs = 700
learning_rate = 0.001

we_feature_size = EMBEDDING_DIM
ner_feature_size = len(ner_dict.keys()) + 1
scibert_feature_size = 768

input_size = we_feature_size # Number of features (change accordingly)
hidden_size = 25 # Number of features in the hidden state
num_layers = 1 # Number of stacked LSTM layers

output_size = len(all_tags) # Number of output classes

model = Net(input_size, hidden_size, output_size, num_layers)

In [11]:
'''
Loss Function and Optimiser
'''
criterion = nn.CrossEntropyLoss(ignore_index=len(all_tags))
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [19]:
import numpy as np
from torch.nn.utils.rnn import pad_packed_sequence
from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled
from sklearn.metrics import f1_score, precision_recall_fscore_support
from sklearn.metrics import plot_confusion_matrix, confusion_matrix as cm
from sklearn.model_selection import train_test_split, KFold
from prettytable import PrettyTable
from torch.utils.data import TensorDataset, DataLoader, ConcatDataset


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = torch.tensor(X_train), torch.tensor(X_test)
y_train, y_test = torch.tensor(y_train), torch.tensor(y_test)

# X_train_hierarchy, X_test_hierarchy, y_train_hierarchy, y_test_hierarchy = train_test_split(X_hierarchy, y_hierarchy, test_size = 0.2, random_state=42)
##X_train_hierarchy, X_test_hierarchy = torch.tensor(X_train_hierarchy), torch.tensor(X_test_hierarchy)
##y_train_hierarchy, y_test_hierarchy = torch.tensor(y_train_hierarchy), torch.tensor(y_test_hierarchy)

def categorical_accuracy(outputs, y, pad_index):
    max_outputs = outputs.argmax(dim = 1, keepdim=True)
    non_padded_elements = (y != pad_index).nonzero()
    correct = max_outputs[non_padded_elements].squeeze(1).eq(y[non_padded_elements])
    return correct.sum() / torch.FloatTensor([y[non_padded_elements].shape[0]])

def get_max_outputs(outputs):
    max_outputs = outputs.argmax(dim = 1)
    return max_outputs

def print_report(report):
    table = PrettyTable(float_format="1.5f")
    table.field_names = ["Tag", "Precision", "Recall", "FBeta"]
    for i in range(len(tag_arr)):
      tag, scores = [tag_arr[i]], list(map(lambda metric: metric[i], report))[:-1] # exclude support metric
      tag.extend(scores)
      table.add_row(tag)
    print(table)

def sum_report(report1, report2):
    summed_report = []
    for i in range(len(report1)):
        label_report1, label_report2 = np.array(report1[i]), np.array(report2[i])
        summed_report.append(np.add(label_report1, label_report2))
    return np.array(summed_report)

def average_report(report, num):
    return list(map(lambda label_report: np.true_divide(label_report, num), report))

def print_statistics(X_test, y_test, y_pred, model):
    macro_score = f1_score(y_test, y_pred, average='macro')
    micro_score = f1_score(y_test, y_pred, average='micro')
    cMtx = cm(y_test, y_pred)
    normalized_cMtx = []
    for row in cMtx:
        total = sum(row)
        if total != 0:
            row = list(map(lambda value: "{:.5f}".format(value / total), row))
        normalized_cMtx.append(row)
    print('Test F1 macro score: {}'.format(macro_score))
    print('Test F1 micro score: {}'.format(micro_score))
    print(normalized_cMtx)


# model_filename is used to save the model
def train(train_dataset, concat_dataset, k_folds=10, model_filename=None):
    kfold = KFold(n_splits=k_folds, shuffle=True)
    summed_report = np.array([])
    for fold, (train_ids, test_ids) in enumerate(kfold.split(concat_dataset)):
        train_loader = DataLoader(train_dataset, batch_size=256)
        for epoch in range(num_epochs):
            for i, data in enumerate(train_loader, 0):
                X_train, y_train = data
                outputs = model.forward(X_train)

                optimizer.zero_grad()

                outputs = outputs.view(-1, outputs.shape[-1]) # [batch_size * seq_len, output_dim]
                y_train = y_train.view(-1) # [batch_size * seq_len]
              
                # Get the loss function
                loss = criterion(outputs, y_train.long())

                # Calculate loss
                loss.backward()

                # Backpropagation
                optimizer.step()

                # Print loss at every 100th epoch
                if epoch % 100 == 0:
                    print("Epoch: %d, loss after minibatch %5d: %1.5f" % (epoch, i+1, loss.item()))
            
        report = precision_recall_fscore_support(y_train.long(), \
                                                 get_max_outputs(outputs.detach()), \
                                                 average=None, \
                                                 zero_division=0, \
                                                 labels = [i for i in range(len(all_tags))])
        if len(summed_report) == 0:
            summed_report = np.array(report)
        else:
            summed_report = sum_report(summed_report, report)
        print("Report after %d fold" % (fold))
        averaged_report = average_report(summed_report, fold + 1)
        print_report(averaged_report)  

def test(X_test, y_test):
    with torch.no_grad():
        outputs = model.forward(X_test)

        outputs = outputs.view(-1, outputs.shape[-1]) # [batch_size * seq_len, output_dim]
        y_test = y_test.view(-1) # [batch_size * seq_len]

        # Get the loss function
        loss = criterion(outputs, y_test.long())
        y_pred = get_max_outputs(outputs.detach())
        report = precision_recall_fscore_support(y_test.long(), \
                                                y_pred, \
                                                average=None, \
                                                zero_division=0, \
                                                labels = [i for i in range(len(all_tags))])
        print_report(report)  
        print_statistics(X_test, y_test, y_pred, model)

training_set = TensorDataset(X_train, y_train)
test_set = TensorDataset(X_test, y_test)
concat_dataset = ConcatDataset([training_set, test_set])
train(training_set, concat_dataset)

model_filename = "model.pth" # change accordingly based on features being run
torch.save(model.state_dict(), "./models/" + str(model_filename)) # save trained model 

test(X_test, y_test)

s after minibatch     3: 0.74094
Epoch: 0, loss after minibatch     4: 0.77650
Epoch: 0, loss after minibatch     5: 0.73343
Epoch: 0, loss after minibatch     6: 0.70342
Epoch: 0, loss after minibatch     7: 0.74060
Epoch: 0, loss after minibatch     8: 0.71688
Epoch: 0, loss after minibatch     9: 0.70487
Epoch: 0, loss after minibatch    10: 0.82609
Report after 3 fold
+-------------+---------------------+--------------------+--------------------+
|     Tag     |      Precision      |       Recall       |       FBeta        |
+-------------+---------------------+--------------------+--------------------+
|    other    | 0.47528511404182106 | 0.6289752650176679 | 0.5402306995067826 |
|    volume   |         0.0         |        0.0         |        0.0         |
|     tech    |         0.0         |        0.0         |        0.0         |
|     note    |         0.0         |        0.0         |        0.0         |
|    punct    |  0.5898522846205422 | 0.9007701421800949 | 0.7128

In [None]:
# training_set_hierarchy = TensorDataset(X_train_hierarchy, y_train_hierarchy)
# test_set_hierarchy = TensorDataset(X_test_hierarchy, y_test_hierarchy)
# concat_dataset_hierarchy = ConcatDataset([training_set_hierarchy, test_set_hierarchy])
# train(training_set_hierarchy, concat_dataset_hierarchy)
# evaluate(test_set_hierarchy)