In [None]:
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import torchtext
from torchtext.data import Field, BucketIterator
from torchcrf import CRF

from tqdm import tqdm
import spacy
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Load the `en_core_web_sm` spaCy pipeline.
# NOTE(review): `spacy_en` is not referenced by any other cell visible here
# (tokenisation below uses plain string splitting) — confirm it is still needed.
spacy_en = spacy.load("en_core_web_sm")

In [None]:
def tokenizer_en(text):
    """Tokenize ``text`` by cutting it at every single space character.

    Consecutive spaces therefore yield empty-string tokens, exactly as
    ``str.split(" ")`` does.
    """
    separator = " "
    tokens = text.split(separator)
    return tokens

In [None]:
def get_text_labels(sequence_tags):
    """Collapse one sentence's token rows into space-joined text and tag strings.

    Each row is an indexable record laid out as
    TOKEN TXT_SOURCE_FILE START_CHAR END_CHAR TAG TAG_ID ROOT_ID RELATION.
    Only column 0 (token) and column 4 (tag) are read; both are stripped of
    surrounding whitespace before joining.

    Returns:
        dict with keys "text" and "labels", each a single space-separated string.
    """
    tokens, tags = [], []
    for row in sequence_tags:
        tokens.append(row[0].strip())
        tags.append(row[4].strip())
    return {"text": " ".join(tokens), "labels": " ".join(tags)}

def parse_deft(deft_file):
    """Parse one DEFT-format file into a list of {"text", "labels"} records.

    The file is read whole and cut into sentence blocks at blank lines
    ("\\n\\n"). Inside a block, a line is kept only if it has exactly eight
    tab-separated columns and its tag column (index 4), once stripped, starts
    with "B", "I" or "O". The kept rows of a block are handed to
    ``get_text_labels``.

    Rows whose tag column is blank are skipped instead of raising IndexError,
    and blocks without any valid row are dropped so the result never contains
    empty ``{"text": "", "labels": ""}`` examples.

    Args:
        deft_file: path of the file to read.

    Returns:
        list of dicts as produced by ``get_text_labels``, one per non-empty
        sentence block, in file order.
    """
    with open(deft_file, 'r') as deft:
        all_text = deft.read()
    all_sequences = []
    for block in all_text.split("\n\n"):
        sents = []
        for token_data in block.split("\n"):
            # Split each line once and reuse the columns.
            columns = token_data.split("\t")
            if len(columns) != 8:
                continue
            tag = columns[4].strip()
            # tag[:1] is "" for a blank tag, so the membership test is simply False.
            if tag[:1] in ("B", "I", "O"):
                sents.append(columns)
        if sents:
            all_sequences.append(get_text_labels(sents))
    return all_sequences
# parse_deft(train_deft_file)


In [None]:
import os
# Directory holding the DEFT training files (relative to the notebook).
train_deft_file = "../deft_corpus/data/deft_files/train/"
all_data = []
# os.listdir returns entries in arbitrary order; sorting makes the order of
# `all_data` the same on every run and every machine.
for file_name in sorted(os.listdir(train_deft_file)):
    file_path = os.path.join(train_deft_file, file_name)
    # Skip anything that is not a regular file (e.g. a checkpoint sub-directory),
    # which parse_deft could not open.
    if not os.path.isfile(file_path):
        continue
    all_data.extend(parse_deft(file_path))
print(len(all_data))

In [None]:
# Qualifier
# for files in os.listdir(train_deft_file):
#     with open(os.path.join(train_deft_file, files), 'r') as deft:
#         all_text = deft.read()
#     for lines in all_text.split("\n\n"):
#         for token_data in lines.split("\n"):
#             if len(token_data.split("\t"))==8:
#                 if "Alias-Term" in token_data.split("\t")[4]:
#                     print(token_data)

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed so the 90/10 split (and the CSVs and metrics derived from it)
# is the same after every kernel restart.
SPLIT_SEED = 42
train, test = train_test_split(all_data, test_size=0.1, random_state=SPLIT_SEED)
len(train), len(test)

In [None]:
# Display one parsed training record (a dict with "text" and "labels" strings)
# as a quick sanity check of the parsing and split.
train[0]

In [None]:
import pandas as pd
# Persist both splits as CSV so they can be read back through torchtext.
for split_records, csv_name in ((train, "train.csv"), (test, "val.csv")):
    split_frame = pd.DataFrame.from_records(split_records)
    split_frame.to_csv(csv_name, index=False)

In [None]:
# Options common to the token field and the tag field: both are sequences and
# both get "<sos>"/"<eos>" markers so tokens and tags stay aligned position by
# position. batch_first is left at its default (not enabled).
_shared_field_options = {
    "sequential": True,
    "init_token": "<sos>",
    "eos_token": "<eos>",
}

# Input tokens; tokenize=None is passed explicitly.
TEXT = Field(tokenize=None, **_shared_field_options)

# Gold tags, flagged as the target field.
LABELS = Field(is_target=True, **_shared_field_options)

In [None]:
from torchtext.data import TabularDataset

# (column name, Field) pairs, listed in the same order as the CSV columns
# written earlier: "text" first, then "labels".
tv_datafields = [("text", TEXT), ("labels", LABELS)]

# path="" means both CSV files are looked up relative to the working directory;
# skip_header drops the column-name row.
trn, vld = TabularDataset.splits(
    path="",
    train="train.csv",
    validation="val.csv",
    format="csv",
    skip_header=True,
    fields=tv_datafields,
)

In [None]:
# Both vocabularies are built over the training and the validation split.
_vocab_sources = (trn, vld)
# Token vocabulary, with the "glove.6B.300d" pretrained vectors attached.
TEXT.build_vocab(*_vocab_sources, vectors="glove.6B.300d")
# Tag vocabulary (no vectors).
LABELS.build_vocab(*_vocab_sources)

len(TEXT.vocab), len(LABELS.vocab)

In [None]:
# Dump the attributes of the label vocabulary for inspection.
vars(LABELS.vocab)

In [None]:
def _text_length(example):
    """Sort key for bucketing: number of tokens in the example's text."""
    return len(example.text)


# One iterator per split, 256 examples per batch, tensors created on `device`.
training_iterator, val_iter = BucketIterator.splits(
    (trn, vld),
    batch_sizes=(256, 256),
    device=device,
    sort_key=_text_length,
    sort_within_batch=False,
)
len(training_iterator), len(val_iter)

In [None]:
class BiLstm_Crf(nn.Module):
    """LSTM encoder followed by a linear emission layer and a CRF.

    Args:
        embedding_vector: pretrained embedding matrix; wrapped with
            ``nn.Embedding.from_pretrained`` (default settings).
        vocab_size: stored on the instance; not used to build any layer.
        embedding_dim: width of one embedding row; used as the LSTM input size.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of tags (emission width and CRF size).
        num_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.

    Attributes:
        inference: when ``False`` (the default) ``forward`` returns only the
            loss; otherwise it returns ``(decoded_tags, loss)``.
    """

    def __init__(self, embedding_vector, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional):
        super(BiLstm_Crf, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(embedding_vector))
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.num_layers,
            bidirectional=self.bidirectional,
            dropout=0.5,
        )

        self.dropout_layer = nn.Dropout(0.5)
        # A bidirectional LSTM concatenates both directions, so its output is
        # twice as wide; size the projection accordingly.
        lstm_output_dim = self.hidden_dim * (2 if self.bidirectional else 1)
        self.linear = nn.Linear(lstm_output_dim, self.output_dim)

        self.crf_layer = CRF(self.output_dim)
        self.inference = False

    def forward(self, inp, labels):
        """Compute the CRF loss and, in inference mode, the decoded tags.

        Args:
            inp: token ids, [seq_len, batch_size].
            labels: gold tag ids, [seq_len, batch_size].

        Returns:
            The loss alone when ``self.inference is False``; otherwise a tuple
            ``(decoded, loss)`` where ``decoded`` is a long tensor of shape
            [seq_len, batch_size] on the same device as the emissions.
        """
        embedded = self.dropout_layer(self.embedding(inp))
        # embedded = [seq_len, batch_size, embedding_dim]

        outputs, _ = self.lstm(embedded)
        # outputs = [seq_len, batch_size, num_directions * hidden_dim]

        emissions = self.linear(outputs)
        # emissions = [seq_len, batch_size, output_dim]

        # The CRF score is negated so it can be minimised as a loss.
        # NOTE(review): no mask is passed, so every position of the batch
        # contributes to the loss and to decoding — confirm this is intended
        # for padded batches.
        loss = -self.crf_layer(emissions, labels)

        if self.inference is False:
            return loss

        decoded = self.crf_layer.decode(emissions)
        # decode() yields one tag list per batch item; build the tensor on the
        # emissions' device (no dependency on a notebook-level global) and
        # transpose back to [seq_len, batch_size].
        decoded = torch.tensor(decoded, dtype=torch.long, device=emissions.device).permute(1, 0)
        return decoded, loss

In [None]:
def train(model, iterator, optimizer, device=None):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    The model is put in training mode with ``model.inference = False`` so that
    calling it yields the loss only. For every batch the gradients are reset,
    the loss from ``model(batch.text, batch.labels)`` is back-propagated and
    the optimizer takes one step.

    Args:
        model: module returning a scalar loss tensor when called with
            (inputs, targets).
        iterator: sized iterable of batches exposing ``.text`` and ``.labels``.
        optimizer: optimizer over the model's parameters.
        device: accepted but not used by this function.

    Returns:
        Sum of the per-batch loss values divided by ``len(iterator)``.
    """
    model.train()
    model.inference = False

    batch_losses = []
    for batch in tqdm(iterator):
        optimizer.zero_grad()

        # With inference disabled the model returns the CRF loss directly.
        batch_loss = model(batch.text, batch.labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses, 0.0) / len(iterator)

In [None]:
def evaluate(model, iterator, device=None):
    """Score ``model`` on ``iterator``: mean batch loss and macro-averaged F1.

    The model is switched to eval mode with ``model.inference = True`` so that
    each call returns ``(decoded_tags, loss)``. The whole pass runs under
    ``torch.no_grad()`` because nothing is back-propagated here; without it a
    computation graph would be built for every batch and then discarded.

    Args:
        model: module returning ``(predictions, loss)`` when called with
            (inputs, targets) in inference mode.
        iterator: sized iterable of batches exposing ``.text`` and ``.labels``.
        device: accepted but not used by this function.

    Returns:
        Tuple ``(mean_loss, f1)`` where ``mean_loss`` is the summed batch loss
        divided by ``len(iterator)`` and ``f1`` is the macro F1 over all
        flattened positions.
    """
    model.eval()
    model.inference = True

    epoch_loss = 0.0
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in tqdm(iterator):
            inp = batch.text
            target = batch.labels

            out, loss = model(inp, target)
            # out = [seq_len, batch_size]

            # NOTE(review): every position is flattened into the score, so any
            # padding / special-token positions in the batch count towards F1.
            predictions.extend(out.contiguous().view(-1).cpu().tolist())
            true_labels.extend(target.contiguous().view(-1).cpu().tolist())

            epoch_loss += loss.item()

    f1 = f1_score(true_labels, predictions, average="macro")

    return epoch_loss / len(iterator), f1

def number_of_parameters(model):
    """Count the scalar entries of every parameter of ``model`` that requires gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
# Build the tagger: the pretrained vectors attached to the TEXT vocabulary feed
# the embedding layer, and one output class is created per LABELS vocabulary entry.
model = BiLstm_Crf(
    embedding_vector=TEXT.vocab.vectors,
    vocab_size=len(TEXT.vocab),
    embedding_dim=300,
    hidden_dim=512,
    output_dim=len(LABELS.vocab),
    num_layers=2,
    bidirectional=False,
)

In [None]:
# model.embedding.weight.data.copy_(TEXT.vocab.vectors)

In [None]:
# Show how many parameters of the model require gradients.
number_of_parameters(model)

In [None]:
# Move the model to the target device first, then hand its parameters to the
# optimizer, so the optimizer is guaranteed to track the on-device parameters.
model = model.to(device)
optimizer = optim.Adam(model.parameters())


In [None]:
N_EPOCHS = 50
# Lowest validation loss seen so far; starts high so the first epoch always wins.
VAL_LOSS = 1e10
model.inference = False

for epoch in range(N_EPOCHS):
    train_loss = train(model, training_iterator, optimizer)
    val_loss, val_f1 = evaluate(model, val_iter)

    # Keep a checkpoint only when the validation loss strictly improves.
    if val_loss < VAL_LOSS:
        VAL_LOSS = val_loss
        torch.save(model.state_dict(), 'bilstm-ner-crf-model.pt')

    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {val_loss:.3f}')
    print(f'Val. F1 Score is : {val_f1:.2f}')