In [116]:
import pandas as pd
import numpy as np
import nltk, re, json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as t
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.datasets as transforms

from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [138]:
############# TEST
# variable length sorted
a = [torch.randn(113,100), torch.randn(3,100), torch.randn(2,100), torch.randn(1,100)]
seq = [113,3,2,1]
b = pad_sequence(a, batch_first=True, padding_value=0) # b.shape (4, 113, 100)
print(b.shape)
c = pack_padded_sequence(b, seq, batch_first=True) # c.data.shape (11, 12)
c


type(a[0])

torch.Size([4, 113, 100])


torch.Tensor

In [None]:
lstm_layer = nn.LSTM(input_size = 12, hidden_size = 16, batch_first = True, bidirectional = False, num_layers = 1)
b_result, hidden = lstm_layer(b)
print(b_result.shape) # b_result.shape (4, 5, 16)
c_result, hidden = lstm_layer(c)
print(c_result.data.shape) # c_result.data.shape (11, 16)


lstm_layer = nn.LSTM(input_size = 12, hidden_size = 16, batch_first = True, bidirectional = True, num_layers = 1)

b_result, hidden = lstm_layer(b)
print(b_result.shape) # b_result.shape (4, 5, 32)

c_result, hidden = lstm_layer(c)
print(c_result.data.shape) # c_result.data.shape (11, 32)

In [72]:
# store train sentences 
train_file = 'data/train'
dev_file = 'data/dev'
test_file = 'data/test'
dummy_file ='data/dummy'

In [73]:
# read train/test file, each line as {s_idx, word, tag} tuple, store in a list
def readFile(file):
    f = open(file)
    lines = f.readlines()
    words = []
    for line in lines:
        if line.strip():
            words.append(line.strip().split(' '))
    return words

In [74]:
# DF: index - s_idx - word - tag
train_lines = readFile(train_file)
df = pd.DataFrame(train_lines, columns = ["s_idx", "word", "tag"])
df.head(5)

Unnamed: 0,s_idx,word,tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O


In [75]:
# Randomly select some rare words to be <unk> words
unique_words = df["word"].value_counts().reset_index()
unique_words.columns = ["word", "freq"]
threshold = 3
# words with freq > threshold
vocab_words = unique_words[ unique_words['freq'] > threshold ]
# words with freq <= threshold
rare_words = unique_words[ unique_words['freq'] <= threshold ]

print("vocab words:", vocab_words.shape[0])
print("rare words:", rare_words.shape[0])

vocab words: 6182
rare words: 17442


In [76]:
# Randomly select 3000 words from rare words to set as unknown words
# unk_count = len(rare_words)
# unk_words = rare_words.sample(unk_count)

# drop the selected rare words from vocab
# rare_words = rare_words.drop(unk_words.index)

# build new vocab = freq_words + rest_rare_words + <unk>
# vocab_words = vocab_words.append(rare_words, ignore_index=True)

# custom words unk, pad etc
# custom_vocab = ['<unk>']
custom_vocab = ['<unk>', '<pad>']

# main vocab list, to generate embedding
vocab_set = set(custom_vocab + vocab_words['word'].unique().tolist())
vocab_size = len(vocab_set)

In [161]:
vocab_size

6184

In [78]:
# all the vocab
word_to_idx = {word:i for i, word in enumerate(vocab_set)}

# all the unique tags
unique_tags = set(df["tag"].unique())
tag_to_idx = {tag:i for i, tag in enumerate(unique_tags)}
idx_to_tag = {i:tag for i, tag in enumerate(unique_tags)}

In [9]:
# read files, group words by sentence, return list of sentences
def readData(file):
    f = open(file)
    lines = f.readlines()
    sentences = []
    sentence = []
    for line in lines:
        if not line.strip():
            sentences.append(sentence.copy())
            sentence.clear()
        else:
            sentence.append(line.strip().split(' '))
    # append the last sentence
    sentences.append(sentence.copy())
    return sentences

In [183]:
# word = [idx, word, tag]  train_data = list of sentences in term of list of words
train_data = readData(train_file)

dev_data = readData(dev_file)
# word = [idx, word]
test_data = readData(test_file)

# Dummy test data
dummy_file ='data/dummy'
dummy_data = readData(dummy_file)

In [144]:
# Preapare training data
def processData(tuples):
    training_data = []
    for t in tuples:
        training_data.append( ( [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ], [ word[2] for word in t ] ) )
    return training_data

# Convert sequence into tensor
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# prepare padded data, return inputs and labels
# def processPaddedData(tuples, max_seq_len):
#     inputs = []
#     labels = []
#     PAD = '<pad>'
#     for t in tuples:
#         seq = [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ]
#         # pad seq
#         if len(seq) < max_seq_len:
#             seq += [ PAD for _ in range(max_seq_len-len(seq)) ]
#         inputs.append(seq)
#         labels.append( [ word[2] for word in t] )
        
#     return inputs, labels

def processPaddedData(tuples, max_seq_len):
    inputs = []
    labels = []
    PAD = '<pad>'
    for t in tuples:
        seq = [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ]
        inputs.append(seq)
        labels.append( [ word[2] for word in t] )
        
    return inputs, labels

def seq2idx(inputs, to_ix):
    return [ torch.tensor([to_ix[w] for w in seq]) for seq in inputs ]

In [50]:
training_data = processData(train_data)
max_seq_len = max([ len(sent) for sent, _ in training_data])
max_seq_len

113

In [184]:
inputs, labels = processPaddedData(dummy_data, max_seq_len)

In [187]:
unpadded_inputs = seq2idx(inputs, word_to_idx)
print(unpadded_inputs)
unsort_lengths = t.tensor([len(s) for s in unpadded_inputs])
print(unsort_lengths)

[tensor([981, 334, 174]), tensor([5778]), tensor([3361, 3361])]
tensor([3, 1, 2])


In [189]:
padding_idx = word_to_idx['<pad>']
print("padding_idx", padding_idx)

embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

unsort_padded_inputs = pad_sequence(unpadded_inputs, batch_first=True, padding_value=padding_idx)
unsort_padded_inputs = embedding(unsort_padded_inputs)
unsort_padded_inputs.shape

padding_idx 3626


torch.Size([3, 3, 100])

In [191]:
sorted_lengths, sorted_len_idx = unsort_lengths.sort(0, descending=True)
print(sorted_lengths, sorted_len_idx)
_, unsorted_idx = t.sort(sorted_lengths, dim=0)
print("unsort_idx", unsorted_idx)

a = unsort_padded_inputs[idx] 

tensor([3, 2, 1]) tensor([0, 2, 1])
unsort_idx tensor([2, 1, 0])


In [192]:
a

tensor([[[-2.1548e-01,  4.6567e-01, -5.5241e-01,  1.5951e+00,  1.5188e+00,
           6.3830e-01, -6.1782e-01,  6.7159e-01,  1.3609e+00,  1.4467e-02,
          -4.4473e-01,  8.5518e-02,  1.9484e-01, -2.4091e+00,  9.4244e-01,
           9.6899e-01, -8.1337e-01,  4.4211e-01, -6.1085e-01, -1.1370e+00,
           1.3328e+00, -4.6324e-01, -5.0769e-01, -3.4775e-01, -7.6382e-01,
          -5.7312e-01,  1.8974e+00,  4.4645e-01,  1.1496e+00, -2.1394e-01,
          -9.2324e-01, -9.3872e-01, -1.2094e+00, -8.4744e-01,  2.0133e+00,
          -9.8805e-01, -6.7415e-02, -5.6366e-01,  4.5789e-01, -1.5816e-01,
           1.2547e+00, -2.1328e+00, -3.5176e-01,  6.9318e-01,  3.4555e-01,
           8.3684e-01, -1.1757e-01,  3.4243e-01,  8.8368e-01,  5.8781e-01,
           1.7209e-05, -1.2327e+00, -2.0951e-01,  5.3319e-01, -9.1351e-01,
           8.3567e-01,  9.0224e-01, -2.2854e-01, -6.6803e-01,  6.3380e-01,
           9.7053e-02,  1.3704e-01,  6.3149e-01,  9.7861e-01, -2.1652e-01,
          -7.5137e-01,  1

In [172]:
b = embedding(b)
b.shape # batch=3, seq=113, dim=100

torch.Size([3, 113, 100])

In [84]:
inputs = seq2idx(inputs, word_to_idx)
labels = seq2idx(labels, tag_to_idx)

In [24]:
inputs = torch.rand(1000, max_len, 100)

In [25]:
inputs.shape
#[batch, seq, dim]

torch.Size([1000, 113, 100])

In [316]:
training_data = processData(train_data)

In [160]:
embedding_dim = 100
hidden_dim = 256
vocab_size = len(word_to_idx)
tagset_size = len(tag_to_idx)

lstm_layer = 1
lstm_dropout = 0.33
linear_out_dim = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [293]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
    
    # sentence [seq, batch, embed_dim]
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [318]:
class BLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, linear_out_dim, lstm_layer, lstm_dropout):
        super(BLSTM, self).__init__()
        # word embedding
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            bidirectional=True)
        self.linear = nn.Linear(2*hidden_dim,linear_out_dim)
        self.fc = nn.Linear(linear_out_dim, tagset_size)
        self.dropout = nn.Dropout(lstm_dropout)
    
    def forward(self, sentence):
        # Embedding layer + LSTM input dropout
        embeds = self.word_embeddings(sentence)
        embeds = self.dropout(embeds)
        # BLSTM layer + LSTM output dropout
        lstm_out, _ = self.bilstm(embeds.view(len(sentence), 1, -1))
        lstm_out = self.dropout(lstm_out)
        # Linear layer + elu
        linear_out = F.elu(self.linear(lstm_out.view(len(sentence), -1)))
        # classifier
        tag_space = self.fc(linear_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [319]:
# Hyperparameter
lr = 0.1
epochs = 10
print_every = 1

In [320]:
model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, tagset_size).to(device)
# model = BLSTM(embedding_dim, hidden_dim, vocab_size, tagset_size, linear_out_dim, lstm_layer, lstm_dropout).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

LSTMTagger(
  (word_embeddings): Embedding(4148, 100)
  (lstm): LSTM(100, 256)
  (hidden2tag): Linear(in_features=256, out_features=9, bias=True)
)

In [None]:
# Before training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_idx).to(device)
    tag_scores = model(inputs)
    print([idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()])


    
for epoch in range(epochs):
    for sentence, tags in tqdm(training_data, total=len(training_data)):
        model.zero_grad()
        
        sentence_in = prepare_sequence(sentence, word_to_idx).to(device)
        targets = prepare_sequence(tags, tag_to_idx).to(device)
        
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    if epoch%print_every == 0:
        print(loss)
    
    
# After training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_idx).to(device)
    tag_scores = model(inputs)
    print([idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()])

['B-MISC', 'I-ORG', 'I-ORG', 'I-LOC', 'B-PER', 'I-ORG', 'I-ORG', 'I-ORG', 'O']


100%|██████████| 14987/14987 [00:25<00:00, 576.68it/s]


tensor(0.0004, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:25<00:00, 582.04it/s]


tensor(8.1059e-05, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:26<00:00, 556.21it/s]


tensor(1.6212e-05, device='cuda:0', grad_fn=<NllLossBackward>)


 24%|██▍       | 3655/14987 [00:06<00:19, 575.89it/s]

In [None]:
def generateEvalFile(model, input_data, file_name):
    # Reset the file
    open(file_name, 'w').close()
    f = open(file_name, "a")
    
    # model eval mode
    model.eval()
    
    for sentence, tags in input_data:
        idx = 1
        with torch.no_grad():
            inputs = prepare_sequence(sentence, word_to_idx).to(device)
            tag_scores = model(inputs) 
            preds = [idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()]
            for word, gold, pred in zip(sentence, tags, preds):
                f.write(f'{idx} {word} {gold} {pred}\n')
                idx+=1
            f.write('\n')      
    f.close()

In [None]:
model_name = "blstm_t3"

In [None]:
dev_input = processData(dev_data)
generateEvalFile(model, dev_input, f"{model_name}_eval.txt")

In [None]:
PATH = f"{model_name}.pt"

# Save
torch.save(model, PATH)

# Load
model = torch.load(PATH)
model.eval()