In [1]:
import pandas as pd
import numpy as np
import nltk, re, json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.datasets as transforms

from sklearn.model_selection import train_test_split
from tqdm import tqdm

# store train sentences 
train_file = 'data/train'
dev_file = 'data/dev'
test_file = 'data/test'
dummy_file ='data/dummy'

In [2]:
# read train/test file, each line as {s_idx, word, tag} tuple, store in a list
def readFile(file):
    f = open(file)
    lines = f.readlines()
    words = []
    for line in lines:
        if line.strip():
            words.append(line.strip().split(' '))
    return words

# read files, group words by sentence, return list of sentences
def readData(file):
    f = open(file)
    lines = f.readlines()
    sentences = []
    sentence = []
    for line in lines:
        if not line.strip():
            sentences.append(sentence.copy())
            sentence.clear()
        else:
            sentence.append(line.strip().split(' '))
    # append the last sentence
    sentences.append(sentence.copy())
    return sentences

# word = [idx, word, tag]  train_data = list of sentences in term of list of words
train_data = readData(train_file)

dev_data = readData(dev_file)
# word = [idx, word]
test_data = readData(test_file)

# Dummy test data
dummy_file ='data/dummy'
dummy_data = readData(dummy_file)

# Preapare training data
def processData(tuples):
    training_data = []
    for t in tuples:
        training_data.append( ( [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ], [ word[2] for word in t ] ) )
    return training_data

# Convert sequence into tensor
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

def generateEvalFile1(model, input_data, file_name, word_to_idx):
    # Reset the file
    open(file_name, 'w').close()
    f = open(file_name, "a")
    
    # model eval mode
    model.eval()
    
    for t in input_data:
        sentence = [ word[1] if word[1] in word_to_idx else '<unk>' for word in t]
        with torch.no_grad():
            inputs = prepare_sequence(sentence, word_to_idx).to(device)
            tag_scores = model(inputs) 
            preds = [idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()]
            for word, pred in zip(t, preds):
                f.write(f'{word[0]} {word[1]} {word[2]} {pred}\n')
            f.write('\n')      
    f.close()
    
def processTestData(tuples):
    training_data = []
    for t in tuples:
        training_data.append( ( [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ] ) )
    return training_data

def generateTestPred1(model, input_data, file_name):
    # Reset the file
    open(file_name, 'w').close()
    f = open(file_name, "a")
    
    # model eval mode
    model.eval()
    
    for t in input_data:

        sentence = [ word[1] if word[1] in word_to_idx else '<unk>' for word in t]
        with torch.no_grad():
            inputs = prepare_sequence(sentence, word_to_idx).to(device)
            tag_scores = model(inputs) 
            preds = [idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()]
            for word, pred in zip(t, preds):
                f.write(f'{word[0]} {word[1]} {pred}\n')
            f.write('\n')      
    f.close()

vocab words: 6182
rare words: 17442


In [None]:
# DF: index - s_idx - word - tag
train_lines = readFile(train_file)
df = pd.DataFrame(train_lines, columns = ["s_idx", "word", "tag"])

# Randomly select some rare words to be <unk> words
unique_words = df["word"].value_counts().reset_index()
unique_words.columns = ["word", "freq"]

# Rare ward threshold
threshold = 3

# words with freq > threshold
vocab_words = unique_words[ unique_words['freq'] > threshold ]
# words with freq <= threshold
rare_words = unique_words[ unique_words['freq'] <= threshold ]

print("vocab words:", vocab_words.shape[0])
print("rare words:", rare_words.shape[0])

# Randomly select 3000 words from rare words to set as unknown words
# unk_count = len(rare_words)
# unk_words = rare_words.sample(unk_count)

# drop the selected rare words from vocab
# rare_words = rare_words.drop(unk_words.index)

# build new vocab = freq_words + rest_rare_words + <unk>
# vocab_words = vocab_words.append(rare_words, ignore_index=True)

# custom words unk, pad etc
custom_vocab = ['<unk>']
# custom_vocab = ['<unk>', '<pad>']

# main vocab list, to generate embedding
vocab_set = set(custom_vocab + vocab_words['word'].unique().tolist())
vocab_size = len(vocab_set)

# all the vocab
word_to_idx = {word:i for i, word in enumerate(vocab_set)}

# all the unique tags
unique_tags = set(df["tag"].unique())
tag_to_idx = {tag:i for i, tag in enumerate(unique_tags)}
idx_to_tag = {i:tag for i, tag in enumerate(unique_tags)}

In [3]:
training_data = processData(train_data)

In [4]:
embedding_dim = 100
hidden_dim = 256
vocab_size = len(word_to_idx)
tagset_size = len(tag_to_idx)

lstm_layer = 1
lstm_dropout = 0.33
linear_out_dim = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [338]:
# class LSTMTagger(nn.Module):
#     def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
#         super(LSTMTagger, self).__init__()
#         self.hidden_dim = hidden_dim
        
#         self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
#         self.lstm = nn.LSTM(embedding_dim, hidden_dim)
#         self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
    
#     # sentence [seq, batch, embed_dim]
#     def forward(self, sentence):
#         embeds = self.word_embeddings(sentence)
#         lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
#         tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
#         tag_scores = F.log_softmax(tag_space, dim=1)
#         return tag_scores

In [5]:
class BLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size, linear_out_dim, 
                 lstm_layer, lstm_dropout):
        super(BLSTM, self).__init__()
        # word embedding
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            bidirectional=True)
        self.linear = nn.Linear(2*hidden_dim,linear_out_dim)
        self.fc = nn.Linear(linear_out_dim, tagset_size)
        self.dropout = nn.Dropout(lstm_dropout)
    
    def forward(self, sentence):
        # Embedding layer + LSTM input dropout
        embeds = self.word_embeddings(sentence)
        embeds = self.dropout(embeds)
        # BLSTM layer + LSTM output dropout
        lstm_out, _ = self.bilstm(embeds.view(len(sentence), 1, -1))
        lstm_out = self.dropout(lstm_out)
        # Linear layer + elu
        linear_out = F.elu(self.linear(lstm_out.view(len(sentence), -1)))
        # classifier
        tag_space = self.fc(linear_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


In [6]:
# Hyperparameter
lr = 0.1
epochs = 50
print_every = 10

# model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, tagset_size).to(device)
model = BLSTM(embedding_dim, hidden_dim, vocab_size, tagset_size, linear_out_dim, lstm_layer, 
              lstm_dropout).to(device)
loss_function = nn.NLLLoss()
# loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [7]:
# Before training
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[0][0], word_to_idx).to(device)
#     tag_scores = model(inputs)
#     print([idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()])


    
for epoch in range(epochs):
    for sentence, tags in tqdm(training_data, total=len(training_data)):
        model.zero_grad()
        
        sentence_in = prepare_sequence(sentence, word_to_idx).to(device)
        targets = prepare_sequence(tags, tag_to_idx).to(device)
        
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    if (epoch+1)%print_every == 0:
        print(loss)
    
    
# After training
# with torch.no_grad():
#     inputs = prepare_sequence(training_data[0][0], word_to_idx).to(device)
#     tag_scores = model(inputs)
#     print([idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()])

100%|██████████| 14987/14987 [00:36<00:00, 411.01it/s]


In [8]:
blstm1_name = "blstm1"
PATH = f"{blstm1_name}.pt"

# Save
torch.save(model, PATH)

# Load
model = torch.load(PATH)
model.eval()

BLSTM(
  (word_embeddings): Embedding(6183, 100)
  (bilstm): LSTM(100, 256, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=9, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
)

In [9]:
generateEvalFile1(model, dev_data, "dev1.out",word_to_idx)

generateTestPred1(model, test_data, "test1.out")

# PART 2

In [35]:
# Expect glove.6B.100d.txt to be unzipped 
embeddings_index = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = vector
f.close()

In [36]:
train_lines = readFile(train_file)
df_train = pd.DataFrame(train_lines, columns = ["s_idx", "word", "tag"])

dev_lines = readFile(dev_file)
df_dev = pd.DataFrame(dev_lines, columns = ["s_idx", "word", "tag"])

test_lines = readFile(test_file)
df_test = pd.DataFrame(test_lines, columns = ["s_idx", "word"])

combo_df = df_train.append(df_dev).append(df_test)
combo_df['word'] = combo_df['word'].str.lower()

In [37]:
vocab_words = combo_df["word"].value_counts().reset_index()
vocab_words.columns = ["word", "freq"]

threshold2 = 0

vocab_words = vocab_words[ vocab_words['freq'] > threshold2 ]

In [38]:
# main vocab list, to generate embedding
vocab_set = set( vocab_words['word'].unique().tolist())
vocab_size = len(vocab_set)

# all the vocab
word_to_idx = {word:i for i, word in enumerate(vocab_set)}

# all the unique tags
unique_tags = set(df_train["tag"].unique())
tag_to_idx = {tag:i for i, tag in enumerate(unique_tags)}
idx_to_tag = {i:tag for i, tag in enumerate(unique_tags)}

# word : embeddings , unknown words got dropped 
vocab_embeddings = {}

for word in vocab_set:
    if word in embeddings_index:
        vocab_embeddings[word] = embeddings_index[word]
        
vocab_embeddings['<unk>'] = torch.rand(100).numpy()

def processData2(tuples):
    training_data = []
    for t in tuples:
        training_data.append( ( [ word[1] if word[1] in vocab_embeddings else '<unk>' for word in t ], [ word[2] for word in t ] ) )
    return training_data



In [39]:
class BLSTM2(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, tagset_size, linear_out_dim, lstm_layer, lstm_dropout):
        super(BLSTM2, self).__init__()
        
        self.bilstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            bidirectional=True)
        self.linear = nn.Linear(2*hidden_dim,linear_out_dim)
        self.fc = nn.Linear(linear_out_dim, tagset_size)
        self.dropout = nn.Dropout(lstm_dropout)
    
    def forward(self, sentence):
        # Sentence input is tensor
        sentence = self.dropout(sentence)
        # BLSTM layer + LSTM output dropout
        lstm_out, _ = self.bilstm(sentence.view(len(sentence), 1, -1))
        lstm_out = self.dropout(lstm_out)
        # Linear layer + elu
        linear_out = F.elu(self.linear(lstm_out.view(len(sentence), -1)))
        # classifier
        tag_space = self.fc(linear_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores


In [40]:
def prepare_glove_sequence(seq, to_ix):
    words = [w.lower() for w in seq]
    embeds = [vocab_embeddings[word] for word in words]
    return torch.tensor(embeds)

In [41]:
training_data = processData2(train_data)

In [554]:
dummy_file ='data/dummy'
dummy_data = readData(dummy_file)
dummy_data = processData(dummy_data)
dummy_data

[(['<unk>', 'shot', 'to'], ['B-PER', 'O', 'O']),
 (['<unk>'], ['O']),
 (['<unk>', '<unk>'], ['O', 'O'])]

In [42]:
# Hyperparameter
lr = 0.1
epochs = 1
print_every = 10

blstm2 = BLSTM2( embedding_dim, hidden_dim, tagset_size, linear_out_dim, lstm_layer, lstm_dropout ).to(device)

loss_function = nn.NLLLoss().to(device)
# loss_func = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [43]:
for epoch in range(epochs):
    for sentence, tags in tqdm(training_data, total=len(training_data)):
        blstm2.zero_grad()
        
        sentence = prepare_glove_sequence(sentence, word_to_idx).to(device)
        tags = prepare_sequence(tags, tag_to_idx).to(device)
        
        tag_scores = blstm2(sentence)
        loss = loss_function(tag_scores, tags)
        loss.backward()
        optimizer.step()
    if (epoch+1)%print_every == 0:
        print(loss)

100%|██████████| 14987/14987 [00:36<00:00, 405.06it/s]
100%|██████████| 14987/14987 [00:35<00:00, 426.59it/s]
100%|██████████| 14987/14987 [00:34<00:00, 430.37it/s]
100%|██████████| 14987/14987 [00:36<00:00, 407.38it/s]
100%|██████████| 14987/14987 [00:36<00:00, 412.83it/s]
100%|██████████| 14987/14987 [00:35<00:00, 425.43it/s]
100%|██████████| 14987/14987 [00:35<00:00, 427.54it/s]
100%|██████████| 14987/14987 [00:35<00:00, 428.08it/s]
100%|██████████| 14987/14987 [00:34<00:00, 430.49it/s]
100%|██████████| 14987/14987 [00:34<00:00, 430.49it/s]


tensor(2.1333, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:36<00:00, 406.85it/s]
100%|██████████| 14987/14987 [00:36<00:00, 411.92it/s]
100%|██████████| 14987/14987 [00:32<00:00, 460.73it/s]
100%|██████████| 14987/14987 [00:32<00:00, 458.37it/s]
100%|██████████| 14987/14987 [00:32<00:00, 459.81it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.92it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.66it/s]
100%|██████████| 14987/14987 [00:32<00:00, 458.73it/s]
100%|██████████| 14987/14987 [00:32<00:00, 460.57it/s]
100%|██████████| 14987/14987 [00:32<00:00, 458.23it/s]


tensor(2.1273, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:32<00:00, 461.04it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.66it/s]
100%|██████████| 14987/14987 [00:32<00:00, 459.74it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.32it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.70it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.50it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.12it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.81it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.80it/s]
100%|██████████| 14987/14987 [00:32<00:00, 458.31it/s]


tensor(2.1379, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:32<00:00, 461.31it/s]
100%|██████████| 14987/14987 [00:32<00:00, 458.73it/s]
100%|██████████| 14987/14987 [00:32<00:00, 460.71it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.73it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.52it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.07it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.62it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.48it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.37it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.64it/s]


tensor(2.1300, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:32<00:00, 462.13it/s]
100%|██████████| 14987/14987 [00:32<00:00, 458.45it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.36it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.86it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.76it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.43it/s]
100%|██████████| 14987/14987 [00:32<00:00, 461.52it/s]
100%|██████████| 14987/14987 [00:32<00:00, 460.37it/s]
100%|██████████| 14987/14987 [00:32<00:00, 459.94it/s]
100%|██████████| 14987/14987 [00:32<00:00, 462.73it/s]

tensor(2.1414, device='cuda:0', grad_fn=<NllLossBackward>)





In [44]:
blstm1_name = "blstm2"
PATH = f"{blstm1_name}.pt"

# Save
torch.save(blstm2, PATH)

# Load
blstm2 = torch.load(PATH)
blstm2.eval()

BLSTM2(
  (bilstm): LSTM(100, 256, bidirectional=True)
  (linear): Linear(in_features=512, out_features=128, bias=True)
  (fc): Linear(in_features=128, out_features=9, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
)

In [45]:
def generateEvalFile2(model, input_data, file_name, word_to_idx):
    # Reset the file
    open(file_name, 'w').close()
    f = open(file_name, "a")
    
    # model eval mode
    model.eval()
    
    for t in input_data:
        sentence = [ word[1] if word[1] in vocab_embeddings else '<unk>' for word in t]
        with torch.no_grad():
            inputs = prepare_glove_sequence(sentence, word_to_idx).to(device)
            tag_scores = model(inputs) 
            preds = [idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()]
            for word, pred in zip(t, preds):
                f.write(f'{word[0]} {word[1]} {word[2]} {pred}\n')
            f.write('\n')      
    f.close()
    
def generateTestPred2(model, input_data, file_name):
    # Reset the file
    open(file_name, 'w').close()
    f = open(file_name, "a")
    
    # model eval mode
    model.eval()
    
    for t in input_data:
        sentence = [ word[1] if word[1] in vocab_embeddings else '<unk>' for word in t]
        with torch.no_grad():
            inputs = prepare_glove_sequence(sentence, word_to_idx).to(device)
            tag_scores = model(inputs) 
            preds = [idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()]
            for word, pred in zip(t, preds):
                f.write(f'{word[0]} {word[1]} {pred}\n')
            f.write('\n')      
    f.close()

In [46]:
# dev_input = processData(dev_data)

generateEvalFile2(blstm2, dev_data, "dev2.out", word_to_idx)

generateTestPred2(blstm2, test_data, "test2.out")