In [2]:
import pandas as pd
import numpy as np
import nltk, re, json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.datasets as transforms

from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# store train sentences 
train_file = 'data/train'
dev_file = 'data/dev'
test_file = 'data/test'
dummy_file ='data/dummy'

In [4]:
# read train/test file, each line as {s_idx, word, tag} tuple, store in a list
def readFile(file):
    f = open(file)
    lines = f.readlines()
    words = []
    for line in lines:
        if line.strip():
            words.append(line.strip().split(' '))
    return words

In [5]:
# DF: index - s_idx - word - tag
train_lines = readFile(train_file)
df = pd.DataFrame(train_lines, columns = ["s_idx", "word", "tag"])
df.head(5)

Unnamed: 0,s_idx,word,tag
0,1,EU,B-ORG
1,2,rejects,O
2,3,German,B-MISC
3,4,call,O
4,5,to,O


In [6]:
# Randomly select some rare words to be <unk> words
unique_words = df["word"].value_counts().reset_index()
unique_words.columns = ["word", "freq"]
threshold = 1
# words with freq > 1
vocab_words = unique_words[ unique_words['freq'] > threshold ]
# words with freq = 1
rare_words = unique_words[ unique_words['freq'] == threshold ]

print("vocab words:", vocab_words.shape[0])
print("rare words:", rare_words.shape[0])

vocab words: 11983
rare words: 11641


In [7]:
# Randomly select 3000 words from rare words to set as unknown words
unk_count = 3000
unk_words = rare_words.sample(unk_count)

# drop the selected rare words from vocab
rare_words = rare_words.drop(unk_words.index)

# build new vocab = freq_words + rest_rare_words + <unk>
vocab_words = vocab_words.append(rare_words, ignore_index=True)

# custom words unk, pad etc
custom_vocab = ['<unk>']

# main vocab list, to generate embedding
vocab_set = set(custom_vocab + vocab_words['word'].unique().tolist())
vocab_size = len(vocab_set)

In [8]:
# all the vocab
word_to_idx = {word:i for i, word in enumerate(vocab_set)}

# all the unique tags
unique_tags = set(df["tag"].unique())
tag_to_idx = {tag:i for i, tag in enumerate(unique_tags)}
idx_to_tag = {i:tag for i, tag in enumerate(unique_tags)}

In [9]:
# read files, group words by sentence, return list of sentences
def readData(file):
    f = open(file)
    lines = f.readlines()
    sentences = []
    sentence = []
    for line in lines:
        if not line.strip():
            sentences.append(sentence.copy())
            sentence.clear()
        else:
            sentence.append(line.strip().split(' '))
    # append the last sentence
    sentences.append(sentence.copy())
    return sentences

In [10]:
# word = [idx, word, tag]  train_data = list of sentences in term of list of words
train_data = readData(train_file)

dev_data = readData(dev_file)
# word = [idx, word]
test_data = readData(test_file)

# Dummy test data
dummy_file ='data/dummy'
dummy_data = readData(dummy_file)

In [183]:
print(len(train_sentences), len(dev_sentences), len(test_sentences))

14987 3466 3684


In [11]:
dummy_data

[[['1', 'Weaver', 'B-PER'],
  ['2', 'shot', 'O'],
  ['3', 'to', 'O'],
  ['4', 'prominence', 'O'],
  ['5', 'in', 'O'],
  ['6', '1994', 'O'],
  ['7', 'when', 'O'],
  ['8', 'he', 'O'],
  ['9', 'won', 'O'],
  ['10', 'the', 'O'],
  ['11', 'English', 'B-MISC'],
  ['12', '2,000', 'B-MISC'],
  ['13', 'Guineas', 'I-MISC'],
  ['14', 'on', 'O'],
  ['15', 'Mister', 'B-LOC'],
  ['16', 'Baileys', 'I-LOC'],
  ['17', 'in', 'O'],
  ['18', 'his', 'O'],
  ['19', 'first', 'O'],
  ['20', 'ride', 'O'],
  ['21', 'in', 'O'],
  ['22', 'a', 'O'],
  ['23', 'classic', 'O'],
  ['24', '.', 'O']],
 [['1', 'Results', 'O'],
  ['2', 'of', 'O'],
  ['3', 'English', 'B-MISC'],
  ['4', 'league', 'O'],
  ['5', 'matches', 'O']]]

In [12]:
# Preapare training data
def processData(tuples):
    training_data = []
    for t in tuples:
        training_data.append( ( [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ], [ word[2] for word in t ] ) )
    return training_data

# Convert sequence into tensor
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [97]:
training_data = processData(train_data)

In [15]:
for sent, t in training_data:
    print(sent)
    print(t)
    print("---")

['Weaver', 'shot', 'to', 'prominence', 'in', '1994', 'when', 'he', 'won', 'the', 'English', '2,000', 'Guineas', 'on', 'Mister', 'Baileys', 'in', 'his', 'first', 'ride', 'in', 'a', 'classic', '.']
['B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'B-MISC', 'I-MISC', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
---
['Results', 'of', 'English', 'league', 'matches']
['O', 'O', 'B-MISC', 'O', 'O']
---


In [98]:
embedding_dim = 100
hidden_dim = 256
vocab_size = len(word_to_idx)
tagset_size = len(tag_to_idx)

lstm_layer = 1
lstm_dropout = 0.33
linear_out_dim = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [99]:
class LSTMTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
    
    # sentence [seq, batch, embed_dim]
    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [103]:
# Hyperparameter
lr = 0.1
epochs = 10
print_every = 1

In [104]:
model = LSTMTagger(embedding_dim, hidden_dim, vocab_size, tagset_size).to(device)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=lr)

In [105]:
# Before training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_idx).to(device)
    tag_scores = model(inputs)
    print([idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()])


    
for epoch in range(epochs):
    for sentence, tags in tqdm(training_data, total=len(training_data)):
        model.zero_grad()
        
        sentence_in = prepare_sequence(sentence, word_to_idx).to(device)
        targets = prepare_sequence(tags, tag_to_idx).to(device)
        
        tag_scores = model(sentence_in)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
    if epoch%print_every == 0:
        print(loss)
    
    
# After training
with torch.no_grad():
    inputs = prepare_sequence(training_data[0][0], word_to_idx).to(device)
    tag_scores = model(inputs)
    print([idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()])

['I-MISC', 'B-MISC', 'B-LOC', 'B-LOC', 'B-PER', 'I-PER', 'O', 'O', 'O']


100%|██████████| 14987/14987 [00:28<00:00, 523.46it/s]


tensor(0.0017, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:28<00:00, 522.86it/s]


tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:28<00:00, 529.18it/s]


tensor(3.1113e-05, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:28<00:00, 519.47it/s]


tensor(1.4305e-06, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:29<00:00, 508.82it/s]


tensor(2.3842e-07, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:28<00:00, 520.68it/s]


tensor(1.1921e-07, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:29<00:00, 500.97it/s]


tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:31<00:00, 476.77it/s]


tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:29<00:00, 514.64it/s]


tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:28<00:00, 518.65it/s]

tensor(0., device='cuda:0', grad_fn=<NllLossBackward>)
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']





In [106]:
def generateEvalFile(model, input_data, file_name):
    # Reset the file
    open(file_name, 'w').close()
    f = open(file_name, "a")
    
    for sentence, tags in input_data:
        idx = 1
        with torch.no_grad():
            inputs = prepare_sequence(sentence, word_to_idx).to(device)
            tag_scores = model(inputs) 
            preds = [idx_to_tag[i] for i in torch.argmax(tag_scores, dim=1).tolist()]
            for word, gold, pred in zip(sentence, tags, preds):
                f.write(f'{idx} {word} {gold} {pred}\n')
                idx+=1
            f.write('\n')      
    f.close()

In [107]:
dev_input = processData(dev_data)
generateEvalFile(model, dev_input, "dev_eval.txt")