In [131]:
import pandas as pd
import numpy as np
import nltk, re, json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.datasets as transforms

from sklearn.model_selection import train_test_split

In [78]:
def readFile(file):
    f = open(file)
    lines = f.readlines()
    words = []
    for line in lines:
        if line.strip():
            words.append(line.strip().split(' '))
    return words

In [79]:
# store train sentences 
train_file = 'data/train'
dev_file = 'data/dev'
test_file = 'data/test'

In [140]:
train_lines = readFile(train_file)
df = pd.DataFrame(train_lines, columns = ["s_idx", "word", "tag"])

In [149]:
# Randomly select some rare words to be <unk> words
unique_words = df["word"].value_counts().reset_index()
unique_words.columns = ["word", "freq"]
threshold = 1
vocab_words = unique_words[ unique_words['freq'] > threshold ]
rare_words = unique_words[ unique_words['freq'] == threshold ]
print("vocab words:", vocab_words.shape[0])
print("rare words:", rare_words.shape[0])

vocab words: 11983
rare words: 11641


In [165]:
# Randomly select 3000 words from rare words to set as unknown words
unk_count = 3000
unk_rares = rare_words.sample(unk_count)

# Use this list to replace train words to <unk>
unk_set = set(unk_rares["word"].unique().tolist())

# drop selected rare words from rare words
rest_rares = rare_words.drop(unk_rares.index)

# build new vocab = freq_words + rest_rare_words + <unk>
vocab = vocab_words.append(rest_rares, ignore_index=True)
unk_row = pd.DataFrame([["<unk>", 3000]], columns = ["word", "freq"])
vocab = vocab.append(unk_row, ignore_index=True)

# main vocab list, to generate embedding
vocab_set = set(vocab['word'].unique().tolist())

In [187]:
input_dim = 100
lstm_layer = 1
hidden_dim = 256
lstm_dropout = 0.33
linear_out_dim = 128

# all the tags
tags = set(df["tag"].unique())
tag_to_idx = {tag:i for i, tag in enumerate(tags)}
num_classes = len(df["tag"].unique())

# all the vocab
word_to_idx = {word:i for i, word in enumerate(vocab_set)}
# embeds = nn.Embedding(len(vocab), input_dim)

In [37]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [180]:
def readSentences(file):
    f = open(file)
    lines = f.readlines()
    sentences = []
    sentence = []
    for line in lines:
        if not line.strip():
            sentences.append(sentence.copy())
            sentence.clear()
        else:
            sentence.append(line.strip().split(' '))
    # append the last sentence
    sentences.append(sentence.copy())
    return sentences

In [182]:
# Group sentences together
train_file = 'data/train'
dev_file = 'data/dev'
test_file = 'data/test'

# word = [idx, word, tag]
train_sentences = readSentences(train_file)
dev_sentences = readSentences(dev_file)
# word = [idx, word]
test_sentences = readSentences(test_file)

In [183]:
print(len(train_sentences), len(dev_sentences), len(test_sentences))

14987 3466 3684


In [181]:
dummy_file ='data/dummy'
dummy_sentences = readSentences(dummy_file)
dummy_sentences

[[['1', 'Weaver', 'B-PER'],
  ['2', 'shot', 'O'],
  ['3', 'to', 'O'],
  ['4', 'prominence', 'O'],
  ['5', 'in', 'O'],
  ['6', '1994', 'O'],
  ['7', 'when', 'O'],
  ['8', 'he', 'O'],
  ['9', 'won', 'O'],
  ['10', 'the', 'O'],
  ['11', 'English', 'B-MISC'],
  ['12', '2,000', 'B-MISC'],
  ['13', 'Guineas', 'I-MISC'],
  ['14', 'on', 'O'],
  ['15', 'Mister', 'B-LOC'],
  ['16', 'Baileys', 'I-LOC'],
  ['17', 'in', 'O'],
  ['18', 'his', 'O'],
  ['19', 'first', 'O'],
  ['20', 'ride', 'O'],
  ['21', 'in', 'O'],
  ['22', 'a', 'O'],
  ['23', 'classic', 'O'],
  ['24', '.', 'O']],
 [['1', 'Results', 'O'],
  ['2', 'of', 'O'],
  ['3', 'English', 'B-MISC'],
  ['4', 'league', 'O'],
  ['5', 'matches', 'O']]]

In [196]:
def makeData(sentences):
    inputs = []
    targets = []
    for sentence in sentences:
        word_idx = []
        target = []
        for word in sentence:
            if word[1] in vocab_set:
                word_idx.append(word_to_idx[word[1]])
            else:
                word_idx.append(word_to_idx['<unk>'])            
            target.append(tag_to_idx[word[2]])
        inputs.append(word_idx)
        targets.append(target)
    return inputs, targets

In [189]:
word_to_idx['<unk>']

18762

In [205]:
makeData(dummy_sentences)

([[13244,
   6244,
   10461,
   16573,
   11161,
   4466,
   7331,
   7126,
   2098,
   13859,
   19747,
   11692,
   2973,
   13325,
   11308,
   18762,
   11161,
   14128,
   9291,
   17302,
   11161,
   9525,
   18762,
   10983],
  [4341, 17001, 19747, 19233, 700]],
 [[1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 3, 4, 6, 7, 4, 4, 4, 4, 4, 4, 4, 4],
  [4, 4, 0, 4, 4]])

ValueError: expected sequence of length 24 at dim 1 (got 5)

In [122]:
def getEmbedding(word):
    index = word_to_idx[word]
    lookup_tensor = torch.tensor(index, dtype=torch.long)
    return embeds(lookup_tensor)

In [167]:
unk_set

{'Baileys',
 'perilous',
 '13.33',
 'teacher',
 'servicing',
 '1203',
 '2:44.22',
 'KABUL',
 'comi',
 '23-27',
 'stretching',
 'harrowing',
 'Gjon',
 'Hess',
 '12.124',
 'Likkason',
 '1:45.62',
 'Man-of-the-Match',
 'dwell',
 'startups',
 '112.8',
 'mobilised',
 'Fannie',
 'Warren',
 'BOOK',
 'Koeln',
 'untie',
 'sadmasochistic',
 'Nikiforov',
 '90.18',
 '7.84',
 'implying',
 'portfolios',
 'compromised',
 'fortunate',
 '12.2',
 'Massira',
 'ostracism',
 'Angelica',
 'RSR',
 'Kryvy',
 'comprises',
 '469',
 'Pentagon',
 'Bernardo',
 'long-delayed',
 'half-volley',
 '8-10',
 '21.JAN.99',
 '1,294.5',
 '65-8703092',
 'IN-NAZZJON',
 'Gorgona',
 '5,700',
 'fifth-placed',
 'Boopathy',
 '1,130',
 'Mendy',
 '27-4',
 'kingdoms',
 '62.50',
 'well-wishers',
 'Conc',
 '2030',
 'Dnipropetrovsk',
 '146.2',
 'hat-tricks',
 'Syrian-Lebanese',
 '.571',
 'Ralf',
 '45.455',
 'firefighters',
 'scoreboard',
 'square-leg',
 'vulnerable',
 'DNEVNI',
 'Supermarkets',
 '212-859-1736',
 'Eleonora',
 'Yap',
 'MCC

In [140]:
tests = sentences[:5]
input_batch, target_batch = makeData(tests)
input_batch

KeyError: 'LEICESTERSHIRE'

In [65]:
# (batch, seq, feature)
rnn = nn.LSTM(
    input_size=input_dim, 
    hidden_size=hidden_dim,
    num_layers=1,
    batch_first=True,
    bidirectional=True,
    dropout=lstm_dropout)

