In [351]:
import pandas as pd
import numpy as np
import nltk, re, json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.datasets as transforms

from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [78]:
def readFile(file):
    f = open(file)
    lines = f.readlines()
    words = []
    for line in lines:
        if line.strip():
            words.append(line.strip().split(' '))
    return words

In [79]:
# store train sentences 
train_file = 'data/train'
dev_file = 'data/dev'
test_file = 'data/test'

In [140]:
train_lines = readFile(train_file)
df = pd.DataFrame(train_lines, columns = ["s_idx", "word", "tag"])

In [149]:
# Randomly select some rare words to be <unk> words
unique_words = df["word"].value_counts().reset_index()
unique_words.columns = ["word", "freq"]
threshold = 1
vocab_words = unique_words[ unique_words['freq'] > threshold ]
rare_words = unique_words[ unique_words['freq'] == threshold ]
print("vocab words:", vocab_words.shape[0])
print("rare words:", rare_words.shape[0])

vocab words: 11983
rare words: 11641


In [165]:
# Randomly select 3000 words from rare words to set as unknown words
unk_count = 3000
unk_rares = rare_words.sample(unk_count)

# Use this list to replace train words to <unk>
unk_set = set(unk_rares["word"].unique().tolist())

# drop selected rare words from rare words
rest_rares = rare_words.drop(unk_rares.index)

# build new vocab = freq_words + rest_rare_words + <unk>
vocab = vocab_words.append(rest_rares, ignore_index=True)
unk_row = pd.DataFrame([["<unk>", 3000]], columns = ["word", "freq"])
vocab = vocab.append(unk_row, ignore_index=True)

# main vocab list, to generate embedding
vocab_set = set(vocab['word'].unique().tolist())

In [None]:
# all the vocab
word_to_idx = {word:i for i, word in enumerate(vocab_set)}

In [180]:
def readSentences(file):
    f = open(file)
    lines = f.readlines()
    sentences = []
    sentence = []
    for line in lines:
        if not line.strip():
            sentences.append(sentence.copy())
            sentence.clear()
        else:
            sentence.append(line.strip().split(' '))
    # append the last sentence
    sentences.append(sentence.copy())
    return sentences

In [182]:
# Group sentences together
train_file = 'data/train'
dev_file = 'data/dev'
test_file = 'data/test'

# word = [idx, word, tag]
train_sentences = readSentences(train_file)
dev_sentences = readSentences(dev_file)
# word = [idx, word]
test_sentences = readSentences(test_file)

# Dummy test data
dummy_file ='data/dummy'
dummy_sentences = readSentences(dummy_file)

In [183]:
print(len(train_sentences), len(dev_sentences), len(test_sentences))

14987 3466 3684


In [225]:
def makeData(sentences):
    inputs = []
    targets = []
    for sentence in sentences:
        word_idx = []
        target = []
        for word in sentence:
            if word[1] in vocab_set:
                word_idx.append(word_to_idx[word[1]])
            else:
                word_idx.append(word_to_idx['<unk>'])            
            target.append(tag_to_idx[word[2]])
        inputs.append(word_idx)
        targets.append(target)
    return inputs, targets

In [207]:
# # Take sentences array, return embedding index, and targets
# def makeData(sentences):
#     # List of list of tensor idx, each idx represent a word in the sentence
#     inputs = []
#     # list of list of int idx, each idx represent a tag.
#     targets = []
#     for sentence in sentences:
#         word_idx = []
#         target = []
#         for word in sentence:
#             if word[1] in vocab_set:
#                 # convert int index to torch.long
#                 word_idx.append(torch.tensor(word_to_idx[word[1]], dtype=torch.long))
#             else:
#                 word_idx.append(torch.tensor(word_to_idx['<unk>'], dtype=torch.long))            
#             target.append(tag_to_idx[word[2]])
#         inputs.append(word_idx)
#         targets.append(target)
#     return inputs, targets

In [356]:
# all the unique tags
tags = set(df["tag"].unique())
tag_to_idx = {tag:i for i, tag in enumerate(tags)}
num_classes = len(df["tag"].unique())

embedding_dim = 100
lstm_layer = 1
hidden_dim = 256
lstm_dropout = 0.33
linear_out_dim = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [389]:
class BLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes, linear_out_dim, lstm_layer, lstm_dropout):
        super(BLSTM, self).__init__()
        self.hidden_size = hidden_dim
        self.num_layers = lstm_layer
        self.out_size = num_classes
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.bilstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=self.hidden_size,
            batch_first=True,
            bidirectional=True)
        # Linear 1
        self.fc1 = nn.Linear(2*self.hidden_size,linear_out_dim)
        # classifier linear
        self.fc2 = nn.Linear(linear_out_dim, num_classes)
        self.dropout = nn.Dropout(lstm_dropout)
        
        
        
    def forward(self, x):
        batch_size, seq_len = x.shape
        
#         h0 = torch.randn(self.num_layers*2, batch_size, self.hidden_size).to(device)
#         c0 = torch.randn(self.num_layers*2, batch_size, self.hidden_size).to(device)
        h0 = torch.randn(self.num_layers*2, batch_size, self.hidden_size).to(device)
        c0 = torch.randn(self.num_layers*2, batch_size, self.hidden_size).to(device)
        # embedding layer + dropout
        x = self.dropout(self.embedding(x))
        # BLSTM layer
        x , _ = self.bilstm(x)
        # Linear + ELU
        x = F.elu(self.fc1(x))
        # classifier
        x = self.fc2(x)
        x = F.softmax(x, dim=1)
        return x.view(seq_len, self.out_size)
    

In [329]:

input = torch.randn(2,10)
out = F.elu(input)
print(out)
fc = nn.Linear(10,5)
out = fc(out)
print(out)
F.augmaF.softmax(out, dim=1)）

tensor([[ 1.1715,  0.3064, -0.1101,  0.5220, -0.4690, -0.6432, -0.3959, -0.0424,
          1.0491, -0.0986],
        [ 0.9205,  0.5053, -0.2874, -0.0737, -0.8828, -0.5819,  0.3481,  1.1651,
          0.3174,  0.9363]])
tensor([[-0.3601, -1.0359,  0.1784,  0.9853,  0.2091],
        [ 0.0415, -0.8341,  0.7340,  0.5089, -0.0962]],
       grad_fn=<AddmmBackward>)


tensor([[0.1133, 0.0576, 0.1941, 0.4349, 0.2001],
        [0.1700, 0.0708, 0.3398, 0.2713, 0.1481]], grad_fn=<SoftmaxBackward>)

In [None]:
blstm_model = BLSTM(len(vocab_set),embedding_dim, hidden_dim, num_classes, linear_out_dim, lstm_layer, lstm_dropout).to(device)
optimizer = torch.optim.SGD(blstm_model.parameters(), lr=0.1)
loss_func = nn.CrossEntropyLoss().to(device)
epochs = 50

input_batch, target_batch = makeData(train_sentences)

for epoch in range(epochs):
    # train one sentence at a time
    for input, target in tqdm(zip(input_batch, target_batch), total=len(input_batch)):

        x = torch.LongTensor([input]).to(device)
        y = torch.LongTensor(target).to(device)
        preds = blstm_model(x)
        loss = loss_func(preds,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(loss)

    

100%|██████████| 14987/14987 [00:42<00:00, 353.68it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:43<00:00, 344.11it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:41<00:00, 357.13it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:47<00:00, 315.46it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:43<00:00, 344.35it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:43<00:00, 341.49it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:44<00:00, 335.01it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:41<00:00, 361.58it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:41<00:00, 360.19it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:43<00:00, 346.98it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:43<00:00, 344.08it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:41<00:00, 363.57it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:43<00:00, 347.21it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:43<00:00, 343.76it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:42<00:00, 355.54it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:43<00:00, 344.25it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:42<00:00, 353.10it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:42<00:00, 355.64it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:42<00:00, 356.20it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


100%|██████████| 14987/14987 [00:42<00:00, 349.44it/s]


tensor(2.1972, device='cuda:0', grad_fn=<NllLossBackward>)


 91%|█████████ | 13662/14987 [00:38<00:03, 360.10it/s]

In [384]:
blstm_model = BLSTM(len(vocab_set),embedding_dim, hidden_dim, num_classes, linear_out_dim, lstm_layer, lstm_dropout).to(device)
blstm_model

BLSTM(
  (embedding): Embedding(20625, 100)
  (bilstm): LSTM(100, 256, batch_first=True, bidirectional=True)
  (fc1): Linear(in_features=512, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=9, bias=True)
  (dropout): Dropout(p=0.33, inplace=False)
)