In [1]:
import pandas as pd
import numpy as np
import nltk, re, json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as t
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

from torch.utils.data import DataLoader
import torchvision.datasets as datasets
import torchvision.datasets as transforms

from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
train_file = 'data/train'
dev_file = 'data/dev'
test_file = 'data/test'
dummy_file ='data/dummy'

# read train/test file, each line as {s_idx, word, tag} tuple, store in a list
def readFile(file):
    f = open(file)
    lines = f.readlines()
    words = []
    for line in lines:
        if line.strip():
            words.append(line.strip().split(' '))
    return words

# DF: index - s_idx - word - tag
train_lines = readFile(train_file)
df = pd.DataFrame(train_lines, columns = ["s_idx", "word", "tag"])

# Randomly select some rare words to be <unk> words
unique_words = df["word"].value_counts().reset_index()
unique_words.columns = ["word", "freq"]
threshold = 3
# words with freq > threshold
vocab_words = unique_words[ unique_words['freq'] > threshold ]
# words with freq <= threshold
rare_words = unique_words[ unique_words['freq'] <= threshold ]

# custom words unk, pad etc
# custom_vocab = ['<unk>']
custom_vocab = ['<unk>', '<pad>']

# main vocab list, to generate embedding
vocab_set = set(custom_vocab + vocab_words['word'].unique().tolist())
vocab_size = len(vocab_set)

# all the vocab
word_to_idx = {word:i for i, word in enumerate(vocab_set)}

# all the unique tags
unique_tags = set(df["tag"].unique().tolist())
tag_to_idx = {tag:i for i, tag in enumerate(unique_tags)}
idx_to_tag = {i:tag for i, tag in enumerate(unique_tags)}

# read files, group words by sentence, return list of sentences
def readData(file):
    f = open(file)
    lines = f.readlines()
    sentences = []
    sentence = []
    for line in lines:
        if not line.strip():
            sentences.append(sentence.copy())
            sentence.clear()
        else:
            sentence.append(line.strip().split(' '))
    # append the last sentence
    sentences.append(sentence.copy())
    return sentences

# word = [idx, word, tag]  train_data = list of sentences in term of list of words
train_data = readData(train_file)

dev_data = readData(dev_file)
# word = [idx, word]
test_data = readData(test_file)

# Dummy test data
dummy_file ='data/dummy'
dummy_data = readData(dummy_file)

# Preapare training data
def processData(tuples):
    training_data = []
    for t in tuples:
        training_data.append( ( [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ], [ word[2] for word in t ] ) )
    return training_data

# Convert sequence into tensor
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# prepare padded data, return inputs and labels
# def processPaddedData(tuples, max_seq_len):
#     inputs = []
#     labels = []
#     PAD = '<pad>'
#     for t in tuples:
#         seq = [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ]
#         # pad seq
#         if len(seq) < max_seq_len:
#             seq += [ PAD for _ in range(max_seq_len-len(seq)) ]
#         inputs.append(seq)
#         labels.append( [ word[2] for word in t] )
        
#     return inputs, labels

def processPaddedData(tuples, max_seq_len):
    inputs = []
    labels = []
    PAD = '<pad>'
    for t in tuples:
        seq = [ word[1] if word[1] in word_to_idx else '<unk>' for word in t ]
        inputs.append(seq)
        labels.append( [ word[2] for word in t] )
        
    return inputs, labels

def seq2idx(inputs, to_ix):
    return [ torch.tensor([to_ix[w] for w in seq]) for seq in inputs ]

In [7]:
seqs, labels = processPaddedData(dummy_data, word_to_idx)

In [31]:
a = pad_sequence(seq2idx(seqs, word_to_idx), batch_first=True, padding_value=padding_idx)
lengths = t.tensor([len(_) for _ in seqs])
padding_idx = word_to_idx['<pad>']
print("padding_idx", padding_idx)
print(a, lengths)

padding_idx 4968
tensor([[2446, 5046, 5881],
        [5937, 4968, 4968],
        [1720, 1720, 4968]]) tensor([3, 1, 2])


In [None]:
a_y = 

In [34]:
# Sort seqs by length
a_lengths, idx = lengths.sort(0, descending=True)
_, un_idx = t.sort(idx, dim=0)
a = a[idx]
a

tensor([[2446, 5046, 5881],
        [1720, 1720, 4968],
        [5937, 4968, 4968]])

In [33]:
# LSTM

embedding = nn.Embedding(vocab_size, 100, padding_idx=word_to_idx['<pad>'])
lstm = nn.LSTM(100, 128, batch_first=True)


In [35]:
a_input = embedding(a)
print(a_input.shape)

a_packed_input = t.nn.utils.rnn.pack_padded_sequence(input=a_input, lengths=a_lengths, batch_first=True)
packed_out, _ = lstm(a_packed_input)
out, _ = pad_packed_sequence(packed_out)
#original order
out = t.index_select(out, 0, un_idx)

# out, _ = lstm(a_input)
print(out.shape)
out

torch.Size([3, 3, 100])
torch.Size([3, 3, 128])


tensor([[[ 0.0019, -0.0315, -0.0702,  ..., -0.0456, -0.1659, -0.1094],
         [ 0.0301,  0.1060,  0.1276,  ..., -0.0611, -0.0634,  0.0340],
         [ 0.1644, -0.2324, -0.2320,  ...,  0.0900, -0.0094,  0.1267]],

        [[-0.1318, -0.1016, -0.0932,  ...,  0.1361, -0.0183, -0.0684],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[-0.0458,  0.0207,  0.0781,  ..., -0.1244, -0.0597,  0.1081],
         [ 0.0310,  0.1713,  0.2261,  ..., -0.0700, -0.1128,  0.0323],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]],
       grad_fn=<IndexSelectBackward>)

In [36]:
fc = nn.Linear(128, 9)
fc_out = fc(out)
fc_out

tensor([[[ 0.0947, -0.0699, -0.0393, -0.0997,  0.1415,  0.0559, -0.1292,
          -0.0003,  0.0490],
         [ 0.1884, -0.0755, -0.0905, -0.1593,  0.1813, -0.0312, -0.0448,
           0.0479,  0.0121],
         [ 0.1885, -0.1232,  0.0196, -0.0502,  0.0886, -0.0023, -0.0610,
           0.0203,  0.0592]],

        [[ 0.1491, -0.0374, -0.1163, -0.0381,  0.0950, -0.0411, -0.1155,
           0.0087, -0.0763],
         [ 0.0823, -0.0788, -0.0048, -0.0126,  0.0610,  0.0547, -0.0855,
           0.0088,  0.0745],
         [ 0.0823, -0.0788, -0.0048, -0.0126,  0.0610,  0.0547, -0.0855,
           0.0088,  0.0745]],

        [[ 0.0356, -0.0730,  0.0846, -0.0952,  0.1377,  0.0878, -0.0780,
          -0.0179,  0.0475],
         [ 0.2325, -0.0737, -0.1263, -0.2308,  0.2237, -0.0728, -0.0297,
           0.0686, -0.0182],
         [ 0.0823, -0.0788, -0.0048, -0.0126,  0.0610,  0.0547, -0.0855,
           0.0088,  0.0745]]], grad_fn=<AddBackward0>)

In [44]:
fc_out.shape

torch.Size([3, 3, 9])

In [30]:
for l in lengths:
    print()

tensor([3, 1, 2])