In [8]:
import torch
import torch.nn as nn


In [2]:
!pip install nltk

You should consider upgrading via the 'c:\users\yi wai\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [4]:
''' 
Preprocess Data
'''
import re
import string
import nltk
from nltk import word_tokenize

OTHER_TAG = "other"
PUNCT_TAG = "punct"

with open('./utils/tags.txt', encoding="utf-8", errors='ignore') as f:
    tags = set([tag for tag in f.readlines()])

def tag_token(token, tag):
    if token in string.punctuation:
        return (token, PUNCT_TAG)
    return (token, tag)

def get_tagged_tokens(groups):
    for group in groups:
        ref, tag = group[0], group[1]
        if tag not in tags:
            tag = OTHER_TAG
        unlabelled = re.sub(r'\<\/?\w*\>\s*', "", ref).strip()
        tokens = word_tokenize(unlabelled)
        tagged_tokens = list(map(lambda token: tag_token(token, tag), tokens))
    return tagged_tokens

dataset = []
with open('./dataset/standardized_dataset.txt', encoding="utf-8", errors='ignore') as f:
    refs = f.readlines()
    for ref in refs:
        groups = re.findall(r'(\<(.*)\>.*\<\/\2\>)', ref) # format (<tag>...</tag>, tag)
        tagged_tokens = get_tagged_tokens(groups)
        dataset.append(tagged_tokens)

print(dataset)

other'), ('98', 'other'), ('.', 'punct'), ('Springer', 'other'), (',', 'punct'), ('2012', 'other'), ('.', 'punct')], [('A', 'title'), ('classification', 'title'), ('of', 'title'), ('simple', 'title'), ('modules', 'title'), ('of', 'title'), ('the', 'title'), ('Lie', 'title'), ('algebra', 'title'), ('of', 'title'), ('polynomial', 'title'), ('unitriangular', 'title'), ('derivations', 'title'), ('in', 'title'), ('two', 'title'), ('variables', 'title'), ('.', 'punct')], [('Phd-thesis', 'other'), (',', 'punct'), ('KU', 'other'), ('Leuven', 'other'), ('(', 'punct'), ('1995', 'other'), (')', 'punct'), ('.', 'punct')], [('2008', 'other'), (',', 'punct'), ('A', 'other'), ('&', 'punct'), ('A', 'other'), (',', 'punct'), ('478', 'other'), (',', 'punct'), ('83', 'other')], [('Available', 'other'), ('as', 'other'), ('arXiv', 'other'), (':', 'punct'), ('math.0603563', 'other'), ('.', 'punct')], [('Submitted', 'other'), ('to', 'other'), ('C.', 'other'), ('R.', 'other'), ('A.', 'other'), ('S', 'other'),

In [9]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size = self.input_size,
            hidden_size = self.hidden_size,
            num_layers = self.num_layers,
            batch_first = True
        )

    def forward(self, x):
        # Initialise hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)

        # Initialise internal state
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)

        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h_0, c_0))
        
        return output, (hn, cn)


In [14]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        
        self.lstm = LSTM(input_size, hidden_size, output_size, num_layers)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc = nn.Linear(128, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        output, (hn, cn) = self.lstm(x)
        hn = hn.view(-1, self.hidden_size)

        output = self.relu(hn)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.fc(output)

        return output


In [16]:
'''
Hyperparameters
'''
num_epochs = 1000
learning_rate = 0.001

input_size = 5 # Number of features
hidden_size = 2 # Number of features in the hidden state
num_layers = 1 # Number of stacked LSTM layers

output_size = 2 # Number of output classes

model = Net(input_size, hidden_size, output_size, num_layers)

In [17]:
'''
Loss Function and Optimiser
'''
criterion = nn.CrossEntropyLoss() 
optimiser = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [None]:
# TODO: Obtain data x_train and y_train
for epoch in range(num_epochs):
    outputs = model.forward(x_train)
    optimizer.zero_grad()

    # Get the loss function
    loss = criterion(outputs, y_train)

    # Calculate loss
    loss.backward()

    # Backpropagation
    optimizer.step()

    # Print loss at every 100th epoch
    if epoch % 100 == 0:
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))