In [None]:
!pip install nltk
!pip install torch

In [None]:
import torch
import torch.nn as nn


In [20]:
''' 
Preprocess Data
'''

import re
import nltk
import string
import fasttext
from nltk import word_tokenize

OTHER_TAG = "other"
PUNCT_TAG = "punct"

with open('./utils/tags.txt', encoding="utf-8", errors='ignore') as f:
    tags = set([tag for tag in f.readlines()])

def remove_labels(text):
    return re.sub(r'\<\/?\w*\>\s*', "", text).strip()

def tag_token(token, tag):
    if token in string.punctuation:
        return (token, PUNCT_TAG)
    return (token, tag)

def get_tagged_tokens(groups):
    for group in groups:
        ref, tag = group[0], group[1]
        if tag not in tags:
            tag = OTHER_TAG
        unlabelled = remove_labels(ref)
        tokens = word_tokenize(unlabelled)
        tagged_tokens = list(map(lambda token: tag_token(token, tag), tokens))
    return tagged_tokens

''' Attach tags to each token '''
def attach_tags(dataset_path):
    dataset = []
    with open(dataset_path, encoding="utf-8", errors='ignore') as f:
        refs = f.readlines()
        for ref in refs:
            groups = re.findall(r'(\<(.*)\>.*\<\/\2\>)', ref) # format (<tag>...</tag>, tag)
            tagged_tokens = get_tagged_tokens(groups)
            dataset.append(tagged_tokens)
    return dataset

''' Removes labels and tokenizes '''
def tokenized_dataset(dataset_path, sep=" "):
    dataset = []
    with open(dataset_path, encoding="utf-8", errors='ignore') as f:
        refs = f.readlines()
        for ref in refs:
            ref = remove_labels(ref) 
            tokenized = " ".join(word_tokenize(ref))
            dataset.append(tokenized)
    return dataset

def train_word_embedding_model(dataset_path, retrain=True):
    embedding_dataset_path = './dataset/word_embedding_dataset'
    model_path = './models/word_embedding.bin'

    if retrain:
        with open(embedding_dataset_path, 'w', errors='ignore') as f:
            # fasttext tokenizes by whitespaces
            word_embedding_dataset = tokenized_dataset(dataset_path, sep=" ") 
            f.write("\n".join(word_embedding_dataset))
        model = fasttext.train_unsupervised(embedding_dataset_path)
        model.save_model(model_path)
        return model
    else:
        return fasttext.load_model(model_path)

def map_to_index(keys):
    idx_map, idx = {}, 0
    for key in keys:
        idx_map[key] = idx
        idx += 1
    return idx_map

dataset_path = './dataset/standardized_dataset.txt'
dataset = attach_tags(dataset_path)

word_embedding_model = train_word_embedding_model(dataset_path, retrain=True)

all_tags = tags 
all_tags.add(OTHER_TAG)
all_tags.add(PUNCT_TAG)
tag_to_idx = map_to_index(all_tags)

X, y = [], []
for ref in dataset:
    X_ref, y_ref = [], []
    for token, tag in ref:
        X_ref.append(word_embedding_model.get_word_vector(token))
        y_ref.append(tag_to_idx[tag])
    X.append(X_ref)
    y.append(y_ref)


In [21]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
    
        self.lstm = nn.LSTM(
            input_size = self.input_size,
            hidden_size = self.hidden_size,
            num_layers = self.num_layers,
            batch_first = True
        )

    def forward(self, x):
        # Initialise hidden state
        print(self.num_layers)
        print(x.size(0))
        print(self.hidden_size)
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)

        # Initialise internal state
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)

        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(embedding, (h_0, c_0))
        
        return output, (hn, cn)


In [22]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        
        self.lstm = LSTM(input_size, hidden_size, output_size, num_layers)
        self.fc1 = nn.Linear(hidden_size, 128)
        self.fc = nn.Linear(128, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        output, (hn, cn) = self.lstm(x)
        hn = hn.view(-1, self.hidden_size)

        output = self.relu(hn)
        output = self.fc1(output)
        output = self.relu(output)
        output = self.fc(output)

        return output


In [23]:
'''
Hyperparameters
'''
num_epochs = 1000
learning_rate = 0.001

input_size = 5 # Number of features
hidden_size = 2 # Number of features in the hidden state
num_layers = 1 # Number of stacked LSTM layers

output_size = 2 # Number of output classes

model = Net(input_size, hidden_size, output_size, num_layers)

In [24]:
'''
Loss Function and Optimiser
'''
criterion = nn.CrossEntropyLoss() 
optimiser = torch.optim.Adam(model.parameters(), lr = learning_rate)

In [27]:
# TODO: Obtain data x_train and y_train
import numpy as n
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test = np.array(X_train), np.array(X_test)
for epoch in range(num_epochs):
    outputs = model.forward(X_train)
    optimizer.zero_grad()

    # Get the loss function
    loss = criterion(outputs, y_train)

    # Calculate loss
    loss.backward()

    # Backpropagation
    optimizer.step()

    # Print loss at every 100th epoch
    if epoch % 100 == 0:
        print("Epoch: %d, loss: %1.5f" % (epoch, loss.item()))

  X_train, X_test = np.array(X_train), np.array(X_test)


TypeError: 'int' object is not callable