In [1]:
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR, StepLR
from torch.utils.data import DataLoader


def create_word_vec_dict():
    """Load pretrained vectors and their vocabulary from the working directory.

    Reads "pretrained vectors.txt" (one whitespace-separated vector per row)
    and "words.txt" (one word per line); row ``i`` of the vector file belongs
    to line ``i`` of the words file.

    Returns:
        tuple: ``(words2vecs, words2inx)`` where ``words2vecs`` maps each
        distinct word to the vector of its FIRST occurrence and ``words2inx``
        maps each distinct word to its rank among distinct words (0, 1, 2, ...
        in order of first appearance).

    Raises:
        IndexError: if the words file has more lines than the vector file has rows.
    """
    # ndmin=2 keeps a single-row file as a (1, dim) matrix; without it
    # np.loadtxt returns a 1-D array and vecs[i] would be a scalar, not a vector.
    vecs = np.loadtxt("pretrained vectors.txt", ndmin=2)
    # Explicit encoding, consistent with the other file readers in this notebook.
    with open("words.txt", 'r', encoding='utf-8') as words_file:
        words = words_file.read().splitlines()

    words2vecs = dict()
    words2inx = dict()
    for row, word in enumerate(words):
        if word in words2vecs:
            # Duplicate word: keep the first vector/index, but the row counter
            # still advances so later words stay aligned with their rows.
            continue
        # The index counts distinct words only, so it is dense even when the
        # vocabulary file contains repeats.
        words2inx[word] = len(words2inx)
        words2vecs[word] = vecs[row]
    return words2vecs, words2inx

ModuleNotFoundError: No module named 'torch'

In [None]:
def parse_NER(file_path, window_size):
    """Parse a tab-separated NER file into fixed-size context windows.

    The file holds one ``word<TAB>label`` pair per line, with sentences
    separated by blank lines. Every real word becomes one sample: its label
    plus the ``2 * window_size + 1`` words centred on it. Sentence edges are
    padded with ``window_size`` copies of ``'<S>'`` / ``'<E>'``, so the padding
    always matches the requested window.

    Args:
        file_path: path of the annotated file (read as UTF-8).
        window_size: number of context words on EACH side of the centre word.

    Returns:
        tuple: ``(dataset, word2index, index2word, label2index, index2label)``.
        ``dataset`` is a list of ``(label, [words...])`` tuples holding strings;
        the four dictionaries start with the special entries at indices 0-2 and
        grow in order of first appearance.

    Raises:
        ValueError: if ``window_size`` is negative, or a non-blank line does
            not split into exactly two tab-separated fields.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    # initialize dictionaries
    word2index = {'<S>': 0, '<E>': 1, '<U>': 2}
    index2word = {0: '<S>', 1: '<E>', 2: '<U>'}
    label2index = {'<START>': 0, '<END>': 1, '<UNSEEN>': 2}
    index2label = {0: '<START>', 1: '<END>', 2: '<UNSEEN>'}
    dataset = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # split into sentences (separated by blank rows)
        sentences = f.read().split('\n\n')

    span = 2 * window_size + 1
    for sentence in sentences:
        # Skip blank lines: a file ending in a single newline leaves an empty
        # trailing line that would otherwise break the word/label unpacking.
        pairs = []
        for line in sentence.split('\n'):
            if not line.strip():
                continue
            word, ner = line.split('\t')
            pairs.append((word, ner))
        if not pairs:
            continue

        # Pad with exactly window_size markers per side so every real word has
        # a full window and no padding token is ever used as a centre word.
        padded = (['<S>'] * window_size
                  + [word for word, _ in pairs]
                  + ['<E>'] * window_size)

        for offset, (word, ner) in enumerate(pairs):
            # padded[offset + window_size] is the centre word, so the window
            # starts at `offset`.
            dataset.append((ner, padded[offset: offset + span]))

            # keep track of word and index
            if word not in word2index:
                index = len(word2index)
                word2index[word] = index
                index2word[index] = word

            # keep track of label and index
            if ner not in label2index:
                index = len(label2index)
                label2index[ner] = index
                index2label[index] = ner

    return dataset, word2index, index2word, label2index, index2label

In [None]:
def parse_POS(file_path, window_size, pretrained=False):
    """Parse a space-separated POS file into fixed-size context windows.

    The file holds one ``word label`` pair per line (single space separator),
    with sentences separated by blank lines. Every real word becomes one
    sample: its tag plus the ``2 * window_size + 1`` words centred on it.
    Sentence edges are padded with ``window_size`` copies of ``'<S>'`` /
    ``'<E>'``, so the padding always matches the requested window.

    Args:
        file_path: path of the annotated file (read as UTF-8).
        window_size: number of context words on EACH side of the centre word.
        pretrained: accepted for call compatibility; not used by this function.

    Returns:
        tuple: ``(dataset, word2index, index2word, label2index, index2label)``.
        ``dataset`` is a list of ``(tag, [words...])`` tuples holding strings;
        the four dictionaries start with the special entries at indices 0-2 and
        grow in order of first appearance.

    Raises:
        ValueError: if ``window_size`` is negative, or a non-blank line does
            not split into exactly two space-separated fields.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    # initialize dictionaries
    word2index = {'<S>': 0, '<E>': 1, '<U>': 2}
    index2word = {0: '<S>', 1: '<E>', 2: '<U>'}
    label2index = {'<START>': 0, '<END>': 1, '<UNSEEN>': 2}
    index2label = {0: '<START>', 1: '<END>', 2: '<UNSEEN>'}
    dataset = []

    with open(file_path, 'r', encoding='utf-8') as f:
        # split into sentences (separated by blank rows)
        sentences = f.read().split('\n\n')

    span = 2 * window_size + 1
    for sentence in sentences:
        # Skip blank lines: a file ending in a single newline leaves an empty
        # trailing line that would otherwise break the word/tag unpacking.
        pairs = []
        for line in sentence.split('\n'):
            if not line.strip():
                continue
            word, pos = line.split(' ')
            pairs.append((word, pos))
        if not pairs:
            continue

        # Pad with exactly window_size markers per side so every real word has
        # a full window and no padding token is ever used as a centre word.
        padded = (['<S>'] * window_size
                  + [word for word, _ in pairs]
                  + ['<E>'] * window_size)

        for offset, (word, pos) in enumerate(pairs):
            # padded[offset + window_size] is the centre word, so the window
            # starts at `offset`.
            dataset.append((pos, padded[offset: offset + span]))

            # keep track of word and index
            if word not in word2index:
                index = len(word2index)
                word2index[word] = index
                index2word[index] = word

            # keep track of label and index
            if pos not in label2index:
                index = len(label2index)
                label2index[pos] = index
                index2label[index] = pos

    return dataset, word2index, index2word, label2index, index2label

In [None]:
def convert_dataset_to_index(dataset, word2index, label2index):
    """Replace the strings of every sample with their integer indices, in place.

    Each entry of ``dataset`` is a ``(label, words)`` pair. The ``words`` list
    is overwritten element-wise with vocabulary indices (unknown words fall
    back to the ``'<U>'`` index) and the entry itself is replaced by a new
    ``(label_index, words)`` tuple (unknown labels fall back to ``'<UNSEEN>'``).

    Returns:
        The same ``dataset`` list object that was passed in.
    """
    for position, (tag, window) in enumerate(dataset):
        # Slice assignment keeps the original list object, so the conversion
        # stays an in-place mutation of the window.
        window[:] = [word2index.get(token, word2index['<U>']) for token in window]
        dataset[position] = (label2index.get(tag, label2index['<UNSEEN>']), window)

    return dataset

In [None]:
def parse_test_file(file_path, window_size):
    """Parse an unlabeled file (one word per line) into context windows.

    Sentences are separated by blank lines. Every real word yields one window
    of ``2 * window_size + 1`` words centred on it; sentence edges are padded
    with ``window_size`` copies of ``'<S>'`` / ``'<E>'`` so the padding always
    matches the requested window.

    Args:
        file_path: path of the unlabeled file (read as UTF-8).
        window_size: number of context words on EACH side of the centre word.

    Returns:
        list: one list of word strings per word in the file, in file order.

    Raises:
        ValueError: if ``window_size`` is negative.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    with open(file_path, 'r', encoding='utf-8') as f:
        # split into sentences (separated by blank rows)
        sentences = f.read().split('\n\n')

    dataset = []
    span = 2 * window_size + 1
    for sentence in sentences:
        # Drop blank lines so a trailing newline at the end of the file does
        # not leak an empty string into the windows as if it were a word.
        words = [line for line in sentence.split('\n') if line.strip()]
        if not words:
            continue

        padded = ['<S>'] * window_size + words + ['<E>'] * window_size
        # Window k is centred on words[k], i.e. on padded[k + window_size].
        dataset.extend(padded[start: start + span] for start in range(len(words)))

    return dataset

In [None]:
def save_model(model, train_loss_history, train_accuracy_history, dev_loss_history, dev_accuracy_history, path):
    """Serialize the model and its four training histories under ``path``.

    Each object is written with ``torch.save`` to its own ``*.path`` file
    inside the ``path`` directory. Nothing is returned.
    """
    # Destination file -> object to store; written in insertion order.
    targets = {
        f'{path}/model.path': model,
        f'{path}/train_loss_history.path': train_loss_history,
        f'{path}/train_accuracy_history.path': train_accuracy_history,
        f'{path}/dev_loss_history.path': dev_loss_history,
        f'{path}/dev_accuracy_history.path': dev_accuracy_history,
    }
    for destination, payload in targets.items():
        torch.save(payload, destination)


def load_model(model_path, train_loss_history_path, train_accuracy_history_path, dev_loss_history_path, dev_accuracy_history_path):
    """Load a model and its four training histories from the given files.

    Every path is read with ``torch.load``, in argument order.

    Returns:
        tuple: ``(model, train_loss_history, train_accuracy_history,
        dev_loss_history, dev_accuracy_history)``.
    """
    # NOTE(review): torch.load unpickles its input; only point this at files
    # produced by a trusted run.
    sources = (
        model_path,
        train_loss_history_path,
        train_accuracy_history_path,
        dev_loss_history_path,
        dev_accuracy_history_path,
    )
    return tuple(torch.load(source) for source in sources)

In [None]:
def draw_graphs(train_history, dev_history, n_epochs, plot_title, train_title, dev_title):
    """Plot the train and dev histories side by side and show the figure.

    Args:
        train_history: per-epoch values drawn on the left panel.
        dev_history: per-epoch values drawn on the right panel.
        n_epochs: number of epochs; the x axis runs from 1 to ``n_epochs``.
        plot_title: title of the whole figure.
        train_title: title of the left panel.
        dev_title: title of the right panel.
    """
    figure, axes = plt.subplots(1, 2)
    figure.suptitle(plot_title)

    # Epochs are reported 1-based on the x axis.
    epochs = torch.arange(n_epochs) + 1

    panels = ((train_title, train_history), (dev_title, dev_history))
    for axis, (title, history) in zip(axes, panels):
        axis.set_title(title)
        axis.plot(epochs, history)

    plt.show()


In [None]:
class Tagger1Model(nn.Module):
    """Window-based tagger: embeddings -> tanh hidden layer -> log-softmax.

    The embeddings of the ``num_words`` words in a window are concatenated
    into one vector, passed through a linear layer with tanh and dropout
    (p=0.5), and projected to ``out_dim`` log-probabilities.
    """

    def __init__(self, vocab_size, embed_size, num_words, hidden_dim, out_dim):
        """Build the layers.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_size: dimension of each word embedding.
            num_words: number of words in one input window.
            hidden_dim: width of the hidden layer.
            out_dim: number of output labels.
        """
        super().__init__()
        self.num_words = num_words
        self.embed_size = embed_size
        self.embed_layer = nn.Embedding(vocab_size, embed_size)

        # The hidden layer consumes the concatenation of all window embeddings.
        window_dim = num_words * embed_size
        self.layer1 = nn.Linear(window_dim, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, out_dim)
        self.dropout = nn.Dropout(0.5)

        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, words_idxs):
        """Return log-probabilities over the labels for each window of indices."""
        embedded = self.embed_layer(words_idxs)
        # Flatten each window's embeddings into one row.
        flat = embedded.view(-1, self.num_words * self.embed_size)

        hidden = self.dropout(torch.tanh(self.layer1(flat)))
        logits = self.layer2(hidden)

        return self.softmax(logits)

In [None]:
def train_model(train_set, dev_set, model, n_epochs, lr, device, index2word, index2label, is_pos=False):
    """Train ``model`` for ``n_epochs``, then save it and plot its histories.

    Each epoch runs one training pass, measures accuracy on the train set and
    loss/accuracy on the dev set, and steps the learning-rate schedule. After
    the last epoch the model and the four histories are saved to the current
    directory and the loss/accuracy curves are drawn.

    Args:
        train_set: iterable of training batches, passed to ``train``/``evaluate``.
        dev_set: iterable of validation batches, passed to ``evaluate``.
        model: the ``nn.Module`` to optimize (moved to ``device``).
        n_epochs: number of epochs to run.
        lr: initial Adam learning rate.
        device: torch device used for the model and the batches.
        index2word: accepted for call compatibility; not used here.
        index2label: label index -> label string, forwarded to ``evaluate``.
        is_pos: forwarded to ``evaluate`` to select the accuracy definition.
    """
    model.to(device)
    model.train()

    optimizer = optim.Adam(params=model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    # Learning rate is multiplied by 0.1 every 4 epochs.
    scheduler = StepLR(optimizer, step_size=4, gamma=0.1)
    # scheduler = ExponentialLR(optimizer, gamma=0.5)

    train_losses = []
    train_accuracy = []
    dev_losses = []
    dev_accuracy = []

    for e in range(n_epochs):
        train_loss = train(model, train_set, optimizer, criterion, device)

        # Measure with dropout disabled and without building autograd graphs;
        # otherwise the reported metrics are those of a randomly thinned network.
        model.eval()
        with torch.no_grad():
            _, train_acc = evaluate(model, train_set, criterion, device, index2label, is_pos)
            dev_loss, accuracy = evaluate(model, dev_set, criterion, device, index2label, is_pos)
        model.train()

        train_losses.append(train_loss)
        train_accuracy.append(train_acc)
        dev_losses.append(dev_loss)
        dev_accuracy.append(accuracy)

        scheduler.step()

        print(f'[{e + 1}/{n_epochs}] train loss: {train_loss}, train accuracy: {train_acc}%,'
              f' dev loss: {dev_loss}, dev accuracy: {accuracy}%')

    save_model(model, train_losses, train_accuracy, dev_losses, dev_accuracy, '.')

    # draw graphs of loss and accuracy history
    draw_graphs(train_losses, dev_losses, n_epochs, 'Loss History', 'Train Loss', 'Validation Loss')
    draw_graphs(train_accuracy, dev_accuracy, n_epochs, 'Accuracy History', 'Train Accuracy', 'Validation Accuracy')

In [None]:
def train(model, train_set, optimizer, criterion, device):
    """Run one optimization pass over ``train_set`` and return the mean loss.

    Args:
        model: the ``nn.Module`` being trained; switched to train mode here.
        train_set: iterable of ``(labels_batch, words_batch)`` pairs, where
            ``labels_batch`` is a tensor of label indices and ``words_batch``
            is a sequence of per-position index tensors (stacked along dim 1
            to form a ``(batch, window)`` tensor).
        optimizer: optimizer over ``model``'s parameters.
        criterion: loss taking ``(outputs, labels)``.
        device: torch device the batches are moved to.

    Returns:
        float: loss averaged over the samples actually seen (``0.0`` when
        ``train_set`` yields no batches).
    """
    model.train()

    # A mean-reduced criterion returns one per-batch average, so it is
    # re-weighted by the batch size to obtain a true per-sample mean.
    mean_reduced = getattr(criterion, 'reduction', 'mean') == 'mean'

    total_loss = 0.0
    seen = 0
    for labels_batch, words_batch in train_set:
        words_batch = torch.stack(words_batch, dim=1).to(device)
        labels_batch = labels_batch.to(device)
        batch_size = labels_batch.size(0)

        optimizer.zero_grad()

        # predict; reshape to (batch, classes) explicitly -- a bare squeeze()
        # would also drop the batch dimension when a batch holds one sample.
        outputs = model(words_batch).reshape(batch_size, -1)

        loss = criterion(outputs, labels_batch)
        loss.backward()

        # backwards step
        optimizer.step()

        total_loss += loss.item() * batch_size if mean_reduced else loss.item()
        seen += batch_size

    # Divide by the samples seen, not by the dataset size: batches dropped by
    # the loader never contributed to the sum.
    return total_loss / seen if seen else 0.0


def evaluate(model, dev_set, criterion, device, index2label, is_pos):
    """Compute mean loss and accuracy (in percent) of ``model`` on ``dev_set``.

    The model is put in eval mode and gradients are disabled for the duration
    of the call; the previous train/eval mode is restored afterwards.

    Accuracy definition:
      * ``is_pos`` true  -- plain accuracy over all samples.
      * ``is_pos`` false -- samples where the prediction and the gold label
        agree AND the label string is ``'O'`` are ignored entirely (neither
        counted as correct nor in the total), because 'O' dominates the data.

    Args:
        model: the ``nn.Module`` to evaluate.
        dev_set: iterable of ``(labels_batch, words_batch)`` pairs, where
            ``words_batch`` is a sequence of per-position index tensors.
        criterion: loss taking ``(outputs, labels)``.
        device: torch device the batches are moved to.
        index2label: label index -> label string (used to detect ``'O'``).
        is_pos: selects the accuracy definition described above.

    Returns:
        tuple: ``(mean_loss, accuracy_percent)``; accuracy is rounded to three
        decimals. Both are ``0.0`` when there is nothing to average over.
    """
    # A mean-reduced criterion returns one per-batch average, so it is
    # re-weighted by the batch size to obtain a true per-sample mean.
    mean_reduced = getattr(criterion, 'reduction', 'mean') == 'mean'

    total_loss = 0.0
    seen = 0
    correct = 0.0
    total = 0.0

    was_training = model.training
    model.eval()  # disable dropout while measuring
    try:
        with torch.no_grad():
            for labels_batch, words_batch in dev_set:
                words_batch = torch.stack(words_batch, dim=1).to(device)
                labels_batch = labels_batch.to(device)
                batch_size = labels_batch.size(0)

                # predict; reshape to (batch, classes) explicitly -- a bare
                # squeeze() would drop the batch dimension for a 1-sample batch.
                outputs = model(words_batch).reshape(batch_size, -1)

                loss = criterion(outputs, labels_batch)
                total_loss += loss.item() * batch_size if mean_reduced else loss.item()
                seen += batch_size

                predictions = torch.argmax(outputs, dim=1)

                if is_pos:
                    correct += (predictions == labels_batch).sum().item()
                    total += batch_size
                else:
                    pairs = zip(predictions.tolist(), labels_batch.tolist())
                    for prediction, real_label in pairs:
                        if prediction != real_label:
                            # wrong prediction: counts towards the total only
                            total += 1
                        elif index2label[prediction] != 'O':
                            # correct non-'O' prediction
                            total += 1
                            correct += 1
                        # correct 'O' predictions are skipped on purpose
    finally:
        model.train(was_training)

    mean_loss = total_loss / seen if seen else 0.0
    # total can be zero when every counted pair was a correct 'O'.
    accuracy = round(100 * correct / total, 3) if total else 0.0
    return mean_loss, accuracy


def predict(test_set, model, device, words2index, index2label):
    """Predict a label string for every window in ``test_set``.

    Args:
        test_set: iterable of batches. Each batch is a sequence with one entry
            per window position; an entry is either a tensor of word indices
            or a sequence of word strings (converted with ``words2index``).
        model: the ``nn.Module`` used for prediction (moved to ``device`` and
            put in eval mode).
        device: torch device the batches are moved to.
        words2index: word -> index mapping; unknown words use ``'<U>'``.
        index2label: label index -> label string.

    Returns:
        list: one label string per window, in input order.
    """
    model.to(device)
    model.eval()

    predicted_labels = []

    with torch.no_grad():
        for data in test_set:
            # NOTE(review): string entries are presumably what a DataLoader
            # yields for raw word windows -- confirm against the caller.
            # Tensor entries are used unchanged.
            columns = [
                column if torch.is_tensor(column)
                else torch.tensor([words2index.get(word, words2index['<U>']) for word in column])
                for column in data
            ]
            words_batch = torch.stack(columns, dim=1).to(device)

            # predict
            outputs = model(words_batch).reshape(words_batch.size(0), -1)

            # One label index per row: argmax without `dim` would collapse the
            # whole batch into a single flat position.
            indices = torch.argmax(outputs, dim=1)

            # Convert to plain ints: a tensor is not a usable key for the
            # int-keyed index2label dictionary.
            predicted_labels.extend(index2label[index] for index in indices.tolist())

    return predicted_labels

In [None]:
# Load the data with the parsing helpers defined in the cells above. They are
# called directly: no `utils` module is imported in this notebook, so a
# `utils.` prefix fails with NameError on a fresh kernel.

# POS alternative (swap with the NER block below):
# pos_train_set, word2index, index2word, label2index, index2label = parse_POS('./pos/train', window_size=2)
# pos_train_set = convert_dataset_to_index(pos_train_set, word2index, label2index)
#
# pos_dev_set, _, _, _, _ = parse_POS('./pos/dev', window_size=2)
# pos_dev_set = convert_dataset_to_index(pos_dev_set, word2index, label2index)

# The vocabularies are built from the training file only.
ner_train_set, word2index, index2word, label2index, index2label = parse_NER('./ner/train', window_size=2)
ner_train_set = convert_dataset_to_index(ner_train_set, word2index, label2index)

# The dev set is indexed with the TRAIN vocabularies, so unseen words/labels
# map to the '<U>' / '<UNSEEN>' entries.
ner_dev_set, _, _, _, _ = parse_NER('./ner/dev', window_size=2)
ner_dev_set = convert_dataset_to_index(ner_dev_set, word2index, label2index)

In [2]:
# Run configuration and training entry point. The model class and training
# function are the ones defined in the cells above; no `tagger1` module is
# imported in this notebook, so they are called directly.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
is_pos = False

# define model's parameters
vocab_size = len(word2index.keys())
embed_size = 50
num_words = 5  # window length: 2 context words on each side + the centre word
out_dim = len(label2index.keys())

if is_pos:
    lr = 1e-3
    n_epochs = 7
    batch_size_train = 32
    batch_size_dev = 32
    hidden_dim = 150
else:
    lr = 1e-3
    n_epochs = 8
    batch_size_train = 32
    batch_size_dev = 32
    hidden_dim = 150

print(f'Run config - is POS: {is_pos}, vocab size: {vocab_size}, embed size: {embed_size}, window size: {num_words},'
      f' hidden layer size: {hidden_dim}, labels size: {out_dim}, LR: {lr}, epochs: {n_epochs},'
      f' train batch size: {batch_size_train}, dev batch size: {batch_size_dev}')


# define train dataloader
train_data = DataLoader(ner_train_set, batch_size=batch_size_train, shuffle=True, drop_last=True, pin_memory=True, num_workers=4)
# train_data = DataLoader(pos_train_set, batch_size=batch_size_train, shuffle=True, drop_last=True, pin_memory=True, num_workers=4)

# define dev dataloader
# NOTE(review): drop_last=True also discards the final partial dev batch, so
# those samples are never evaluated -- confirm this is intended.
dev_data = DataLoader(ner_dev_set, batch_size=batch_size_dev, shuffle=False, drop_last=True, pin_memory=True, num_workers=4)
# dev_data = DataLoader(pos_dev_set, batch_size=batch_size_dev, shuffle=False, drop_last=True, pin_memory=True, num_workers=4)

model = Tagger1Model(vocab_size, embed_size, num_words, hidden_dim, out_dim)

train_model(train_data, dev_data, model, n_epochs, lr, device, index2word, index2label, is_pos)

# To reload a previous run instead of training:
# path = './pos results part 1'

# model, train_loss_history, train_accuracy_history, dev_loss_history, dev_accuracy_history = load_model(
#     f'{path}/model.path', f'{path}/train_loss_history.path', f'{path}/train_accuracy_history.path',
#     f'{path}/dev_loss_history.path', f'{path}/dev_accuracy_history.path'
# )

NameError: name 'torch' is not defined