# **ACL-19 Paper Experiment: Relational Word Embedding**

[Paper](https://aclanthology.org/P19-1318.pdf) -
[Repository](https://github.com/pedrada88/rwe/)

## **A. Import Packages & Data**

In [1]:
# Import required packages
import sys
import random
import torch
import numpy as np
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

## **B. Define Networks**

In [3]:
# Define neural network model
class RWE_Model(torch.nn.Module):
    def __init__(
        self,
        embedding_size_input,
        embedding_size_output,
        embedding_weights,
        hidden_size,
        dropout,
    ):
        super(RWE_Model, self).__init__()
        self.embeddings = torch.nn.Embedding.from_pretrained(embedding_weights).float()
        self.embeddings.weight.requires_grad = True
        self.linear1 = torch.nn.Linear(embedding_size_input * 2, hidden_size)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(dropout)
        self.linear2 = torch.nn.Linear(hidden_size, embedding_size_output)

    def forward(self, input1, input2):
        embed1 = self.embeddings(input1)
        embed2 = self.embeddings(input2)
        out = self.linear1(
            torch.cat(((embed1 * embed2), (embed1 + embed2) / 2), 2)
        ).squeeze()
        out = self.relu(out)
        out = self.dropout(out)
        out = self.linear2(out)
        return out


# Define function to get the model
def getRWEModel(
    embedding_size_input, embedding_size_output, embedding_weights, hidden_size, dropout
):
    vocab_size = len(embedding_weights)
    model = RWE_Model(
        embedding_size_input,
        embedding_size_output,
        embedding_weights,
        hidden_size,
        dropout,
    )
    criterion = torch.nn.MSELoss()
    return model.cuda(), criterion

In [4]:
# @title Helper functions to train the model
def load_vocab_embeddings(input_path):
    first_line = True
    vocab = set()
    input_file_relations = open(input_file_relations, "r", encoding="utf-8")
    for line in input_file_relations:
        if first_line == True:
            first_line = False
        else:
            vocab.add(line.strip().split(" ")[0])
    return vocab


def load_word_vocab_from_relation_vectors(input_path):
    pre_word_vocab = set()
    first_line = True
    input_file_relations = open(input_path, "r", encoding="utf-8")
    for line in input_file_relations:
        linesplit = line.strip().split(" ")
        if first_line == True:
            first_line = False
        else:
            relation = linesplit[0]
            if "__" not in relation:
                sys.exit("ERROR: Pair '" + relation + "' does not contain underscore")
            relation_split = relation.rsplit("__", 1)
            word1 = relation_split[0]
            word2 = relation_split[1]
            pre_word_vocab.add(word1)
            pre_word_vocab.add(word2)
    return pre_word_vocab


def load_embeddings_filtered_byvocab(input_path, vocab):
    word2index = {}
    index2word = {}
    matrix_word_embeddings = []
    first_line = True
    input_file_relations = open(input_path, "r", encoding="utf-8")
    cont = 0
    for line in input_file_relations:
        linesplit = line.strip().split(" ")
        if first_line == True:
            dimensions = int(linesplit[1])
            first_line = False
        else:
            word = linesplit[0]
            if word in vocab and word not in word2index:
                word2index[word] = cont
                index2word[cont] = word
                cont += 1
                matrix_word_embeddings.append(
                    np.asarray([float(dim) for dim in linesplit[1 : dimensions + 1]])
                )
    return matrix_word_embeddings, word2index, index2word, dimensions


def load_training_data(input_path, matrix_word_embeddings, word2index):
    matrix_input = []
    matrix_output = []
    first_line = True
    input_file_relations = open(input_path, "r", encoding="utf-8")
    for line in input_file_relations:
        linesplit = line.strip().split(" ")
        if first_line == True:
            dimensions = int(str(line.split(" ")[1]))
            first_line = False
        else:
            relation = linesplit[0]
            if "__" not in relation:
                sys.exit("ERROR: Pair '" + relation + "' does not contain underscore")
            relation_split = relation.rsplit("__", 1)
            word1 = relation_split[0]
            word2 = relation_split[1]
            if word1 in word2index and word2 in word2index:
                matrix_input.append(np.asarray([word2index[word1], word2index[word2]]))
                matrix_output.append(
                    np.asarray([float(dim) for dim in linesplit[1 : dimensions + 1]])
                )
    return matrix_input, matrix_output, dimensions


def split_training_data(matrix_input, matrix_output, devsize, batchsize):
    matrix_input_train = []
    matrix_output_train = []
    matrix_input_dev = []
    matrix_output_dev = []
    num_instances = int((len(matrix_input) // batchsize) * batchsize)
    final_size_dev = int(((num_instances * devsize) // batchsize) * batchsize)
    final_size_train = int(((num_instances - final_size_dev) // batchsize) * batchsize)
    print("Size train set: " + str(final_size_train))
    print("Size dev set: " + str(final_size_dev))
    all_instances = range(num_instances)
    list_index_dev = random.sample(all_instances, final_size_dev)
    for i in range(num_instances):
        if i in list_index_dev:
            matrix_input_dev.append(matrix_input[i])
            matrix_output_dev.append(matrix_output[i])
        else:
            matrix_input_train.append(matrix_input[i])
            matrix_output_train.append(matrix_output[i])
    return matrix_input_train, matrix_output_train, matrix_input_dev, matrix_output_dev


def trainIntervals(model, optimizer, criterion, batches, interval=100, lr=0.1):
    i = 0
    n = 0
    trainErr = 0
    for x1, x2, y in zip(*batches):
        model.train()
        optimizer.zero_grad()
        trainErr += gradUpdate(model, x1, x2, y, criterion, optimizer, lr)
        i += 1
        if i == interval:
            n += 1
            prev_train_err = trainErr
            trainErr = 0
            i = 0
    if i > 0 and prev_train_err != 0:
        print("Training error: " + str(prev_train_err / float(i)))


def validate(model, batches, criterion):
    evalErr = 0
    n = 0
    model.eval()
    for x1, x2, y in zip(*batches):
        y = torch.autograd.Variable(y, requires_grad=False)
        x1 = torch.autograd.Variable(x1, requires_grad=False)
        x2 = torch.autograd.Variable(x2, requires_grad=False)
        output = model(x1, x2)
        error = criterion(output, y)
        evalErr += error.item()
        n += 1
    return evalErr / n


def gradUpdate(model, x1, x2, y, criterion, optimizer, lr):
    output = model(x1, x2)
    error = criterion(output, y)
    error.backward()
    optimizer.step()
    return error.item()


def getBatches(data, batchSize):
    embsize = int(data.size(-1))
    return data.view(-1, batchSize, embsize)

In [5]:
# Define function to train the model
def trainEpochs(
    model,
    optimizer,
    criterion,
    trainBatches,
    validBatches,
    epochs=10,
    interval=100,
    lr=0.1,
):
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, patience=2, threshold=1e-7, factor=0.9
    )
    min_error = -1.0
    for epoch in range(1, epochs + 1):
        print("\n     ----------    \n")
        print("EPOCH " + str(epoch))
        print("Starting training epoch " + str(epoch))
        trainIntervals(model, optimizer, criterion, trainBatches, interval, lr)
        validErr = validate(model, validBatches, criterion)
        scheduler.step(validErr)
        print("Validation error : " + str(validErr))
        if validErr < min_error or min_error == -1.0:
            new_model = model
            min_error = validErr
            print(
                "[Model at epoch "
                + str(epoch)
                + " obtained the lowest development error rate so far.]"
            )
        # if epoch % 5 == 0 or epoch == 1: torch.save(model, f"./model-epoch{str(epoch)}.model")
        torch.save(model, f"epoch-{epoch}.model")
        print("Epoch " + str(epoch) + " done")
    return new_model

In [6]:
# @title Define driver function to actually load the data for the model training
def train_rwe(
    word_embeddings_path,
    rel_embeddings_path,
    output_path,
    hidden_size=0,
    dropout=0.5,
    epochs=5,
    interval=100,
    batchsize=10,
    dev_size=0.015,
    lr=0.01,
):
    if dev_size >= 1 or dev_size < 0:
        raise Exception(
            "Development data should be between 0% (0.0) and 100% (1.0) of the training data"
        )

    print("Loading word vocabulary...")
    pre_word_vocab = load_word_vocab_from_relation_vectors(rel_embeddings_path)
    print(
        "Word vocabulary loaded succesfully ("
        + str(len(pre_word_vocab))
        + " words). Now loading word embeddings..."
    )
    (
        matrix_word_embeddings,
        word2index,
        index2word,
        dims_word,
    ) = load_embeddings_filtered_byvocab(word_embeddings_path, pre_word_vocab)
    pre_word_vocab.clear()
    print(
        "Word embeddings loaded succesfully ("
        + str(dims_word)
        + " dimensions). Now loading relation vectors..."
    )
    matrix_input, matrix_output, dims_rels = load_training_data(
        rel_embeddings_path, matrix_word_embeddings, word2index
    )
    print(
        "Relation vectors loaded ("
        + str(dims_rels)
        + " dimensions), now spliting training and dev..."
    )
    random.seed(21)
    s1 = random.getstate()
    random.shuffle(matrix_input)
    random.setstate(s1)
    random.shuffle(matrix_output)
    (
        matrix_input_train,
        matrix_output_train,
        matrix_input_dev,
        matrix_output_dev,
    ) = split_training_data(matrix_input, matrix_output, dev_size, batchsize)
    matrix_input.clear()
    matrix_output.clear()
    print("Done preprocessing all the data, now loading and training the model...\n")

    if hidden_size == 0:
        hidden_size = dims_word * 2
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Device used: " + str(device))
    embedding_weights = torch.tensor(matrix_word_embeddings)
    matrix_word_embeddings.clear()
    tensor_input_train_1 = torch.LongTensor([[x[0]] for x in matrix_input_train])
    tensor_input_train_2 = torch.LongTensor([[x[1]] for x in matrix_input_train])
    matrix_input_train.clear()
    tensor_input_dev_1 = torch.LongTensor([[x[0]] for x in matrix_input_dev])
    tensor_input_dev_2 = torch.LongTensor([[x[1]] for x in matrix_input_dev])
    matrix_input_dev.clear()
    tensor_output_train = torch.FloatTensor(matrix_output_train)
    matrix_output_train.clear()
    tensor_output_dev = torch.FloatTensor(matrix_output_dev)
    matrix_output_dev.clear()
    model, criterion = getRWEModel(
        dims_word, dims_rels, embedding_weights, hidden_size, dropout
    )
    print("RWE model loaded.")
    optimizer = torch.optim.Adam(model.parameters(), lr)
    trainX1batches = getBatches(tensor_input_train_1.cuda(), batchsize)
    trainX2batches = getBatches(tensor_input_train_2.cuda(), batchsize)
    validX1Batches = getBatches(tensor_input_dev_1.cuda(), batchsize)
    validX2Batches = getBatches(tensor_input_dev_2.cuda(), batchsize)
    trainYBatches = getBatches(tensor_output_train.cuda(), batchsize)
    validYBatches = getBatches(tensor_output_dev.cuda(), batchsize)
    print("Now starting training...\n")
    output_model = trainEpochs(
        model,
        optimizer,
        criterion,
        (trainX1batches, trainX2batches, trainYBatches),
        (validX1Batches, validX2Batches, validYBatches),
        epochs,
        interval,
        lr,
    )
    print(
        "\nTraining finished. Now loading relational word embeddings from trained model..."
    )

    parameters = list(output_model.parameters())
    num_vectors = len(parameters[0])
    print("Number of vectors: " + str(num_vectors))
    num_dimensions = len(parameters[0][0])
    print("Number of dimensions output embeddings: " + str(num_dimensions))
    txtfile = open(output_path, "w", encoding="utf8")
    txtfile.write(str(num_vectors) + " " + str(num_dimensions) + "\n")
    if num_vectors != embedding_weights.size()[0]:
        print(
            "Something is wrong in the input vectors: "
            + str(embedding_weights.size()[0])
            + " != "
            + str(num_vectors)
        )
    for i in range(num_vectors):
        word = index2word[i]
        txtfile.write(word)
        vector = parameters[0][i].cpu().detach().numpy()
        for dimension in vector:
            txtfile.write(" " + str(dimension))
        txtfile.write("\n")
    txtfile.close()
    print("\nFINISHED. Word embeddings stored at " + output_path)

## **C. Train Model**

In [None]:
# train the model
train_rwe(
    "ft_word_embeddings.txt",
    "relative_init_vectors.txt",
    "rwe_embeddings.txt",
    epochs=10,
)

Loading word vocabulary...
Word vocabulary loaded succesfully (38896 words). Now loading word embeddings...
Word embeddings loaded succesfully (300 dimensions). Now loading relation vectors...
Relation vectors loaded (300 dimensions), now spliting training and dev...
Size train set: 306910
Size dev set: 4670
Done preprocessing all the data, now loading and training the model...

Device used: cuda


  embedding_weights = torch.tensor(matrix_word_embeddings)


RWE model loaded.
Now starting training...


     ----------    

EPOCH 1
Starting training epoch 1
Training error: 2.0424039929613964
Validation error : 3.102219146080076
[Model at epoch 1 obtained the lowest development error rate so far.]
Epoch 1 done

     ----------    

EPOCH 2
Starting training epoch 2
Training error: 2.0074428584795077
Validation error : 3.093893662940639
[Model at epoch 2 obtained the lowest development error rate so far.]
Epoch 2 done

     ----------    

EPOCH 3
Starting training epoch 3
Training error: 2.007610808239206
Validation error : 3.0983719646819017
Epoch 3 done

     ----------    

EPOCH 4
Starting training epoch 4
Training error: 2.010543911789472
Validation error : 3.098701028404578
Epoch 4 done

     ----------    

EPOCH 5
Starting training epoch 5
Training error: 2.0085147973764075
Validation error : 3.0984100414882763
Epoch 5 done

     ----------    

EPOCH 6
Starting training epoch 6


In [None]:
# from google.colab import files
# files.download('/content/rwe_embeddings.txt')