# Attention-Based Bidirectional Long Short-Term Memory Networks for Relation Classification

## Imports

In [21]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

## Data Vectorization classes

### Vocabulary

In [22]:
class Vocabulary(object):
    """Class to process text and extract vocabulary for mapping"""
    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
        """

        if token_to_idx is None:
            token_to_idx = {}

        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}



    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx}
    
    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)
    
    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        
        return index
       
    def add_many(self, tokens):
        """Add a list of tokens into the Vocabulary
        
        Args:
            tokens (list): a list of string tokens
        Returns:
            indices (list): a list of indices corresponding to the tokens
        """
        return [self.add_token(token) for token in tokens]
    
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)
    
    def __len__(self):
        return len(self._token_to_idx)

### SequenceVocabulary

In [23]:
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx=None, unk_token="<UNK>", mask_token="<MASK>"):
        super(SequenceVocabulary, self).__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token

        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
    
    def to_serializable(self):
        """
            从原本的to_serializable()方法中获取一个content的dict, 然后再更新一些新的内容
        """
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         })
        return contents
    
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index) # get的意思是，在没有找到正确的index, 就返回unk_index
        else:
            return self._token_to_idx[token]

### Vectorizer

In [24]:
class NREVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""    
    def __init__(self, seq_vocab, relation_vocab):
        """
        Args:
            word_vocab (SequenceVocabulary): maps words to integers
            relation_vocab (Vocabulary): maps relation to integers
        """
        self.seq_vocab = seq_vocab
        self.relation_vocab = relation_vocab
    
    def vectorize(self, seq, vector_length = -1):
        """
        Args:
            seq (str): the string of words 
            vector_length (int): an argument for forcing the length of index vector
        Returns:
            the vetorized title (numpy.array)
        """
        indices = [self.seq_vocab.lookup_token(token) for token in seq]

        
        if vector_length < 0:
            vector_length = len(indices)
            
        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.word_vocab.mask_index

        return out_vector

    @classmethod
    def from_dataframe(cls, news_df):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            news_df (pandas.DataFrame): the target dataset
        Returns:
            an instance of the NREVectorizer
        """
        relation_vocab = Vocabulary()
        for relation in set(news_df.relation):
            relation_vocab.add_token(relation)
        
        seq_vocab = SequenceVocabulary()
        for word in news_df.sequence:
            seq_vocab.add_many(word)
        return cls(seq_vocab, relation_vocab)
        
    @classmethod
    def from_serializable(cls, contents):
        seq_vocab    =  SequenceVocabulary.from_serializable(contents['seq_vocab'])
        relation_vocab =  Vocabulary.from_serializable(contents['seq_vocab'])

    def to_serializable(self):
        return {'seq_vocab': self.seq_vocab.to_serializable(),
                'relation_vocab': self.relation_vocab.to_serializable()}


### The Dataset

In [8]:
class NREDataset(Dataset):
    def __init__(self, news_df, vectorizer):
        """
        Args:
            news_df (pandas.DataFrame): the dataset
            vectorizer (NewsVectorizer): vectorizer instatiated from dataset
        """
        self.news_df = news_df
        self._vectorizer = vectorizer

        
        self._max_seq_length = 50 

        self.train_df = self.news_df[self.news_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.news_df[self.news_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.news_df[self.news_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}
        
        self.set_split('train')


    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv):
        """Load dataset and make a new vectorizer from scratch
                
            Args:
                surname_csv (str): location of the dataset
            Returns:
                an instance of SurnameDataset
        """
        news_df = pd.read_csv(news_csv)
        train_news_df = news_df[news_df.split=='train']
        return cls(news_df, NREVectorizer.from_dataframe(train_news_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath):
        """Load dataset and the corresponding vectorizer. 
        Used in the case in the vectorizer has been cached for re-use
            
        Args:
            news_csv (str): location of the dataset
            vectorizer_filepath (str): location of the saved vectorizer
        Returns:
            an instance of NREDataset
        """
        news_df = pd.read_csv(news_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_df, vectorizer)

    @staticmethod  # https://blog.csdn.net/lihao21/article/details/79762681 实例方法/类方法/静态方法
    def load_vectorizer_only(vectorizer_filepath):
        """a static method for loading the vectorizer from file
            
        Args:
            vectorizer_filepath (str): the location of the serialized vectorizer
        Returns:
            an instance of SurnameVectorizer
        """
        with open(vectorizer_filepath) as fp:
            return NREVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """saves the vectorizer to disk using json
            
        Args:
            vectorizer_filepath (str): the location to save the vectorizer
        """
        with open(vectorizer_filepath, "w", encoding = "utf-16") as fp:
            json.dump(self._vectorizer.to_serializable(), fp, ensure_ascii = False)

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer
        
    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
        
    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
            
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        seq_vector = self._vectorizer.vectorize(row.sequence, self._max_seq_length)

        relation_index = self._vectorizer.relation_vocab.lookup_token(row.relation)

        index1 = row.sequence.index(row.entity_1)
        index2 = row.sequence.index(row.entity_2)
        pos1 = [(index - index1) for index, word in enumerate(row.sequence)]
        pos2 = [(index - index2) for index, word in enumerate(row.sequence)]

        return {'x_data': seq_vector,
                'y_target': relation_index,
                'pos1': pos1,
                'pos2': pos2
                }
        
    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
            
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    A generator function which wraps the PyTorch DataLoader. It will 
    ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                                shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

# The Model: Attention-Based Bidirectional Long Short-Term Memory Networks 

In [9]:
class NREModel(nn.Module):
    def __init__(self, config):
        """
            args:
                config: some configurations of this model
        """
        super(NREModel, self).__init__()

        self.batch           = config.batch
        
        self.embedding_size  = config.embedding_size
        self.embedding_dim   = config.embedding_dim

        self.hidden_dim      = config.hidden_dim
        self.tag_size        = config.tag_size
        
        self.pos_size        = config.pos_size
        self.pos_dim         = config.pos_dim

        self.pretrained      = config.pretrained

        if self.pretrained:
            self.word_embeds = nn.Embedding.from_pretrained(torch.FloatTensor(config.embedding), freeze=False)
        else:
            self.word_embeds = nn.Embedding(self.embedding_size,self.embedding_dim)
        
        self.pos1_embeds = nn.Embedding(self.pos_size,self.pos_dim)
        self.pos2_embeds = nn.Embedding(self.pos_size,self.pos_dim)

        self.relation_embeds = nn.Embedding(self.tag_size,self.hidden_dim)
        self.hidden2tag = nn.Linear(self.hidden_dim,self.tag_size)

        self.dropout_emb=nn.Dropout(p=0.5)
        self.dropout_lstm=nn.Dropout(p=0.5)
        self.dropout_att=nn.Dropout(p=0.5)

        self.hidden = self.init_hidden()

        self.att_weight = nn.Parameter(torch.randn(self.batch,1,self.hidden_dim))
        self.relation_bias = nn.Parameter(torch.randn(self.batch,self.tag_size,1))

    def init_hidden(self):
        return torch.randn(2, self.batch, self.hidden_dim // 2)
        # https://pytorch.org/docs/stable/nn.html#lstm lstm的hidden和cell的形状
        # h_0 of shape(num_layers*num_direction, batch, hidden_size)
        # c_0 of shape(num_layers*num_direction, batch, hidden_size)
    def init_hidden_cell_lstm(self):
        return (torch.randn(2, self.batch, self.hidden_dim // 2),
                torch.randn(2, self.batch, self.hidden_dim // 2))

    def attention(self,H):
        M = torch.tanh(H)
        a = F.softmax(torch.bmm(self.att_weight,M),2)
        a = torch.transpose(a,1,2)
        return torch.bmm(H,a)
    
    def forward(self,sentence,pos1,pos2):

        self.hidden = self.init_hidden_lstm()

        
        embeds = torch.cat((self.word_embeds(sentence),self.pos1_embeds(pos1),self.pos2_embeds(pos2)),2) # (128, 50, 150)
        embeds = torch.transpose(embeds,0,1)    # 50 128 150
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        
        lstm_out = torch.transpose(lstm_out,0,1)
        #lstm_out = torch.transpose(lstm_out,1,2)
        
        lstm_out = self.dropout_lstm(lstm_out)
        att_out = torch.tanh(self.attention(lstm_out))
        #att_out = self.dropout_att(att_out)
        
        relation = torch.tensor([i for i in range(self.tag_size)],dtype = torch.long).repeat(self.batch, 1)

        relation = self.relation_embeds(relation)
        
        res = torch.add(torch.bmm(relation,att_out),self.relation_bias)
        
        res = F.softmax(res,1)

        
        return res.view(self.batch,-1)

In [10]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

## Setting 

In [11]:
args = Namespace(
    # Data and path information
    data_csv         = "data/train_with_splits.csv",
    vectorizer_file  = "vectorizer.json",
    model_state_file = "model.pth",
    save_dir         = "model_storage/RE_classification",
    
    # Model hyper parameter
    hidden_dim     = 200,
    embedding_size     = None,
    tag_size        = None,
    embedding_dim      = 100,
    pos_size        = 82,  #不同数据集这里可能会报错。
    pos_dim         = 25,
    pretrained      = True,
    embedding       = None,
    # Training hyper parameter
    epochs = 100,
    learning_rate = 1e-3,
    batch = 128,
    seed=1337,
    early_stopping_criteria=5,

    # Runtime hyper parameter
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")

print("Using CUDA: {}".format(args.cuda))
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,args.model_state_file)


# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Using CUDA: True


In [12]:
if args.reload_from_files and os.path.exists(args.vectorizer_file):
    # training from a checkpoint
    dataset = NREDataset.load_dataset_and_load_vectorizer(args.data_csv, args.vectorizer_file)
else:
    # create dataset and vectorizer
    dataset = NREDataset.load_dataset_and_make_vectorizer(args.data_csv)
    dataset.save_vectorizer(args.vectorizer_file)

vectorizer = dataset.get_vectorizer()

### embedding

In [13]:

if args.pretrained:
    word_to_vec = {}
    with open('vec.txt', mode = "r", encoding = "utf-8") as fp:
        for line in fp.readlines():
            word_to_vec[line.split()[0]] = [float(data) for data in line.split()[1:]]

    unkonw_embedding = [1] * 100 #这里100维度是因为vec的每个词对应的向量是100维度 
    embedding = []
    embedding.append(unkonw_embedding)
    embedding.append(unkonw_embedding)
    for word, index in vectorizer.seq_vocab._token_to_idx.items():
        if word in word_to_vec:
            embedding.append(word_to_vec[word])
        else:
            embedding.append(unkonw_embedding)
 
args.tag_size       = len(vectorizer.relation_vocab._token_to_idx)
args.embedding      = np.asarray(embedding)
args.embedding_size = len(embedding)

In [14]:
model = NREModel(args)
optimizer = optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=1e-5)
criterion = nn.CrossEntropyLoss(size_average=True)

## Train

In [17]:
for epoch in range(args.epochs):
    # Iterate over training dataset

    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, batch_size=args.batch, device=args.device)
    
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is these 5 steps:

        # --------------------------------------    
        # step 1. zero the gradients
        optimizer.zero_grad()
        # step 2. compute the output
        y_pred = classifier(sentence=batch_dict['x_data'], pos1=batch_dict['pos1'], pos2 = batch_dict['pos2'])

In [2]:
def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [25]:
dataset[0]

AttributeError: 'NREVectorizer' object has no attribute 'lookup_token'