In [1]:
import pandas as pd
pd.options.display.max_colwidth = 800
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100
import torch
import sys

In [76]:
from torch.utils.data import Dataset

class ABSADataset(Dataset):
    def __init__(self, fname, tokenizer):
        """
        loading dataset
        
        fname: data_path
        
        tokenizer must have function called text_to_sequence
        """
        fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
        lines = fin.readlines()
        fin.close()

        all_data = []
        for i in range(0, len(lines), 3):
            text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
            aspect = lines[i + 1].lower().strip()
            
            polarity = lines[i + 2].strip()
            text_raw_indices = tokenizer.text_to_sequence(text_left + " " + aspect + " " + text_right)
            text_raw_without_aspect_indices = tokenizer.text_to_sequence(text_left + " " + text_right)

            text_left_indices = tokenizer.text_to_sequence(text_left)
            text_left_with_aspect_indices = tokenizer.text_to_sequence(text_left + " " + aspect)
            text_right_indices = tokenizer.text_to_sequence(text_right, reverse=True)
            text_right_with_aspect_indices = tokenizer.text_to_sequence(" " + aspect + " " + text_right, reverse=True)
            aspect_indices = tokenizer.text_to_sequence(aspect)
            left_context_len = np.sum(text_left_indices != 0)
            aspect_len = np.sum(aspect_indices != 0)
            aspect_in_text = torch.tensor([left_context_len.item(), (left_context_len + aspect_len - 1).item()])
            polarity = int(polarity) + 1

            text_bert_indices = tokenizer.text_to_sequence('[CLS] ' + text_left + " " + aspect + " " + text_right + ' [SEP] ' + aspect + " [SEP]")
            bert_segments_ids = np.asarray([0] * (np.sum(text_raw_indices != 0) + 2) + [1] * (aspect_len + 1))
            bert_segments_ids = pad_and_truncate(bert_segments_ids, tokenizer.max_seq_len)

            text_raw_bert_indices = tokenizer.text_to_sequence("[CLS] " + text_left + " " + aspect + " " + text_right + " [SEP]")
            aspect_bert_indices = tokenizer.text_to_sequence("[CLS] " + aspect + " [SEP]")

            data = {
                'text_bert_indices': text_bert_indices,
                'bert_segments_ids': bert_segments_ids,
                'text_raw_bert_indices': text_raw_bert_indices,
                'aspect_bert_indices': aspect_bert_indices,
                'text_raw_indices': text_raw_indices,
                'text_raw_without_aspect_indices': text_raw_without_aspect_indices,
                'text_left_indices': text_left_indices,
                'text_left_with_aspect_indices': text_left_with_aspect_indices,
                'text_right_indices': text_right_indices,
                'text_right_with_aspect_indices': text_right_with_aspect_indices,
                'aspect_indices': aspect_indices,
                'aspect_in_text': aspect_in_text, # aspect_in_text needed aspect shoule be in the text
                'polarity': polarity,
            }

            all_data.append(data)
            if i< 3:
                print ('text_left : ',text_left)
                print ('text_right : ',text_right)
                print ('aspect : ',aspect)
                print ('polarity : ', polarity)


        self.data = all_data # list of dict
        
    def get_dataframe(self, tokenizer):
        """
        Conver dataset into DataFrame(Pandas)
        """
        df = []
        columns_name = []
        for i in range(len(self.data)):
            tmp = []
            for k, v in self.data[i].items():
                try:
                    to_str = " ".join(tokenizer.tokenizer.convert_ids_to_tokens(v))
                    #print (to_str)
                    tmp.append(to_str)
                except:
                    if k == 'aspect_in_text':
                        # it's a 1-D tensor wtih shape of (2,), representing the start and end index of the aspect
                        v = v.numpy()  # 1-D tensor
                        #print (v.shape)
                    tmp.append(v)
                if i <= 0:
                    columns_name.append(k)
            df.append(tmp)
        df = pd.DataFrame(df,columns=columns_name)   
        return df
        
    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)

dataset_files = {
'twitter': {
    'train': './datasets/acl-14-short-data/train.raw',
    'test': './datasets/acl-14-short-data/test.raw'
},
'restaurant': {
    'train': './datasets/semeval14/Restaurants_Train.xml.seg',
    'test': './datasets/semeval14/Restaurants_Test_Gold.xml.seg'
},
'laptop': {
    'train': './datasets/semeval14/Laptops_Train.xml.seg',
    'test': './datasets/semeval14/Laptops_Test_Gold.xml.seg'
},
'headphone':{
    "train":"../datasets/headphone/headphone_train.txt",
    "test":"../datasets/headphone/headphone_test.txt"
    
}
}

# tokenizer

from pytorch_pretrained_bert import BertTokenizer
import numpy as np

def pad_and_truncate(sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
    """
    padding: post means pad with '[PAD]'(BERT) in the last of sentence.
    """
    x = (np.ones(maxlen) * value).astype(dtype)
    if truncating == 'pre':
        trunc = sequence[-maxlen:]
    else:
        trunc = sequence[:maxlen]
    trunc = np.asarray(trunc, dtype=dtype)
    if padding == 'post':
        x[:len(trunc)] = trunc
    else:
        x[-len(trunc):] = trunc
    return x

class Tokenizer4Bert:
    def __init__(self, max_seq_len, pretrained_bert_name = "bert-base-uncased"):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)
        self.max_seq_len = max_seq_len

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        ls_tokens = self.tokenizer.tokenize(text)
        sequence = self.tokenizer.convert_tokens_to_ids(ls_tokens)
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)

# path
train_path = dataset_files['headphone']['train']
test_path = dataset_files['headphone']['test']

model = 'aen'
model = 'bert'

if model == 'bert':
    # paras for tokenizer
    max_seq_len = 100
    #pretrained_bert_name = "bert-base-uncased"
    pretrained_bert_name = "bert-base-multilingual-cased"
    tokenizer = Tokenizer4Bert(max_seq_len, pretrained_bert_name)
    # model
    from pytorch_pretrained_bert import BertModel
    bert = BertModel.from_pretrained(pretrained_bert_name)
else:
    def embedding_ratio(embedding_matrix):
        """return how many percent of words is no pre-trained embedding
        The higher, the worse.
        """
        c = 0
        for word, i in tokenizer.word2idx.items():
            if sum(embedding_matrix[i]) == 0:
                c+=1
        return c/len(embedding_matrix)
    
    import sys
    sys.path.append('../')
    from data_utils import build_tokenizer, build_embedding_matrix
    # paras
    max_seq_len = 80
    data_path = "/dev/data"
    dataset = "headphone"
    embed_dim = 300
    tokenizer = build_tokenizer(
        fnames=[train_path, test_path],
        max_seq_len=max_seq_len,
        data_path=data_path,
        dat_fname='{0}_tokenizer.dat'.format(dataset))
    embedding_matrix = build_embedding_matrix(
        word2idx=tokenizer.word2idx,
        embed_dim=embed_dim,
        data_path=data_path,
        dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(embed_dim), dataset))
    print ('vocabulary size : ', len(embedding_matrix))
    print ("how many percent of words is no pre-trained embedding : ", embedding_ratio(embedding_matrix), '%')

trainset = ABSADataset(train_path, tokenizer)
testset = ABSADataset(test_path, tokenizer)
print ('number of traning data', len(trainset))
print ('number of testing data', len(testset))


The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


text_left :  "beställde nyligen mitt andra ex av dessa lurar då min 2-åriga son lyckades med bedriften att ha sönder de jag haft i drygt 1,5 år. de absolut bästa lurarna jag varit med om i denna prisklass, sitter perfekt, trevlig bas och tåliga (de som jag nu bytte ut har fungerat klockrent trots mycket...
text_right :  
aspect :  ljud
polarity :  2
text_left :  tvättbara kuddar och huvudband, och mikrofon samt fjärrkontroll och ett okej ljud för bara 250 kronor. de åker av om man gör ett hastigt kast framåt med huvudet, och de klämmer lite hårt på glasögonen.
text_right :  
aspect :  ljud
polarity :  2
number of traning data 2972
number of testing data 744


In [77]:
train_df = trainset.get_dataframe(tokenizer)
test_df = testset.get_dataframe(tokenizer)

# model's input 

In [79]:
# model's input for inference (aen_bert)
input_colses = {
    'lstm': ['text_raw_indices'],
    'td_lstm': ['text_left_with_aspect_indices', 'text_right_with_aspect_indices'],
    'atae_lstm': ['text_raw_indices', 'aspect_indices'],
    'ian': ['text_raw_indices', 'aspect_indices'],
    'memnet': ['text_raw_without_aspect_indices', 'aspect_indices'],
    'ram': ['text_raw_indices', 'aspect_indices', 'text_left_indices'],
    'cabasc': ['text_raw_indices', 'aspect_indices', 'text_left_with_aspect_indices', 'text_right_with_aspect_indices'],
    'tnet_lf': ['text_raw_indices', 'aspect_indices', 'aspect_in_text'],
    'aoa': ['text_raw_indices', 'aspect_indices'],
    'mgan': ['text_raw_indices', 'aspect_indices', 'text_left_indices'],
    'bert_spc': ['text_bert_indices', 'bert_segments_ids'], # Sentence Pair Classification
    'aen_bert': ['text_raw_bert_indices', 'aspect_bert_indices'],
}
train_df[input_colses['aen_bert']]

Unnamed: 0,text_raw_bert_indices,aspect_bert_indices
0,"[CLS] "" best ##ällde ny ##ligen mitt andra ex av dessa lu ##rar då min 2 - år ##iga son lyckades med bed ##rif ##ten att ha sö ##nder de jag haft i dry ##gt 1 , 5 år . de absolut bästa lu ##rar ##na jag varit med om i denna pris ##klas ##s , sitter per ##fekt , tre ##v ##lig bas och t ##ål ##iga ( de som jag nu bytte ut har fun ##gera ##t kl ##ock ##rent trots mycket . . . l ##jud [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",[CLS] l ##jud [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
1,"[CLS] jag k ##öp ##te dessa innan hel ##gen för att jag hade hör ##t v ##äl ##digt mycket bra om dessa . för ##del : + l ##jud ##et var bra . klar l ##jud ##bild på c ##sg ##o . + kan kör ##a flera ing ##ånga ##r samtidigt , typ kop ##pla in telefon med kabel samtidigt som du kör tr ##åd ##l ##öst via dato ##rn . + kan st ##äng ##a av bel ##ys ##ningen på lu ##rar ##na . bat ##teri [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",[CLS] bat ##teri [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
2,[CLS] anv ##änt to ##r på en lång fly ##gres ##a . fun ##gera ##de ut ##mä ##rkt . vilken tys ##tna ##d ! kom ##fort [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD],[CLS] kom ##fort [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
3,"[CLS] till att b ##ör ##ja med är hör ##lur ##arna ganska små i stor ##lek ##en ( trots att de är re ##gler ##bara ) vilket fun ##gera ##r bra för mig som har lite ##t hu ##vud , men har man större kan jag t ##än ##ka mig att det kan kl ##äm ##ma lite . product [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",[CLS] product [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
4,"[CLS] pass ##formen är bra , l ##jud ##et är o ##x ##å bra & med tank ##e på prise ##t så är lu ##rar ##na helt klart v ##är ##da att k ##öp ##as . product [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",[CLS] product [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
...,...,...
2967,[CLS] en mycket pris ##vä ##rd produkt som varmt kan re ##kommen ##dera ##s ! gör l ##öp ##ningen till ett sant n ##ö ##je product [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD],[CLS] product [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
2968,"[CLS] k ##öp ##te dessa efter att ha haft m ##dr - x ##b ##9 ##50 ##ap ##1 ( med kabel ) v ##äl ##digt län ##ge och var v ##äl ##digt n ##ö ##jd med l ##jud ##åt ##erg ##iv ##ningen samt det "" pass ##iva "" noise can ##celli ##ng . m ##dr - x ##b ##9 ##50 ##bt ##1 är pre ##cis lika bra , det som gör dem ännu b ##ätt ##re är att man sl ##ipper kabel ##n . l ##jud [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]",[CLS] l ##jud [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
2969,[CLS] lite hår ##da i vad ##dering ##en men sitter än ##d ##å relativt bra . v ##äl ##digt ir ##rite ##rande med r ##östen som går ig ##ån ##g när man try ##cker på hör ##lur ##arna ##s funk ##tioner . bra l ##jud ! bat ##teri [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD],[CLS] bat ##teri [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
2970,[CLS] hår ##da o ra ##mlar ur ö ##ronen . pg ##a att de är hår ##da får jag ont i ö ##ronen . s ##äm ##sta k ##öp ##et och bort ##kasta ##de pen ##gar ! l ##jud [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD],[CLS] l ##jud [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


# model logic
# Attentional Encoder Network (AEN)

In [85]:
import torch.nn as nn
import math

class PositionwiseFeedForward(nn.Module):
    ''' A two-feed-forward-layer module '''
    def __init__(self, d_hid, d_inner_hid=None, dropout=0):
        super(PositionwiseFeedForward, self).__init__()
        if d_inner_hid is None:
            d_inner_hid = d_hid
        self.w_1 = nn.Conv1d(d_hid, d_inner_hid, 1)  # position-wise
        self.w_2 = nn.Conv1d(d_inner_hid, d_hid, 1)  # position-wise
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):

        output = self.relu(self.w_1(x.transpose(1, 2)))
        output = self.w_2(output).transpose(2, 1)
        output = self.dropout(output)
        return output
    
import torch.nn.functional as F

class Attention(nn.Module):
    def __init__(self, embed_dim, hidden_dim=None, out_dim=None, n_head=1, score_function='dot_product', dropout=0):
        ''' Attention Mechanism
        :param embed_dim:
        :param hidden_dim:
        :param out_dim:
        :param n_head: num of head (Multi-Head Attention)
        :param score_function: scaled_dot_product / mlp (concat) / bi_linear (general dot)
        :return (?, q_len, out_dim,)
        '''
        super(Attention, self).__init__()
        if hidden_dim is None:
            hidden_dim = embed_dim // n_head
        if out_dim is None:
            out_dim = embed_dim
        self.embed_dim = embed_dim
        self.hidden_dim = hidden_dim
        self.n_head = n_head
        self.score_function = score_function
        self.w_k = nn.Linear(embed_dim, n_head * hidden_dim)
        self.w_q = nn.Linear(embed_dim, n_head * hidden_dim)
        self.proj = nn.Linear(n_head * hidden_dim, out_dim)
        self.dropout = nn.Dropout(dropout)
        if score_function == 'mlp':
            self.weight = nn.Parameter(torch.Tensor(hidden_dim*2))
        elif self.score_function == 'bi_linear':
            self.weight = nn.Parameter(torch.Tensor(hidden_dim, hidden_dim))
        else:  # dot_product / scaled_dot_product
            self.register_parameter('weight', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.hidden_dim)
        if self.weight is not None:
            self.weight.data.uniform_(-stdv, stdv)

    def forward(self, k, q):
        if len(q.shape) == 2:  # q_len missing
            q = torch.unsqueeze(q, dim=1)
        if len(k.shape) == 2:  # k_len missing
            k = torch.unsqueeze(k, dim=1)
        mb_size = k.shape[0]  # ?
        k_len = k.shape[1]
        q_len = q.shape[1]
        # k: (?, k_len, embed_dim,)
        # q: (?, q_len, embed_dim,)
        # kx: (n_head*?, k_len, hidden_dim)
        # qx: (n_head*?, q_len, hidden_dim)
        # score: (n_head*?, q_len, k_len,)
        # output: (?, q_len, out_dim,)
        kx = self.w_k(k).view(mb_size, k_len, self.n_head, self.hidden_dim)
        kx = kx.permute(2, 0, 1, 3).contiguous().view(-1, k_len, self.hidden_dim)
        qx = self.w_q(q).view(mb_size, q_len, self.n_head, self.hidden_dim)
        qx = qx.permute(2, 0, 1, 3).contiguous().view(-1, q_len, self.hidden_dim)
        if self.score_function == 'dot_product':
            kt = kx.permute(0, 2, 1)
            score = torch.bmm(qx, kt)
        elif self.score_function == 'scaled_dot_product':
            kt = kx.permute(0, 2, 1)
            qkt = torch.bmm(qx, kt)
            score = torch.div(qkt, math.sqrt(self.hidden_dim))
        elif self.score_function == 'mlp':
            kxx = torch.unsqueeze(kx, dim=1).expand(-1, q_len, -1, -1)
            qxx = torch.unsqueeze(qx, dim=2).expand(-1, -1, k_len, -1)
            kq = torch.cat((kxx, qxx), dim=-1)  # (n_head*?, q_len, k_len, hidden_dim*2)
            # kq = torch.unsqueeze(kx, dim=1) + torch.unsqueeze(qx, dim=2)
            score = F.tanh(torch.matmul(kq, self.weight))
        elif self.score_function == 'bi_linear':
            qw = torch.matmul(qx, self.weight)
            kt = kx.permute(0, 2, 1)
            score = torch.bmm(qw, kt)
        else:
            raise RuntimeError('invalid score_function')
        score = F.softmax(score, dim=-1)
        output = torch.bmm(score, kx)  # (n_head*?, q_len, hidden_dim)
        output = torch.cat(torch.split(output, mb_size, dim=0), dim=-1)  # (?, q_len, n_head*hidden_dim)
        output = self.proj(output)  # (?, q_len, out_dim)
        output = self.dropout(output)
        return output, score
    
class SqueezeEmbedding(nn.Module):
    """
    Squeeze sequence embedding length to the longest one in the batch
    """
    def __init__(self, batch_first=True):
        super(SqueezeEmbedding, self).__init__()
        self.batch_first = batch_first

    def forward(self, x, x_len):
        """
        sequence -> sort -> pad and pack -> unpack ->unsort
        :param x: sequence embedding vectors
        :param x_len: numpy/tensor list
        :return:
        """
        """sort"""
        x_sort_idx = torch.sort(-x_len)[1].long()
        x_unsort_idx = torch.sort(x_sort_idx)[1].long()
        x_len = x_len[x_sort_idx]
        x = x[x_sort_idx]
        """pack"""
        x_emb_p = torch.nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=self.batch_first)
        """unpack: out"""
        out = torch.nn.utils.rnn.pad_packed_sequence(x_emb_p, batch_first=self.batch_first)  # (sequence, lengths)
        out = out[0]  #
        """unsort"""
        out = out[x_unsort_idx]
        return out

class AEN_BERT(nn.Module):
    def __init__(self, bert, opt):
        """
        In the constructor, you declare all the layers you want to use.
        """
        super(AEN_BERT, self).__init__()
        self.opt = opt
        self.bert = bert
        self.squeeze_embedding = SqueezeEmbedding()
        self.dropout = nn.Dropout(opt.dropout)
        
        # multi_head attetion
        self.attn_k = Attention(opt.bert_dim, out_dim=opt.hidden_dim, n_head=8, score_function='mlp', dropout=opt.dropout)
        self.attn_q = Attention(opt.bert_dim, out_dim=opt.hidden_dim, n_head=8, score_function='mlp', dropout=opt.dropout)
        self.ffn_c = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)
        self.ffn_t = PositionwiseFeedForward(opt.hidden_dim, dropout=opt.dropout)

        self.attn_s1 = Attention(opt.hidden_dim, n_head=8, score_function='mlp', dropout=opt.dropout)

        self.dense = nn.Linear(opt.hidden_dim*3, opt.polarities_dim)

    def forward(self, inputs):
        """
        In the forward function, we define how our model is going to be run, from input to output
        
        """
        context, target = inputs[0], inputs[1]
        context_len = torch.sum(context != 0, dim=-1)
        target_len = torch.sum(target != 0, dim=-1)
        context = self.squeeze_embedding(context, context_len)
        context, _ = self.bert(context, output_all_encoded_layers=False)
        context = self.dropout(context)
        target = self.squeeze_embedding(target, target_len)
        target, _ = self.bert(target, output_all_encoded_layers=False)
        target = self.dropout(target)

        hc, _ = self.attn_k(context, context)
        hc = self.ffn_c(hc)
        ht, _ = self.attn_q(context, target)
        ht = self.ffn_t(ht)

        s1, _ = self.attn_s1(hc, ht)

        context_len = torch.tensor(context_len, dtype=torch.float).to(self.opt.device)
        target_len = torch.tensor(target_len, dtype=torch.float).to(self.opt.device)

        hc_mean = torch.div(torch.sum(hc, dim=1), context_len.view(context_len.size(0), 1))
        ht_mean = torch.div(torch.sum(ht, dim=1), target_len.view(target_len.size(0), 1))
        s1_mean = torch.div(torch.sum(s1, dim=1), context_len.view(context_len.size(0), 1))

        x = torch.cat((hc_mean, s1_mean, ht_mean), dim=-1)
        out = self.dense(x)
        return out

In [86]:
import torch.nn as nn

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
print ("loss",criterion)

class objectview(object):
    def __init__(self, d):
        self.__dict__ = d
        
opt = {
    "device":'cpu',
    "dropout":0.1,
    "hidden_dim":300,
    "bert_dim":768,
    "polarities_dim":3,
    "learning_rate":2e-5,
    "l2reg":0.01,
    "num_epoch":10,
    "batch_size":64,
    "optimizer":torch.optim.Adam,
    "inputs_cols":input_colses['aen_bert']
}
opt = objectview(opt)

model = AEN_BERT(bert,opt)
_params = filter(lambda p: p.requires_grad, model.parameters())
optimizer = opt.optimizer(_params, lr=opt.learning_rate, weight_decay=opt.l2reg)
print ("optimizer", optimizer)

loss CrossEntropyLoss()
optimizer Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 2e-05
    weight_decay: 0.01
)


In [87]:
help(model)

Help on AEN_BERT in module __main__ object:

class AEN_BERT(torch.nn.modules.module.Module)
 |  Base class for all neural network modules.
 |  
 |  Your models should also subclass this class.
 |  
 |  Modules can also contain other Modules, allowing to nest them in
 |  a tree structure. You can assign the submodules as regular attributes::
 |  
 |      import torch.nn as nn
 |      import torch.nn.functional as F
 |  
 |      class Model(nn.Module):
 |          def __init__(self):
 |              super(Model, self).__init__()
 |              self.conv1 = nn.Conv2d(1, 20, 5)
 |              self.conv2 = nn.Conv2d(20, 20, 5)
 |  
 |          def forward(self, x):
 |              x = F.relu(self.conv1(x))
 |              return F.relu(self.conv2(x))
 |  
 |  Submodules assigned in this way will be registered, and will have their
 |  parameters converted too when you call :meth:`to`, etc.
 |  
 |  Method resolution order:
 |      AEN_BERT
 |      torch.nn.modules.module.Module
 |      bui

# training procedure

In [88]:
from torch.utils.data import DataLoader, random_split
valset = testset 
train_data_loader = DataLoader(dataset=trainset, batch_size=opt.batch_size, shuffle=True)
test_data_loader = DataLoader(dataset=testset, batch_size=opt.batch_size, shuffle=False)
val_data_loader = DataLoader(dataset=valset, batch_size=opt.batch_size, shuffle=False)

In [89]:
trainset

<__main__.ABSADataset at 0x7f4ddf298240>

In [90]:
max_val_acc = 0
max_val_f1 = 0
global_step = 0
path = None
for epoch in range(opt.num_epoch):
    print('>' * 100)
    print('epoch: {}'.format(epoch))
    n_correct, n_total, loss_total = 0, 0, 0
    # switch model to training mode
    model.train()
    for i_batch, sample_batched in enumerate(train_data_loader):
        if i_batch <= 0 :
            print (type(sample_batched))
            print (sample_batched["text_bert_indices"].shape)
        global_step += 1
        # clear gradient accumulators
        optimizer.zero_grad()

        inputs = [sample_batched[col].to(opt.device) for col in opt.inputs_cols]
        if i_batch <= 0 :
            print ("=" * 100)
            print ("inputs")
            print (len(inputs))
            print (inputs)
        outputs = model(inputs)
        if i_batch <= 0 :
            print ("=" * 100)
            print ("outputs")
            print (len(outputs))
            print (outputs)

        targets = sample_batched['polarity'].to(self.opt.device)

        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        n_correct += (torch.argmax(outputs, -1) == targets).sum().item()
        n_total += len(outputs)
        loss_total += loss.item() * len(outputs)
        if global_step % self.opt.log_step == 0:
            train_acc = n_correct / n_total
            train_loss = loss_total / n_total
            logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc))

    val_acc, val_f1 = self._evaluate_acc_f1(val_data_loader)
    logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(val_acc, val_f1))
    if val_acc > max_val_acc:
        max_val_acc = val_acc
        if not os.path.exists('state_dict'):
            os.mkdir('state_dict')
        path = 'state_dict/{0}_{1}_val_acc{2}'.format(self.opt.model_name, self.opt.dataset, round(val_acc, 4))
        torch.save(self.model.state_dict(), path)
        logger.info('>> saved: {}'.format(path))
    if val_f1 > max_val_f1:
        max_val_f1 = val_f1

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
epoch: 0
<class 'dict'>
torch.Size([64, 100])
inputs
2
[tensor([[  101, 14758, 67603,  ...,     0,     0,     0],
        [  101, 21566, 14302,  ...,     0,     0,     0],
        [  101, 64715, 15260,  ...,     0,     0,     0],
        ...,
        [  101, 16694, 20150,  ...,     0,     0,     0],
        [  101, 29956, 51984,  ...,     0,     0,     0],
        [  101, 29956, 51984,  ...,     0,     0,     0]]), tensor([[  101, 21535,   102,  ...,     0,     0,     0],
        [  101, 12240, 23044,  ...,     0,     0,     0],
        [  101, 11519, 35441,  ...,     0,     0,     0],
        ...,
        [  101, 21535,   102,  ...,     0,     0,     0],
        [  101, 12240, 23044,  ...,     0,     0,     0],
        [  101, 21535,   102,  ...,     0,     0,     0]])]




outputs
64
tensor([[-1.9470e-03,  9.7370e-03, -2.6082e-03],
        [-1.2993e-02,  5.0002e-03, -1.6051e-03],
        [-8.6599e-03,  1.0666e-03,  3.7746e-03],
        [-2.2136e-03,  2.2099e-02,  2.0789e-02],
        [-1.4120e-03,  1.8203e-02, -7.0842e-03],
        [-1.6799e-02,  1.6138e-03, -6.5281e-03],
        [-1.4859e-02,  3.5414e-03,  9.7817e-04],
        [-1.8900e-02, -1.7585e-03, -5.4602e-03],
        [-1.6442e-02,  1.2549e-02,  5.0792e-03],
        [-5.1415e-03,  3.4307e-03, -1.1050e-02],
        [-1.3798e-02,  1.2192e-02,  1.6272e-02],
        [-1.3384e-02, -1.5115e-03, -3.3038e-03],
        [-1.4694e-02,  1.4597e-02, -1.4277e-02],
        [-1.3411e-02,  1.5804e-02,  5.3398e-03],
        [-2.0829e-02,  7.4175e-03, -7.7206e-03],
        [-1.1988e-02,  3.6343e-03, -1.6516e-02],
        [-1.8091e-02,  4.8730e-03, -1.1795e-02],
        [-6.9306e-03,  2.0435e-02, -2.8080e-03],
        [-1.4381e-02,  1.4003e-02,  1.2246e-02],
        [-8.1476e-03,  1.8408e-03, -1.6672e-02],
        [



NameError: name 'self' is not defined

In [92]:
outputs.shape

torch.Size([64, 3])

In [9]:
# from pytorch_transformers import XLNetTokenizer
# tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
# model = XLNetModel.from_pretrained('xlnet-large-cased')
# input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
# outputs = model(input_ids)
# last_hidden_states = outputs[0] 
import torch
torch.zeros([2, 4], dtype=torch.int32)


tensor([[0, 0, 0, 0],
        [0, 0, 0, 0]], dtype=torch.int32)

In [14]:
cuda0 = torch.device('cuda')
print (cuda0)
torch.ones([2, 4], dtype=torch.float64, device=cuda0)

cuda


tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.]], device='cuda:0', dtype=torch.float64)

In [None]:
"""
Working log:
9/2
1. we have 4 model's result.

Today's to do list
1. try multi-lingual bert embedding(done)
2. try swedish embedding(no need)
3. understand model's logic (tmr)
    -aen_bert
    -bert_spc
4. write experiment for finding best parameters
    -try larger max_seq_length: 100

Tmr's to do list
1. stack embedding
2. data augmentation
3. write inference.py(the model will be trained on both train+test dataset )
"""