In [2]:
import os, sys
sys.path.insert(1, '../dataset')
import data_preparation
import numpy as np
import pandas as pd

In [3]:
#prepare data
if not os.path.exists('../dataset/prepared'):
    os.makedirs('../dataset/prepared')
data_preparation.to_csv('../dataset/atepc/restaurants_test.csv', '../dataset/prepared/restaurants_test.csv',)
data_preparation.to_csv('../dataset/atepc/restaurants_train.csv', '../dataset/prepared/restaurants_train.csv',)

In [4]:
#load
data = pd.read_csv('../dataset/prepared/restaurants_train.csv')
data_test = pd.read_csv('../dataset/prepared/restaurants_test.csv')

# ABTE

In [5]:
from transformers import BertTokenizer
import torch
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [186]:
from transformers import BertModel
import torch
from torch.utils.data import Dataset
import time
import numpy as np
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import os,sys

class ATEDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        tokens, tags, pols = self.df.iloc[idx, :3].values

        tokens = tokens.replace("'", "").strip("][").split(', ')
        tags = tags.strip('][').split(', ')
        pols = pols.strip('][').split(', ')

        bert_tokens = []
        bert_tags = []
        bert_pols = []
        for i in range(len(tokens)):
            t = self.tokenizer.tokenize(tokens[i])
            bert_tokens += t
            bert_tags += [int(tags[i])]*len(t)
            bert_pols += [int(pols[i])]*len(t)
        
        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        ids_tensor = torch.tensor(bert_ids)
        tags_tensor = torch.tensor(bert_tags)
        pols_tensor = torch.tensor(bert_pols)
        return bert_tokens, ids_tensor, tags_tensor, pols_tensor

    def __len__(self):
        return len(self.df)

class ATEBert(torch.nn.Module):
    def __init__(self, pretrain_model):
        super(ATEBert, self).__init__()
        self.bert = BertModel.from_pretrained(pretrain_model)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 3)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, tags_tensors, masks_tensors):
        bert_outputs= self.bert(input_ids=ids_tensors, attention_mask=masks_tensors, return_dict=False)
        bert_outputs = bert_outputs[0]

        linear_outputs = self.linear(bert_outputs)
        if tags_tensors is not None:
            tags_tensors = tags_tensors.view(-1)
            linear_outputs = linear_outputs.view(-1,3)
            loss = self.loss_fn(linear_outputs, tags_tensors)
            return loss
        else:
            return linear_outputs

class ATEModel ():
    def __init__(self, tokenizer):
        self.model = ATEBert('bert-base-uncased')
        self.tokenizer = tokenizer
        self.trained = False

    def padding(self, samples):
        from torch.nn.utils.rnn import pad_sequence
        ids_tensors = [s[1] for s in samples]
        ids_tensors = pad_sequence(ids_tensors, batch_first=True)
        print(ids_tensors)

        tags_tensors = [s[2] for s in samples]
        tags_tensors = pad_sequence(tags_tensors, batch_first=True)

        pols_tensors = [s[3] for s in samples]
        pols_tensors = pad_sequence(pols_tensors, batch_first=True)
        
        masks_tensors = torch.zeros(ids_tensors.shape, dtype=torch.long)
        masks_tensors = masks_tensors.masked_fill(ids_tensors != 0, 1)
        print(masks_tensors)
    
        return ids_tensors, tags_tensors, pols_tensors, masks_tensors

    def load_model(self, model, path):
        model.load_state_dict(torch.load(path), strict=False)
        
    def save_model(self, model, name):
        torch.save(model.state_dict(), name)        
                

    def train(self, data, epochs, device, batch_size=32, lr=1e-5):

        # dataset and loader
        ds = ATEDataset(data, self.tokenizer)
        loader = DataLoader(ds, batch_size=batch_size, shuffle=True, collate_fn=self.padding)
        
        self.model = self.model.to(device)
        optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)

        all_data = len(loader)-1
        for epoch in range(epochs):
            finish_data = 0
            self.losses = []
            current_times = []

            n_batches = int(len(data)/batch_size)
            # batch = next(iter(loader))
            # print (batch[0].shape, batch[1].shape, batch[2].shape, batch[3].shape)
            for nb in range((n_batches)):
                t0 = time.time()

                ids_tensors, tags_tensors, _, masks_tensors = next(iter(loader))
                ids_tensor = ids_tensors.to(device)
                tags_tensor = tags_tensors.to(device)
                masks_tensor = masks_tensors.to(device)
                loss = self.model(ids_tensors=ids_tensor, tags_tensors=tags_tensor, masks_tensors=masks_tensor)
                self.losses.append(loss.item())
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                finish_data += 1
                current_time = round(time.time() - t0,3)
                current_times.append(current_time)
                print("epoch: {}\tbatch: {}/{}\tloss: {}\tbatch time: {}\ttotal time: {}"\
                    .format(epoch, finish_data, all_data, loss.item(), current_time, sum(current_times)))

            self.save_model(self.model, 'model_lr{}_epochs{}_batch{}.pkl'.format(lr, epoch, batch_size))
            self.trained = True

    def history (self):
        if self.trained:
            return self.losses
        else:
            raise Exception('Model not trained')

    def unpack_sequence(self, packed_sequence, mask):
        unpacked_sequence = []
        for i in range(len(packed_sequence)):
            if mask[i] == 1:
                unpacked_sequence.append(packed_sequence[i])
    
        return unpacked_sequence

    def test(self, data, device='cpu', batch_size=256, lr=1e-4, epochs=2):

        ds = ATEDataset(data, self.tokenizer)
        loader = DataLoader(ds, shuffle=True,batch_size =len(data), collate_fn=self.padding)
        pred = []
        trueth = []
        
        tags_real = [t.strip('][').split(', ') for t in data['Tags']]
        tags_real = [[int(i) for i in t] for t in tags_real]

        ids, tags, pols, masks = next(iter(loader))
        ids_tensors = ids.clone()
        tags_tensors = tags.clone()
        masks_tensors = masks.clone()
        
        # load model if exists
        if os.path.exists('model_lr{}_epochs{}_batch{}.pkl'.format(lr, epochs, batch_size)):
            self.load_model(self.model, 'model_lr{}_epochs{}_batch{}.pkl'.format(lr, epochs, batch_size))
        if not self.trained and not os.path.exists('model_lr{}_epochs{}_batch{}.pkl'.format(lr, epochs, batch_size)):
            raise Exception('model not trained and does not exist')
            
        with torch.no_grad():
            ids_tensors = ids_tensors.to(device)
            tags_tensors = tags_tensors.to(device)
            masks_tensors = masks_tensors.to(device)

            outputs = self.model(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)
            _, predictions = torch.max(outputs, dim=2)

            pred += list([int(j)  for i in predictions for j in i ])
            trueth += list([int(j) for i in tags_tensors for j in i ])
        acc = np.mean(np.array(trueth) == np.array(pred))

        predictions = [self.unpack_sequence(p,m) for p, m in zip(predictions, masks_tensors)]
        # predictions = [int(p) for p in predictions[0]]
        return acc, predictions, tags_real

    def accuracy(self, data, device='cpu', batch_size=256, lr=1e-4, epochs=2):
        a, p = self.test(data, device, batch_size, lr, epochs)
        return a

    def predict (self, data, device='cpu', batch_size=256, lr=1e-4, epochs=2):
        t, p = self.test(data, device, batch_size, lr, epochs)
        return p


In [187]:
# from abte import ATEModel
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = ATEModel(tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [188]:
a, pred, t = model.test(data_test[:100], batch_size=256, lr=1e-4, epochs=2)

tensor([[2065, 2017, 1000,  ...,    0,    0,    0],
        [1012,    0,    0,  ...,    0,    0,    0],
        [2013, 1996, 2927,  ...,    0,    0,    0],
        ...,
        [2018, 1037, 2307,  ...,    0,    0,    0],
        [1996, 5869, 8490,  ...,    0,    0,    0],
        [2307, 2833, 1010,  ...,    0,    0,    0]])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [None]:
model.train(data, batch_size=256, lr=1e-4, epochs=2)

In [180]:

pred = [[int(i) for i in pred[row]] for row in range(len(pred))]
np.mean(np.array(pred).flatten() == np.array(t).flatten())

  np.mean(np.array(pred).flatten() == np.array(t).flatten())


0.04

In [181]:
def tag_to_word(sentence, predictions):
    """
    predictions: list of tags
    sentence: list of words
    """
    terms = []
    for i, word in enumerate(sentence):
        if predictions[i] == 1:
            terms.append(word)
    return terms

def tag_to_word_df(df, column_name, tags):
    """
    predictions: list of tags
    sentence: list of words
    """
    terms_list = []
    for i in range(len(df)):
        sentence = df.iloc[i]['Tokens']
        sentence = sentence.replace("'", "").strip("][").split(', ')
        terms = tag_to_word(sentence, tags[i])
        terms_list.append(terms)
    df[column_name] = terms_list
    return df

In [185]:
pred[1], t[1]

([0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0])

In [183]:
tags_real = [t.strip('][').split(', ') for t in data_test['Tags'][:100]]
tags_real = [[int(i) for i in t] for t in tags_real]
pred_string = [''.join(str(i) for i in p) for p in pred]
data_test = tag_to_word_df(data_test[:100], 'gold terms', tags_real)
data_test = tag_to_word_df(data_test[:100], 'pred terms', pred)
data_test['predicted_tags'] = pred_string
acc = np.mean(np.array(tags_real).flatten() == np.array(pred).flatten())
data_test

IndexError: list index out of range