#Single Domain Experiments

In [None]:
# Mount Google Drive under /content/drive so the dataset files stored there can be read.
# The datasets cover three domains (restaurants, hotels and electronics).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

# Load all the datasets. Every file is a tab-separated, latin1-encoded CSV.
DATASET_DIR = "/content/drive/My Drive/Tesi_ABSA/dataset/"


def _load_dataset(file_name):
    """Read one dataset file from DATASET_DIR and forward-fill its missing cells."""
    frame = pd.read_csv(DATASET_DIR + file_name, encoding="latin1", sep="\t")
    # DataFrame.ffill() is the supported spelling of the deprecated
    # fillna(method="ffill"); each empty cell takes the last value above it.
    return frame.ffill()


data_lap = _load_dataset("laptops_2014.csv")
data_comp = _load_dataset("Computer.csv")
data_router = _load_dataset("Router.csv")
data_speaker = _load_dataset("Speaker.csv")
data_rest = _load_dataset("restaurants1.csv")
data_hotels = _load_dataset("hotels.csv")

# Concatenation of the datasets.
# NOTE(review): data_lap is not part of this concatenation although the laptop
# data is used further down — confirm that leaving it out is intended.
data = pd.concat([data_comp, data_router, data_speaker, data_rest, data_hotels], ignore_index=True)

In [None]:
#Class for handling the dataset
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (token, tag) pairs.

    The frame must provide the columns "SENTENCE", "TOKEN" and "TAG".
    After construction, ``sentences`` holds one list of (token, tag) tuples
    per distinct "SENTENCE" value, in the order produced by ``groupby``.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every token of the group with the tag on the same row.
            return list(zip(s["TOKEN"].values.tolist(), s["TAG"].values.tolist()))

        self.grouped = self.data.groupby("SENTENCE").apply(agg_func)
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the sentence keyed "Sentence: <n_sent>" and advance the counter.

        Returns None (without advancing) when no such sentence exists.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

In [None]:
# One SentenceGetter per dataset.
getter_lap = SentenceGetter(data_lap)
getter_comp = SentenceGetter(data_comp)
getter_router = SentenceGetter(data_router)
getter_speaker = SentenceGetter(data_speaker)
getter_rest = SentenceGetter(data_rest)
getter_hotels = SentenceGetter(data_hotels)


def _joined_sentences(getter):
    """Return every sentence of *getter* as one space-separated string."""
    joined = []
    for sentence in getter.sentences:
        words = [str(pair[0]) for pair in sentence]
        joined.append(" ".join(words))
    return joined


# All the sentences; each one is a plain string (not split into tokens).
sentences_lap = _joined_sentences(getter_lap)
sentences_comp = _joined_sentences(getter_comp)
sentences_router = _joined_sentences(getter_router)
sentences_speaker = _joined_sentences(getter_speaker)
sentences_rest = _joined_sentences(getter_rest)
sentences_hotels = _joined_sentences(getter_hotels)

In [None]:
# Show the first sentence of every dataset.
for domain_sentences in (sentences_lap, sentences_comp, sentences_speaker,
                         sentences_router, sentences_rest, sentences_hotels):
    print(domain_sentences[0])

In [None]:
def _sentence_tags(getter):
    """Return, for every sentence of *getter*, the list of its tags."""
    all_tags = []
    for sentence in getter.sentences:
        all_tags.append([pair[1] for pair in sentence])
    return all_tags


labels_lap = _sentence_tags(getter_lap)
labels_comp = _sentence_tags(getter_comp)
labels_router = _sentence_tags(getter_router)
labels_speaker = _sentence_tags(getter_speaker)
labels_rest = _sentence_tags(getter_rest)
labels_hotels = _sentence_tags(getter_hotels)

In [None]:
# Show the first sentence of every dataset followed by its tag sequence.
for domain_sentences, domain_labels in (
    (sentences_lap, labels_lap),
    (sentences_comp, labels_comp),
    (sentences_speaker, labels_speaker),
    (sentences_router, labels_router),
    (sentences_rest, labels_rest),
    (sentences_hotels, labels_hotels),
):
    print(domain_sentences[0])
    print(domain_labels[0])

In [None]:
# Dictionary mapping every tag to an integer id.
# sorted() keeps the tag -> id assignment identical from one session to the
# next; the iteration order of a bare set of strings is not guaranteed to be
# stable across interpreter runs.
tags_vals = sorted(set(data["TAG"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}
tag2idx

In [None]:
!pip install pytorch-pretrained-bert==0.4.0
import torch
from torch.optim import Adam, SGD
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Number of CUDA devices visible to torch (displayed as the cell output).
n_gpu = torch.cuda.device_count()
n_gpu

In [None]:
# Tokenize every sentence of every dataset with the uncased BERT tokenizer.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def _tokenize_all(sentences):
    """Apply the BERT tokenizer to each sentence string."""
    return list(map(tokenizer.tokenize, sentences))


tokenized_texts_lap = _tokenize_all(sentences_lap)
tokenized_texts_comp = _tokenize_all(sentences_comp)
tokenized_texts_speaker = _tokenize_all(sentences_speaker)
tokenized_texts_router = _tokenize_all(sentences_router)
tokenized_texts_rest = _tokenize_all(sentences_rest)
tokenized_texts_hotels = _tokenize_all(sentences_hotels)

In [None]:
MAX_LEN = 75  # every sequence is padded / truncated to this many ids
bs = 32       # batch size used by the data loaders


def _padded_ids(tokenized_texts):
    """Convert token lists to vocabulary ids, padded/truncated at the end to MAX_LEN."""
    id_sequences = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
    return pad_sequences(id_sequences, maxlen=MAX_LEN, dtype="long",
                         truncating="post", padding="post")


input_ids_lap = _padded_ids(tokenized_texts_lap)
input_ids_comp = _padded_ids(tokenized_texts_comp)
input_ids_speaker = _padded_ids(tokenized_texts_speaker)
input_ids_router = _padded_ids(tokenized_texts_router)
input_ids_rest = _padded_ids(tokenized_texts_rest)
input_ids_hotels = _padded_ids(tokenized_texts_hotels)

In [None]:
def _padded_tags(label_sequences):
    """Map tag strings to ids and pad/truncate at the end to MAX_LEN using the "O" id."""
    id_sequences = []
    for lab in label_sequences:
        id_sequences.append([tag2idx.get(l) for l in lab])
    return pad_sequences(id_sequences, maxlen=MAX_LEN, value=tag2idx["O"],
                         padding="post", dtype="long", truncating="post")


tags_lap = _padded_tags(labels_lap)
tags_comp = _padded_tags(labels_comp)
tags_speaker = _padded_tags(labels_speaker)
tags_router = _padded_tags(labels_router)
tags_rest = _padded_tags(labels_rest)
tags_hotels = _padded_tags(labels_hotels)

In [None]:
def _attention_mask(padded_ids):
    """Return 1.0 for every id greater than zero and 0.0 otherwise, row by row."""
    mask_rows = []
    for row in padded_ids:
        mask_rows.append([float(token_id > 0) for token_id in row])
    return mask_rows


attention_masks_lap = _attention_mask(input_ids_lap)
attention_masks_comp = _attention_mask(input_ids_comp)
attention_masks_speaker = _attention_mask(input_ids_speaker)
attention_masks_router = _attention_mask(input_ids_router)
attention_masks_rest = _attention_mask(input_ids_rest)
attention_masks_hotels = _attention_mask(input_ids_hotels)

In [None]:
!pip install seqeval
from sklearn.model_selection import KFold
from seqeval.metrics import f1_score, classification_report
# Unshuffled 5-fold split. random_state is not passed: it only has a meaning
# together with shuffle=True, and recent scikit-learn releases reject the
# combination of a random_state with shuffle=False. The folds are unchanged.
cv = KFold(n_splits=5, shuffle=False)
i = 0


def flat_accuracy(preds, labels):
    """Return the fraction of positions whose arg-max prediction equals the label.

    preds is reduced with arg-max over axis 2; both arrays are then flattened
    and compared element by element.
    """
    predicted = np.argmax(preds, axis=2).ravel()
    expected = labels.ravel()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)


# 5-fold cross-validation on the "comp" dataset: every fold fine-tunes a fresh
# BERT token classifier, monitors it on a validation slice after each epoch and
# finally evaluates it on the held-out fold.
# NOTE(review): the tag sequences carry one tag per original token while the
# inputs come from the BERT tokenizer — confirm that tags and input ids stay
# aligned when the tokenizer splits a token into several pieces.

# The masks are a list of lists; selecting rows with an index array needs an ndarray.
masks_comp = np.array(attention_masks_comp)

for train_index, test_index in cv.split(attention_masks_comp):
  training_inputs = input_ids_comp[train_index]
  test_inputs = input_ids_comp[test_index]
  training_tags = tags_comp[train_index]
  test_tags = tags_comp[test_index]
  training_masks = masks_comp[train_index]
  test_masks = masks_comp[test_index]

  # The last 20% of the fold's training part becomes the validation set,
  # the first 80% is kept for training.
  n_val = int(0.2 * len(training_inputs))
  n_train = int(0.8 * len(training_inputs))
  validation_inputs = training_inputs[-n_val:]
  validation_tags = training_tags[-n_val:]
  validation_masks = training_masks[-n_val:]
  training_inputs = training_inputs[:n_train]
  training_tags = training_tags[:n_train]
  training_masks = training_masks[:n_train]

  tr_inputs = torch.tensor(training_inputs)
  val_inputs = torch.tensor(validation_inputs)
  te_inputs = torch.tensor(test_inputs)
  tr_tags = torch.tensor(training_tags)
  val_tags = torch.tensor(validation_tags)
  te_tags = torch.tensor(test_tags)
  tr_masks = torch.tensor(training_masks)
  val_masks = torch.tensor(validation_masks)
  te_masks = torch.tensor(test_masks)

  # Training batches are drawn in random order, validation/test batches in order.
  train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

  valid_data = TensorDataset(val_inputs, val_masks, val_tags)
  valid_sampler = SequentialSampler(valid_data)
  valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

  test_data = TensorDataset(te_inputs, te_masks, te_tags)
  test_sampler = SequentialSampler(test_data)
  test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

  # A fresh pre-trained model per fold, placed on the same device the batches
  # are moved to (model.cuda() would fail on a machine without a GPU).
  model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
  model.to(device)

  FULL_FINETUNING = True
  if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # NOTE(review): torch.optim.Adam's documented per-group option is
    # 'weight_decay'; confirm whether the 'weight_decay_rate' key below has
    # any effect with this optimizer.
    optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0}]
  else:
    # Only the classification head is optimised.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
  optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)
  epochs = 10
  max_grad_norm = 1.0

  for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for batch in train_dataloader:
        # add batch to the device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass (with labels the model output is used as the loss)
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Two forward passes: the call with labels is used as the loss,
            # the call without labels as the logits.
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    # seqeval works on one tag list per sentence (recent releases reject flat
    # lists), and takes the gold tags first.
    pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
    valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
    print("F1-Score: {}".format(f1_score(valid_tags, pred_tags)))

  # TEST on the held-out fold
  model.eval()
  predictions = []
  true_labels = []
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps = 0
  for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
  valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
  # These figures come from the test fold, so they are labelled as such.
  print("Test loss: {}".format(eval_loss/nb_eval_steps))
  print("Test Accuracy: {}".format(eval_accuracy/nb_eval_steps))
  print("Test F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
  print(classification_report(valid_tags,pred_tags))

# Cross Domain Experiments

In [None]:
# Per-domain 80/20 split of ids, tags and masks.
def _split_domain(input_ids, tags, masks):
    """Split one domain into training and validation parts.

    The two calls share the same random_state and test_size and operate on
    inputs of the same length, which keeps the mask split in step with the
    ids/tags split.
    """
    split_kwargs = dict(random_state=2018, test_size=0.2)
    ids_tr, ids_val, tags_tr, tags_val = train_test_split(input_ids, tags, **split_kwargs)
    masks_tr, masks_val, _, _ = train_test_split(masks, input_ids, **split_kwargs)
    return ids_tr, ids_val, tags_tr, tags_val, masks_tr, masks_val


(tr_inputs_lap, val_inputs_lap, tr_tags_lap, val_tags_lap,
 tr_masks_lap, val_masks_lap) = _split_domain(input_ids_lap, tags_lap, attention_masks_lap)

(tr_inputs_comp, val_inputs_comp, tr_tags_comp, val_tags_comp,
 tr_masks_comp, val_masks_comp) = _split_domain(input_ids_comp, tags_comp, attention_masks_comp)

(tr_inputs_speaker, val_inputs_speaker, tr_tags_speaker, val_tags_speaker,
 tr_masks_speaker, val_masks_speaker) = _split_domain(input_ids_speaker, tags_speaker, attention_masks_speaker)

(tr_inputs_router, val_inputs_router, tr_tags_router, val_tags_router,
 tr_masks_router, val_masks_router) = _split_domain(input_ids_router, tags_router, attention_masks_router)

(tr_inputs_rest, val_inputs_rest, tr_tags_rest, val_tags_rest,
 tr_masks_rest, val_masks_rest) = _split_domain(input_ids_rest, tags_rest, attention_masks_rest)

(tr_inputs_hotels, val_inputs_hotels, tr_tags_hotels, val_tags_hotels,
 tr_masks_hotels, val_masks_hotels) = _split_domain(input_ids_hotels, tags_hotels, attention_masks_hotels)

In [None]:
# Five of the six datasets (speaker, comp, router, rest, lap) form the training
# and validation data; the whole hotels dataset is used as the test set.
_source_domains = (
    (tr_inputs_speaker, val_inputs_speaker, tr_tags_speaker, val_tags_speaker, tr_masks_speaker, val_masks_speaker),
    (tr_inputs_comp, val_inputs_comp, tr_tags_comp, val_tags_comp, tr_masks_comp, val_masks_comp),
    (tr_inputs_router, val_inputs_router, tr_tags_router, val_tags_router, tr_masks_router, val_masks_router),
    (tr_inputs_rest, val_inputs_rest, tr_tags_rest, val_tags_rest, tr_masks_rest, val_masks_rest),
    (tr_inputs_lap, val_inputs_lap, tr_tags_lap, val_tags_lap, tr_masks_lap, val_masks_lap),
)
# zip(*...) regroups the per-domain tuples into one tuple per kind of array.
(training_inputs, validation_inputs, training_tags,
 validation_tags, training_masks, validation_masks) = [
    np.concatenate(parts) for parts in zip(*_source_domains)
]

test_inputs, test_tags, test_masks = [
    np.concatenate(parts) for parts in (
        (tr_inputs_hotels, val_inputs_hotels),
        (tr_tags_hotels, val_tags_hotels),
        (tr_masks_hotels, val_masks_hotels),
    )
]

In [None]:
# Torch tensors fed to the model: inputs, then tags, then masks.
tr_inputs, val_inputs, te_inputs = [
    torch.tensor(arr) for arr in (training_inputs, validation_inputs, test_inputs)
]
tr_tags, val_tags, te_tags = [
    torch.tensor(arr) for arr in (training_tags, validation_tags, test_tags)
]
tr_masks, val_masks, te_masks = [
    torch.tensor(arr) for arr in (training_masks, validation_masks, test_masks)
]

In [None]:
# Data loaders: random batch order for training, sequential for validation/test.
def _sampler_and_loader(dataset, sampler_cls):
    """Create a sampler of type *sampler_cls* and a DataLoader of batch size bs."""
    sampler = sampler_cls(dataset)
    return sampler, DataLoader(dataset, sampler=sampler, batch_size=bs)


train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler, train_dataloader = _sampler_and_loader(train_data, RandomSampler)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler, valid_dataloader = _sampler_and_loader(valid_data, SequentialSampler)

test_data = TensorDataset(te_inputs, te_masks, te_tags)
test_sampler, test_dataloader = _sampler_and_loader(test_data, SequentialSampler)

In [None]:
#We perform five iterations on the cross domain experiments
!pip install seqeval
from seqeval.metrics import f1_score, classification_report

def flat_accuracy(preds, labels):
    """Return the share of flattened positions where arg-max(preds) matches labels.

    The arg-max is taken over axis 2 of preds.
    """
    best_class = np.argmax(preds, axis=2)
    flat_pred = best_class.ravel()
    flat_gold = labels.ravel()
    matches = flat_pred == flat_gold
    return np.sum(matches) / len(flat_gold)

#function to write the per-sentence F1 scores to a file and print a summary
def print_f1_on_file(filename,true_aspects,extracted_aspects):
  """Write one F1 score per line to *filename* and print total, count and mean.

  Only sentences whose gold tag list contains 'B-aspect' are scored.

  filename: path of the text file to (over)write.
  true_aspects: gold tag lists, one per sentence.
  extracted_aspects: predicted tag lists, paired with true_aspects by position.
  """
  count = 0
  f1 = 0.0
  # The with-block closes the file even if scoring raises.
  with open(filename,"w+") as f:
    for ea,ta in zip(extracted_aspects,true_aspects):
      if 'B-aspect' in ta:
        # Score once and reuse the value for both the file and the running sum.
        sentence_f1 = f1_score([ta],[ea])
        f.write(str(sentence_f1)+"\n")
        count += 1
        f1 += sentence_f1
  print(f1)
  print(count)
  # The mean is undefined when no sentence was scored; report nan instead of
  # failing with a ZeroDivisionError.
  print(f1/count if count else float("nan"))
  
# Five independent runs of the cross-domain experiment: every run fine-tunes a
# fresh BERT token classifier on the training loader, monitors it on the
# validation loader after each epoch and evaluates it on the test loader.
for i in range(5):
  # Place the model on the same device the batches are moved to
  # (model.cuda() would fail on a machine without a GPU).
  model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))
  model.to(device)

  FULL_FINETUNING = True
  if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    # NOTE(review): torch.optim.Adam's documented per-group option is
    # 'weight_decay'; confirm whether the 'weight_decay_rate' key below has
    # any effect with this optimizer.
    optimizer_grouped_parameters = [
      {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.01},
      {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate': 0.0}]
  else:
    # Only the classification head is optimised.
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
  optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)
  epochs = 10
  max_grad_norm = 1.0

  for _ in trange(epochs, desc="Epoch"):
    # TRAIN loop
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    for batch in train_dataloader:
        # add batch to the device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # forward pass (with labels the model output is used as the loss)
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
    # print train loss per epoch
    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    predictions, true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Two forward passes: the call with labels is used as the loss,
            # the call without labels as the logits.
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    print("Validation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    # seqeval works on one tag list per sentence (recent releases reject flat
    # lists), and takes the gold tags first.
    pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
    valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
    print("F1-Score: {}".format(f1_score(valid_tags, pred_tags)))

  # TEST on the held-out domain
  model.eval()
  predictions = []
  true_labels = []
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps = 0
  for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      tmp_eval_loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  pred_tags = [[tags_vals[p_i] for p_i in p] for p in predictions]
  valid_tags = [[tags_vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l]
  # These figures come from the test loader, so they are labelled as such.
  print("Test loss: {}".format(eval_loss/nb_eval_steps))
  print("Test Accuracy: {}".format(eval_accuracy/nb_eval_steps))
  print("Test F1-Score: {}".format(f1_score(valid_tags, pred_tags)))
  print(classification_report(valid_tags,pred_tags))
  # One file of per-sentence F1 scores per run.
  print_f1_on_file("/content/drive/My Drive/Tesi_ABSA/hotels_sampled_bert_f1_"+str(i+1)+".txt",valid_tags,pred_tags)

In [None]:
import json
import re
from itertools import islice

# Inverse of the tag -> id dictionary built earlier in this file (the name
# `tags2index` used here before is not defined anywhere in the file).
idx2tag = {i: w for w, i in tag2idx.items()}
print(idx2tag)


#Function to convert the predictions to tags
def pred2label(pred):
    """Turn per-token score vectors into tag strings.

    pred: iterable of sentences, each an iterable of per-token score vectors.
    Returns one list of tags per sentence; the arg-max position of every
    vector is looked up in idx2tag and "PADword" is replaced by "O".
    """
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PADword", "O"))
        out.append(out_i)
    return out


max_len = 100

#We extract the aspects of 32.000 sentences at each iteration in order to avoid memory problems
# NOTE(review): `model.predict(..., verbose=1)` below is a Keras-style call on
# an array of token strings; the BERT model built earlier in this file is a
# PyTorch module — confirm which model this cell is meant to run with.
start = 0
# The review file is opened once and consumed chunk by chunk, instead of being
# reopened and re-scanned from the first line for every chunk.
with open("/content/drive/My Drive/Tesi_ABSA/dataset/Aspect Extraction Task/review_divided.txt") as f_rev:
  while start < 2112337:
    chunk = list(islice(f_rev, 32000))
    if not chunk:
      # The file holds fewer lines than the hard-coded bound.
      break

    reviews = []
    id_review = []
    id_sentence = []
    user_ids = []
    business_ids = []
    for line in chunk:
      # Tab-separated fields used here: 0 review number, 1 sentence number,
      # 6 user id, 7 business id, 8 sentence text.
      splitted = line.split("\t")
      reviews.append(splitted[8])
      id_review.append(int(splitted[0]))
      id_sentence.append(int(splitted[1]))
      user_ids.append(splitted[6])
      business_ids.append(splitted[7])

    #We extract all the aspects in the sentences with the trained model
    # Every sentence is truncated / padded with "PADword" to exactly max_len tokens.
    new_X_rest = []
    for rev in reviews:
      seq = rev.split()[:max_len]
      new_X_rest.append(seq + ["PADword"] * (max_len - len(seq)))
    # Pad the chunk up to the next multiple of 32 with all-padding sentences so
    # that no real sentence is cut off (a fixed 16-row padding followed by
    # truncation dropped the tail of a chunk whose size modulo 32 was 1..15).
    for _ in range((-len(new_X_rest)) % 32):
      new_X_rest.append(["PADword"] * max_len)
    print(len(reviews))
    print(len(new_X_rest))

    test_pred = model.predict(np.array(new_X_rest), verbose=1)
    pred_labels = pred2label(test_pred)

    with open("/content/drive/My Drive/Tesi_ABSA/dataset/Aspect Extraction Task/extracted_aspect_terms_wo_restaurants.txt","a+") as f, \
         open("/content/drive/My Drive/Tesi_ABSA/dataset/Aspect Extraction Task/extracted_aspect_index_wo_restaurants.txt","a+") as f_ind:
      #We convert the tags into aspect terms
      # zip stops at the shortest input, so the all-padding rows are never written.
      for (tokens, labels,rev_n,sent_n,user_id,business_id) in zip(new_X_rest, pred_labels,id_review,id_sentence,user_ids,business_ids):
        f.write(str(rev_n)+"\t")
        f.write(str(sent_n)+"\t")
        f.write(user_id+"\t")
        f.write(business_id+"\t")
        begin_end_indeces = []
        for bi, label in enumerate(labels):
          if label != "B-aspect":
            continue
          # An aspect is a "B-aspect" token plus the run of "I-aspect" tokens after it.
          current_aspect = tokens[bi]
          span = 1
          while (span < len(labels)-bi and labels[bi+span] == 'I-aspect'):
            current_aspect += ' '+tokens[bi+span]
            span += 1
          begin_end_indeces.append([bi,(bi+span-1)])
          f.write(current_aspect+"\t")
        f_ind.write(str(rev_n)+"\t"+str(sent_n)+"\t"+str(begin_end_indeces)+"\n")
        f.write("\n")
    start += 32000

# Aspect clustering 

In [None]:
#We save all the aspects into a list
aspects = []
ids_reviews = []
ids_sentences = []
# The with-block closes the file once it has been read.
with open("/content/drive/My Drive/Tesi_ABSA/dataset/Aspect Extraction Task/extracted_aspect_terms_wo_restaurants.txt") as f:
  for line in f:
    # Fields 0 and 1 are the review and sentence ids; everything from field 4
    # on is kept as the aspect entries of the line (the line's trailing "\n"
    # stays in that list and is filtered out below).
    splitted_line = line.split("\t")
    ids_reviews.append(splitted_line[0])
    ids_sentences.append(splitted_line[1])
    aspects.append(splitted_line[4:])

print(len(aspects))
# Flat, lower-cased list of every aspect term.
full_aspects_list = [el.lower() for asp in aspects for el in asp if el != "\n"]

print(full_aspects_list[:10])

In [None]:
#reading all the indices from the file
positions = []
# The with-block closes the file once it has been read.
with open("/content/drive/My Drive/Tesi_ABSA/dataset/Aspect Extraction Task/extracted_aspect_index_wo_restaurants.txt") as f_ind:
  for line in f_ind:
    # The third tab-separated field holds the index list of the line.
    positions.append(line.split("\t")[2])
print(positions[2])

In [None]:
#We download the pre-trained word2vec model
import gensim
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gunzip GoogleNews-vectors-negative300.bin.gz

In [None]:
#We generate the embedding matrix
import numpy as np
from gensim.models import KeyedVectors

filepath = "GoogleNews-vectors-negative300.bin"

print("Loading the Word2Vec model...")
wv_from_bin = KeyedVectors.load_word2vec_format(filepath, binary=True)

# Words in the same order as the rows of `vectors`. gensim 4 exposes them as
# `index_to_key` (the `vocab` attribute was removed there); older releases
# provide the same list as `index2word`.
vocabulary = getattr(wv_from_bin, "index_to_key", None)
if vocabulary is None:
    vocabulary = wv_from_bin.index2word

# word -> float32 vector
embeddings = {}
for word, vector in zip(vocabulary, wv_from_bin.vectors):
    embeddings[word] = np.asarray(vector, dtype='float32')
print('# vectors:',  len(embeddings))

In [None]:
import string
import re
#We get the vectorial representation of each aspect term
# Every term is represented by the average of the vectors of its words that
# have an embedding; a term without any known word keeps an all-zero vector.
aspects_embeddings = []
final_aspects = []
print(len(aspects))
for term in full_aspects_list:
  current_vector = np.zeros(300)
  count = 0
  for t in term.split():
    try:
      word_vector = embeddings[t]
    except KeyError:
      # Only a missing word is expected here; anything else should surface.
      print('I got a KeyError - reason: ',t)
      continue
    current_vector += word_vector
    count += 1
  # Average only when at least one word was found (avoids dividing by zero).
  if count:
    current_vector /= count
  aspects_embeddings.append(current_vector)
  final_aspects.append(term)

In [None]:
#We use the k-means on the vector representations of aspect terms
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# make_blobs is imported from sklearn.datasets directly: the
# sklearn.datasets.samples_generator module path no longer exists in recent
# scikit-learn releases, while this path works in old and new ones.
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# 50 clusters; a fixed random_state keeps the clustering repeatable.
kmeans = KMeans(n_clusters=50, init='k-means++', max_iter=300, n_init=10, random_state=0)
# y_pred[k] is the cluster index assigned to aspects_embeddings[k].
y_pred = kmeans.fit_predict(aspects_embeddings)

In [None]:
#We print on the same file the aspect terms belonging to the same cluster
from collections import defaultdict

# Group the terms per cluster first so that every cluster file is opened once
# instead of once per aspect term; the order of the terms inside a cluster is
# the order in which they appear in full_aspects_list.
aspects_by_cluster = defaultdict(list)
for aspect,cluster in zip(full_aspects_list,y_pred):
  aspects_by_cluster[cluster].append(aspect)

for cluster,cluster_aspects in aspects_by_cluster.items():
  # Append mode: existing files of a previous run are extended, not replaced.
  with open("aspect_cluster_k50_wo_restaurants_"+str(cluster)+".txt","a+") as f:
    f.writelines(aspect+"\n" for aspect in cluster_aspects)

In [None]:
#We generate a final json containing all the information extracted during this process
import json
json_list = []
count = 0
reviews = []
id_review = []
id_sentence = []
user_ids = []
business_ids = []
comp_list = []
neg_list = []
neu_list = []
pos_list = []
# The with-block closes the review file once it has been read.
with open("/content/drive/My Drive/Tesi_ABSA/dataset/Aspect Extraction Task/review_divided.txt") as f_rev:
  for line in f_rev:
    # Tab-separated fields used here: 0 review number, 1 sentence number,
    # 2-5 the four scores stored as compound/negative/neutral/positive,
    # 6 user id, 7 business id.
    splitted = line.split("\t")
    id_review.append(int(splitted[0]))
    id_sentence.append(int(splitted[1]))
    comp_list.append(float(splitted[2]))
    neg_list.append(float(splitted[3]))
    neu_list.append(float(splitted[4]))
    pos_list.append(float(splitted[5]))
    user_ids.append(splitted[6])
    business_ids.append(splitted[7])

print(len(comp_list))
# `count` walks through y_pred: one cluster label per aspect term, in the same
# order in which the terms were flattened into full_aspects_list.
for rev_id, sent_id, asp, position, compound, negative, neutral, positive, user, business in zip(id_review,id_sentence,aspects,positions,comp_list,neg_list,neu_list,pos_list,user_ids,business_ids):
  # strip() removes the line ending without cutting a character off a last
  # line that has no trailing newline.
  idx = json.loads(position.strip())
  aspects_list = []
  if len(asp) != 0:
    # zip pairs every [start, end] span with its aspect term and stops at the
    # shorter of the two sequences.
    for span,aspect in zip(idx,asp):
      a = {
          "aspect_term":aspect,
          "id_starting_token":span[0],
          "id_ending_token":span[1],
          "cluster":str(y_pred[count]),
          "compund":str(compound),
          "negative":str(negative),
          "neutral":str(neutral),
          "positive":str(positive)
      }
      aspects_list.append(a)
      count+=1
    x = {
        "id_reviews":rev_id,
        "id_sentence":sent_id,
        "aspects":aspects_list
    }
    json_list.append(x)

final_json = json.dumps(json_list,indent=2)
with open("json_extracted_aspects_k50_wo_restaurants.json","w+") as f:
  f.write(final_json)
