# Single Domain Experiments

In [None]:
#Function that writes the per-sentence F1 of the extracted aspects to a file and prints a summary
def print_f1_on_file(filename,true_aspects,extracted_aspects):
  """Write one F1 score per aspect-bearing sentence to ``filename``.

  Only sentences whose gold tag sequence contains ``'B-aspect'`` are scored.
  Each score is written on its own line. Afterwards the sum of the scores, the
  number of scored sentences and their mean are printed, in that order.

  Args:
    filename: path of the text file to create (an existing file is truncated).
    true_aspects: gold tag sequences, one list of tags per sentence.
    extracted_aspects: predicted tag sequences, paired positionally with
      ``true_aspects``.
  """
  count = 0
  f1 = 0.0
  # `with` guarantees the file is closed even if scoring a sentence raises
  with open(filename,"w+") as f:
    for ea,ta in zip(extracted_aspects,true_aspects):
      if 'B-aspect' in ta:
        # score once and reuse the value for both the file and the running sum
        score = f1_score([ta],[ea])
        f.write(str(score)+"\n")
        count += 1
        f1 += score
  print(f1)
  print(count)
  # without any scored sentence the mean is undefined; report 0.0 instead of raising
  print(f1/count if count else 0.0)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gensim

import os
%tensorflow_version 1.x
import tensorflow as tf
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 

In [None]:
#Mount Google Drive under /content/drive (the datasets are read from there later on)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#Class that returns the sentences as lists of tuples [(TOKEN,TAG),(TOKEN,TAG)...]
class SentenceGetter(object):
    """Group a token-per-row table into sentences of ``(TOKEN, TAG)`` tuples.

    ``data`` is a DataFrame with the columns ``SENTENCE`` (the sentence label of
    each row), ``TOKEN`` and ``TAG``.

    Attributes:
        n_sent: 1-based number of the sentence ``get_next`` returns next.
        data: the DataFrame passed to the constructor.
        empty: flag initialised to ``False``; not used by this class itself.
        grouped: Series indexed by sentence label whose values are the lists of
            ``(token, tag)`` tuples.
        sentences: the values of ``grouped`` as a plain list. pandas' ``groupby``
            sorts the group keys by default, so the order is the sorted order of
            the ``SENTENCE`` labels (compared as strings), not the row order.
    """
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def to_pairs(rows):
            # pair every token of one sentence with its tag, keeping the row order
            return list(zip(rows["TOKEN"].values.tolist(), rows["TAG"].values.tolist()))

        self.grouped = self.data.groupby("SENTENCE").apply(to_pairs)
        self.sentences = list(self.grouped)
    
    def get_next(self):
        """Return the sentence labelled ``"Sentence: <n_sent>"`` and advance the counter.

        Returns ``None`` (without advancing) when no sentence has that label.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # only a missing label means "no more sentences"; any other error propagates
            return None
        self.n_sent += 1
        return s

In [None]:
#Download the pre-trained GoogleNews word2vec model (gzip archive) and decompress it
#into the working directory, where the next cell loads it from
!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
!gunzip GoogleNews-vectors-negative300.bin.gz

In [None]:
#Convert the downloaded model into a plain dictionary {word: vector}
import numpy as np
from gensim.models import KeyedVectors

filepath = "GoogleNews-vectors-negative300.bin"

print("Loading the Word2Vec model...")
wv_from_bin = KeyedVectors.load_word2vec_format(filepath, binary=True)

# Pair every row of `vectors` with its word through the model's ordered key list
# rather than through the iteration order of the `vocab` dict. gensim >= 4 exposes
# that list as `index_to_key`, gensim 3.x as `index2word`.
words_in_row_order = getattr(wv_from_bin, "index_to_key", None)
if words_in_row_order is None:
    words_in_row_order = wv_from_bin.index2word

embeddings = {
    word: np.asarray(vector, dtype='float32')
    for word, vector in zip(words_in_row_order, wv_from_bin.vectors)
}
print('# vectors:',  len(embeddings))

In [None]:
#Load the six domain datasets; every file is tab-separated and latin-1 encoded
def _load_dataset(path):
  """Read one dataset file into a DataFrame."""
  return pd.read_csv(path, encoding="latin-1", sep="\t")

data_rest = _load_dataset('/content/drive/My Drive/Tesi_ABSA/dataset/restaurants1.csv')
data_lap = _load_dataset('/content/drive/My Drive/Tesi_ABSA/dataset/laptops_2014.csv')
data_hotels = _load_dataset('/content/drive/My Drive/Tesi_ABSA/dataset/hotels.csv')
data_comp = _load_dataset('/content/drive/My Drive/Tesi_ABSA/dataset/Computer.csv')
data_speaker = _load_dataset('/content/drive/My Drive/Tesi_ABSA/dataset/Speaker.csv')
data_router = _load_dataset('/content/drive/My Drive/Tesi_ABSA/dataset/Router.csv')

In [None]:
#Group the rows of every dataset into sentences of (token, tag) tuples
(sentences_rest,
 sentences_lap,
 sentences_hotels,
 sentences_comp,
 sentences_speaker,
 sentences_router) = [
    SentenceGetter(frame).sentences
    for frame in (data_rest, data_lap, data_hotels, data_comp, data_speaker, data_router)
]

In [None]:
#Build the token lists (X) and the tag lists (y) of each domain
max_len = 85

def _words_of(sentences):
  """Return, for every sentence, the list of its tokens converted to str."""
  return [[str(pair[0]) for pair in sentence] for sentence in sentences]

def _tags_of(sentences):
  """Return, for every sentence, the list of its tags."""
  return [[pair[1] for pair in sentence] for sentence in sentences]

sentences_words_rest = _words_of(sentences_rest)
sentences_tags_rest = _tags_of(sentences_rest)
sentences_words_lap = _words_of(sentences_lap)
sentences_tags_lap = _tags_of(sentences_lap)
sentences_words_hotels = _words_of(sentences_hotels)
sentences_tags_hotels = _tags_of(sentences_hotels)
sentences_words_comp = _words_of(sentences_comp)
sentences_tags_comp = _tags_of(sentences_comp)
sentences_words_speaker = _words_of(sentences_speaker)
sentences_tags_speaker = _tags_of(sentences_speaker)
sentences_words_router = _words_of(sentences_router)
sentences_tags_router = _tags_of(sentences_router)

#Sentences and tags of the six datasets concatenated (same domain order for both lists)
sentences_words = (sentences_words_rest
                   + sentences_words_lap
                   + sentences_words_hotels
                   + sentences_words_comp
                   + sentences_words_speaker
                   + sentences_words_router)
sentences_tags = (sentences_tags_rest
                  + sentences_tags_lap
                  + sentences_tags_hotels
                  + sentences_tags_comp
                  + sentences_tags_speaker
                  + sentences_tags_router)

In [None]:
#Function for converting all the tags into indices
def convert_tags_to_id(tags):
  """Map every tag sequence to the corresponding sequence of integer ids.

  The mapping is ``"O" -> 0``, ``"I-aspect" -> 1``, ``"B-aspect" -> 2``.

  Any other value is also mapped to 0 (the "outside" id) instead of being
  skipped. This keeps every id sequence exactly as long as its tag sequence,
  so the ids stay aligned with the tokens they label after padding.

  Args:
    tags: iterable of tag sequences, one per sentence.

  Returns:
    A list with one list of ids per input sequence, in the same order.
  """
  tag_to_id = {"O": 0, "I-aspect": 1, "B-aspect": 2}
  return [[tag_to_id.get(t, 0) for t in sentence_tags] for sentence_tags in tags]

In [None]:
#Install seqeval, the package the next cell imports the sequence-labelling metrics from
!pip install seqeval

In [None]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
from keras.models import Model, Input
from keras.layers.merge import add
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
import itertools

In [None]:
#We perform a 5-fold cross validation on the restaurant dataset
#shuffle=False gives contiguous, deterministic folds, so random_state is not passed:
#it has no effect without shuffling and recent scikit-learn releases reject the combination
cv = KFold(n_splits=5, shuffle=False)
MAX_LEN = 85
#Label ids are the ones produced by convert_tags_to_id; id 0 is also the value
#pad_sequences writes on padded positions. The output layer is sized from this mapping
#so that every class the model can predict decodes to a real tag.
tag_index = {"O":0, "I-aspect":1, "B-aspect":2}
index_tag = {i:w for w, i in tag_index.items()}
index_tag_wo_padding = dict(index_tag)
run = 0
for train_index, test_index in cv.split(sentences_words_rest):
  run += 1
  print("Run n° ", run)

  #Free the graph and the weights of the previous fold's model before building a new one
  K.clear_session()

  #creation of training and test set
  train_sentences_words = [sentences_words_rest[i] for i in train_index]
  train_sentences_tags = [sentences_tags_rest[i] for i in train_index]
  test_sentences_words = [sentences_words_rest[i] for i in test_index]
  test_sentences_tags = [sentences_tags_rest[i] for i in test_index]
  #Kept under a fold-specific name: the notebook-level `sentences_words` (all six
  #datasets) is read again by the cross-domain cell and must not be overwritten here
  fold_sentences_words = train_sentences_words + test_sentences_words

  #Generation of the vocabulary (distinct tokens of the training sentences)
  vocab = set(itertools.chain.from_iterable(train_sentences_words))
  VOCAB_SIZE = len(vocab)

  #We define a mapping between words and indices
  words_tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters=[], oov_token='__UNKNOWN__')
  words_tokenizer.fit_on_texts([' '.join(s) for s in fold_sentences_words])
  word_index = words_tokenizer.word_index
  word_index['__PADDING__'] = 0
  index_word = {i:w for w, i in word_index.items()}
  print('Unique tokens:', len(word_index))

  #we define train and test sequences and convert the tags into indices
  train_sequences = words_tokenizer.texts_to_sequences([' '.join(s) for s in train_sentences_words])
  test_sequences = words_tokenizer.texts_to_sequences([' '.join(s) for s in test_sentences_words])
  train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LEN)
  test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LEN)
  print('Unique tags:', len(tag_index))
  train_tags = convert_tags_to_id(train_sentences_tags)
  test_tags = convert_tags_to_id(test_sentences_tags)
  train_tags_padded = pad_sequences(train_tags, maxlen=MAX_LEN)
  test_tags_padded = pad_sequences(test_tags, maxlen=MAX_LEN)
  #sparse_categorical_crossentropy expects one trailing label dimension per time step
  train_tags_padded = np.expand_dims(train_tags_padded, -1)
  test_tags_padded = np.expand_dims(test_tags_padded, -1)
  
  #We create the embedding matrix: row i holds the pre-trained vector of the word with
  #index i; words without a pre-trained vector keep an all-zero row
  num_words = min(VOCAB_SIZE, len(word_index) + 1)
  embedding_matrix = np.zeros((num_words, 300))
  for word, i in word_index.items():
    if i >= VOCAB_SIZE:
      continue
    embedding_vector = embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

  #We define our model
  pretrained_embedding_layer = Embedding(num_words,
                                300,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_LEN,
                                trainable=True)

  sequence_input = Input(shape=(MAX_LEN,), dtype='int32')
  embedded_sequences = pretrained_embedding_layer(sequence_input)
  x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedded_sequences)
  x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
  x = add([x, x_rnn])  # residual connection to the first biLSTM
  out = TimeDistributed(Dense(len(tag_index), activation="softmax"))(x)
  model = Model(sequence_input, out)
  model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
  
  #fitting the model: the first 80% of the training fold is used for fitting, the rest
  #for validation. A single cut point keeps the two parts disjoint and never turns the
  #validation slice into the whole fold (which a `[-0:]` slice would do on tiny folds)
  n_fit = int(len(train_sentences_words)*0.8)
  model.fit(train_sequences_padded[:n_fit], train_tags_padded[:n_fit],
            batch_size=32,
            epochs=10,
            validation_data=(train_sequences_padded[n_fit:], train_tags_padded[n_fit:]))
  
  #prediction on the test set
  lstm_predicted = model.predict(test_sequences_padded)
  lstm_predicted_tags = []
  for s, s_pred in zip(test_sentences_words, lstm_predicted):
    predicted = [index_tag_wo_padding[int(i)] for i in np.argmax(s_pred, axis=1)]
    n_pad = MAX_LEN - len(s)
    if n_pad >= 0:
      #pad_sequences pads on the left by default: drop the padded positions
      predicted = predicted[n_pad:]
    else:
      #sentence longer than MAX_LEN: its first tokens were truncated away and got no
      #prediction; label them "O" so the result is as long as the gold sequence
      predicted = ["O"] * (-n_pad) + predicted
    lstm_predicted_tags.append(predicted)
  
  #print the performances
  print(classification_report(test_sentences_tags, lstm_predicted_tags))

# Cross-Domain Experiments

In [None]:
#Split each of the six datasets into a training and a test part (80/20, fixed seed)
def _split_domain(words, tags):
  """Return (train_words, test_words, train_tags, test_tags) for one domain."""
  return train_test_split(words, tags, test_size=0.2, random_state=2018)

(train_sentences_words_rest, test_sentences_words_rest,
 train_sentences_tags_rest, test_sentences_tags_rest) = _split_domain(sentences_words_rest, sentences_tags_rest)
(train_sentences_words_lap, test_sentences_words_lap,
 train_sentences_tags_lap, test_sentences_tags_lap) = _split_domain(sentences_words_lap, sentences_tags_lap)
(train_sentences_words_hotels, test_sentences_words_hotels,
 train_sentences_tags_hotels, test_sentences_tags_hotels) = _split_domain(sentences_words_hotels, sentences_tags_hotels)
(train_sentences_words_comp, test_sentences_words_comp,
 train_sentences_tags_comp, test_sentences_tags_comp) = _split_domain(sentences_words_comp, sentences_tags_comp)
(train_sentences_words_speaker, test_sentences_words_speaker,
 train_sentences_tags_speaker, test_sentences_tags_speaker) = _split_domain(sentences_words_speaker, sentences_tags_speaker)
(train_sentences_words_router, test_sentences_words_router,
 train_sentences_tags_router, test_sentences_tags_router) = _split_domain(sentences_words_router, sentences_tags_router)

#Five of the six domains are concatenated: their training parts form the training set,
#their test parts form the validation set
train_sentences_words = (train_sentences_words_comp + train_sentences_words_router
                         + train_sentences_words_lap + train_sentences_words_rest
                         + train_sentences_words_speaker)
train_sentences_tags = (train_sentences_tags_comp + train_sentences_tags_router
                        + train_sentences_tags_lap + train_sentences_tags_rest
                        + train_sentences_tags_speaker)
val_sentences_words = (test_sentences_words_comp + test_sentences_words_router
                       + test_sentences_words_lap + test_sentences_words_rest
                       + test_sentences_words_speaker)
val_sentences_tags = (test_sentences_tags_comp + test_sentences_tags_router
                      + test_sentences_tags_lap + test_sentences_tags_rest
                      + test_sentences_tags_speaker)

#The remaining domain (hotels) is used entirely as test set
test_sentences_words = train_sentences_words_hotels + test_sentences_words_hotels
test_sentences_tags = train_sentences_tags_hotels + test_sentences_tags_hotels

In [None]:
#We follow the same steps of the single domain experiments
#We perform five iterations to handle the problem of weights that are randomly instantiated
NUM_RUNS = 5
MAX_LEN = 75
#Label ids are the ones produced by convert_tags_to_id
tag_index = {"O":0, "I-aspect":1,"B-aspect":2}
index_tag = {i:w for w, i in tag_index.items()}
index_tag_wo_padding = {0:"O",1:"I-aspect", 2:"B-aspect"}
print('Unique tags:', len(tag_index))

#Everything up to the embedding matrix does not depend on the run, so it is built once.
#The vocabulary and the tokenizer are fitted on the sentences of this experiment
#(training + validation + test, i.e. all six domains) instead of on the notebook-level
#`sentences_words`, which the single-domain cell reassigns to restaurant data only.
corpus_sentences_words = train_sentences_words + val_sentences_words + test_sentences_words
vocab = set(itertools.chain.from_iterable(corpus_sentences_words))
VOCAB_SIZE = len(vocab)
words_tokenizer = Tokenizer(num_words=VOCAB_SIZE, filters=[], oov_token='__UNKNOWN__')
words_tokenizer.fit_on_texts([' '.join(s) for s in corpus_sentences_words])
word_index = words_tokenizer.word_index
word_index['__PADDING__'] = 0
index_word = {i:w for w, i in word_index.items()}
print('Unique tokens:', len(word_index))

#we define train, validation and test sequences
train_sequences = words_tokenizer.texts_to_sequences([' '.join(s) for s in train_sentences_words])
test_sequences = words_tokenizer.texts_to_sequences([' '.join(s) for s in test_sentences_words])
val_sequences = words_tokenizer.texts_to_sequences([' '.join(s) for s in val_sentences_words])
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LEN)
test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LEN)
val_sequences_padded = pad_sequences(val_sequences, maxlen=MAX_LEN)

train_tags = convert_tags_to_id(train_sentences_tags)
test_tags = convert_tags_to_id(test_sentences_tags)
val_tags = convert_tags_to_id(val_sentences_tags)
train_tags_padded = pad_sequences(train_tags, maxlen=MAX_LEN)
test_tags_padded = pad_sequences(test_tags, maxlen=MAX_LEN)
val_tags_padded = pad_sequences(val_tags, maxlen=MAX_LEN)

#sparse_categorical_crossentropy expects one trailing label dimension per time step
train_tags_padded = np.expand_dims(train_tags_padded, -1)
test_tags_padded = np.expand_dims(test_tags_padded, -1)
val_tags_padded = np.expand_dims(val_tags_padded, -1)

#Row i of the embedding matrix holds the pre-trained vector of the word with index i;
#words without a pre-trained vector keep an all-zero row
num_words = min(VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, 300))
for word, i in word_index.items():
  if i >= VOCAB_SIZE:
    continue
  embedding_vector = embeddings.get(word)
  if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector
print(num_words)

for run in range(1, NUM_RUNS + 1):
  print("Run n° ", run)

  #Free the graph and the weights of the previous run's model before building a new one
  K.clear_session()

  pretrained_embedding_layer = Embedding(num_words,
                                300,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_LEN,
                                trainable=True)

  sequence_input = Input(shape=(MAX_LEN,), dtype='int32')
  embedded_sequences = pretrained_embedding_layer(sequence_input)
  x = Bidirectional(LSTM(units=512, return_sequences=True,
                       recurrent_dropout=0.2, dropout=0.2))(embedded_sequences)
  x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)
  x = add([x, x_rnn])  # residual connection to the first biLSTM
  out = TimeDistributed(Dense(len(tag_index), activation="softmax"))(x)
  model = Model(sequence_input, out)
  model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
  model.fit(train_sequences_padded, train_tags_padded,
            batch_size=32,
            epochs=10,
            validation_data=(val_sequences_padded, val_tags_padded))

  #prediction on the test set
  lstm_predicted = model.predict(test_sequences_padded)
  lstm_predicted_tags = []
  for s, s_pred in zip(test_sentences_words, lstm_predicted):
    predicted = [index_tag_wo_padding[int(i)] for i in np.argmax(s_pred, axis=1)]
    n_pad = MAX_LEN - len(s)
    if n_pad >= 0:
      #pad_sequences pads on the left by default: drop the padded positions
      predicted = predicted[n_pad:]
    else:
      #sentence longer than MAX_LEN: its first tokens were truncated away and got no
      #prediction; label them "O" so the result is as long as the gold sequence
      predicted = ["O"] * (-n_pad) + predicted
    lstm_predicted_tags.append(predicted)

  #predicted vs. gold tags of the first test sentence, as a quick sanity check
  for s,s1 in zip(lstm_predicted_tags[0],test_sentences_tags[0]):
    print(s, " ", s1)
  print(classification_report(test_sentences_tags, lstm_predicted_tags))
  #the file suffix is the run number printed above (1..NUM_RUNS)
  print_f1_on_file("/content/drive/My Drive/Tesi_ABSA/hotels_w2v_full_f1_"+str(run)+".txt",test_sentences_tags,lstm_predicted_tags)