Check the version of Python

In [None]:
!python -V

Python 3.7.13


## Data Preparation

This step prepares the data. We convert the raw data from .xml format into a pandas DataFrame, which is more convenient for further processing. If the notebook is run on a local computer, the raw files can be read from a local folder.

In [None]:
# Set up the data paths. Change `directory` to wherever the files live on your machine.
# Every Windows path fragment is a raw string so that a backslash is never read as the
# start of an escape sequence ('\L' and '\R' are invalid escapes in a normal string).
directory = r'C:\Users\Asus\Downloads\Subtask1_ABSA_Aspect_Term_Extraction\data'
laptop_train = directory + r'\Laptops_Train.xml'
restaurant_train = directory + r'\Restaurants_Train.xml'

Alternatively, we can run the notebook on Google Colab and read the files from Google Drive.

In [None]:
# codes to mount your google drive folder
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/SMU_MITB_NLP/

Mounted at /content/drive
/content/drive/MyDrive/SMU_MITB_NLP


In [None]:
# paths to files in Google drive
laptop_train = './Projects/Laptops_Train.xml'
restaurant_train = './Projects/Restaurants_Train.xml'

The functions below read the data from the .xml files and convert it into a DataFrame.

In [None]:
#@title Raw data processing
# The raw datasets are given in .xml format
# This function turns the raw data into data frame .

import pandas as pd
import xml.etree.ElementTree as ET

def data_processor(input_path):
    """Parse an ABSA XML file into a DataFrame with one row per aspect term.

    Parameters
    ----------
    input_path : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``. The root element
        is expected to contain ``<sentence id="...">`` elements, each holding a
        ``<text>`` child and, optionally, an ``<aspectTerms>`` container with
        ``<aspectTerm term="...">`` children.

    Returns
    -------
    pandas.DataFrame
        Columns ``Review ID``, ``Review`` and ``Aspect Term``. Sentences that
        have no aspect terms contribute no rows. Exact duplicate rows are
        dropped; the index is not reset afterwards.

    Raises
    ------
    KeyError
        If a sentence that has aspect terms lacks the ``id`` attribute.
    """
    root = ET.parse(input_path).getroot()

    rows = []
    for sen in root.findall("sentence"):
        aspect_terms = sen.find("aspectTerms")
        # Compare with None explicitly: the truth value of an Element depends
        # on whether it has children, and truth-testing elements is deprecated.
        if aspect_terms is None:
            continue

        # Look the review text up by tag name instead of assuming it is the
        # first child of <sentence>.
        review_text = sen.findtext("text")
        for branch in aspect_terms.findall("aspectTerm"):
            rows.append((sen.attrib["id"], review_text, branch.get("term")))

    # Convert to a pandas DataFrame and remove exact duplicates.
    df = pd.DataFrame(rows, columns=["Review ID", "Review", "Aspect Term"])
    return df.drop_duplicates()


In [None]:
# Split a dataframe into a training part and a validation part (80:20 by default)
def data_split(df,train_size=0.8,random_state=2022):
    """Randomly partition ``df`` into training and validation frames.

    A fraction ``train_size`` of the rows is sampled (seeded by
    ``random_state``) for training; the rows whose index labels were not
    sampled form the validation frame. Both frames get a fresh 0..n-1 index.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, valid_df)``.
    """
    sampled = df.sample(frac=train_size, random_state=random_state)
    remainder = df.drop(sampled.index)
    return sampled.reset_index(drop=True), remainder.reset_index(drop=True)

We'll create 2 datasets, one for train and one for valid. The valid dataset will be used to measure the performance of our models. 

In [None]:
# Process the data
df_laptop = data_processor(laptop_train) 
df_rest = data_processor(restaurant_train)

In [None]:
# Let's see what our dataframe looks like.
df_laptop.head(2)

Unnamed: 0,Review ID,Review,Aspect Term
0,2339,I charge it at night and skip taking the cord ...,cord
1,2339,I charge it at night and skip taking the cord ...,battery life


In [None]:
# Currently, each aspect term is in a different row even though several aspect terms
# can belong to the same review. We gather the aspect terms of a review into one row.

def _collapse_aspect_terms(df):
    """Return a copy of ``df`` with one row per (Review ID, Review) pair.

    The 'Aspect Term' values of all rows sharing the same review are joined
    with ',' (in row order); the now-identical rows are then de-duplicated and
    the index is renumbered from 0.
    """
    collapsed = df.copy()
    collapsed['Aspect Term'] = (
        collapsed.groupby(['Review ID', 'Review'])['Aspect Term']
        .transform(lambda x: ','.join(x))
    )
    return collapsed.drop_duplicates(ignore_index=True)

# The same collapsing step is applied to both domains.
df_laptop_collapsed = _collapse_aspect_terms(df_laptop)
df_rest_collapsed = _collapse_aspect_terms(df_rest)

In [None]:
# See the result
df_laptop_collapsed.head(2)

Unnamed: 0,Review ID,Review,Aspect Term
0,2339,I charge it at night and skip taking the cord ...,"cord,battery life"
1,1316,The tech guy then said the service center does...,"service center,""sales"" team,tech guy"


In [None]:
# Create the train and validation datasets.
# Note: we must split df_laptop_collapsed and df_rest_collapsed, not df_laptop and df_rest.
train_df_laptop, valid_df_laptop = data_split(df_laptop_collapsed)
train_df_rest, valid_df_rest = data_split(df_rest_collapsed)

# Both domains are trained on together, so the review IDs are suffixed with the
# domain name first; otherwise the same ID could refer to a laptop review and
# to a restaurant review.
for frame, suffix in (
    (train_df_laptop, '_laptop'),
    (train_df_rest, '_restaurant'),
    (valid_df_laptop, '_laptop'),
    (valid_df_rest, '_restaurant'),
):
    frame['Review ID'] = frame['Review ID'].map(str) + suffix

train_df = pd.concat([train_df_laptop, train_df_rest], ignore_index=True)
valid_df = pd.concat([valid_df_laptop, valid_df_rest], ignore_index=True)

In [None]:
# Preview our training data
train_df.head(5)

Unnamed: 0,Review ID,Review,Aspect Term
0,2909_laptop,Have had many higher priced computers crash an...,"memory,speed,priced"
1,2494_laptop,The big screen allows you to enjoy watching mo...,screen
2,2578_laptop,Additional caveat: the base installation comes...,"base installation,software"
3,888_laptop,The mousepad is a huge pain in the arse!,mousepad
4,577_laptop,It gives me the power and speed that I need to...,"power,speed,programs"


In [None]:
# Preview our valid data
valid_df.head(5)

Unnamed: 0,Review ID,Review,Aspect Term
0,2568_laptop,I love the way the entire suite of software wo...,suite of software
1,295_laptop,The speed is incredible and I am more than sat...,speed
2,2980_laptop,I can barely use any usb devices because they ...,usb devices
3,2202_laptop,Pairing it with an iPhone is a pure pleasure -...,syncing
4,2227_laptop,"I also got the added bonus of a 30"" HD Monitor...","30"" HD Monitor,screen"


In [None]:
# See our data sizes
print("Train data size:", len(train_df))
print("Valid data size:", len(valid_df))

Train data size: 2812
Valid data size: 703


To solve the Aspect Term Extraction (ATE) task, in this project, we'll explore 3 approaches:
- The first approach is simply dictionary-based. In this approach, we will use the aspect term in our training dataset to create a dictionary of aspect terms. Then, we'll use that dictionary to look up the terms in the valid dataset
- The second approach uses word embeddings (Word2Vec-style) with a convolutional model. Let's see if word representations can do a better job. The word representation we'll use is the pretrained GloVe file "glove.6B.300d.txt".
- The third approach is based on pretrained models, which have achieved SOTA performance on many NLP tasks. The pretrain model we'll mainly explore is BERT. 

## Approach 1: Dictionary-based

In [None]:
# Preview our training data
train_df.head(2)

Unnamed: 0,Review ID,Review,Aspect Term
0,2909_laptop,Have had many higher priced computers crash an...,"memory,speed,priced"
1,2494_laptop,The big screen allows you to enjoy watching mo...,screen


In [None]:
# Build the dictionary: the set of distinct aspect terms seen in the training data.
# Each 'Aspect Term' cell holds the terms of one review joined by ','.
term_dictionary = {
    term
    for joined_terms in train_df['Aspect Term']
    for term in joined_terms.split(',')
}
print("Length of dictionary:", len(term_dictionary))

Length of dictionary: 1917


In [None]:
from nltk import ngrams

# For each validation review, collect every 1-, 2- and 3-gram (built from the
# whitespace-split review) that is present in the dictionary. Unigram matches
# come first, then bigrams, then trigrams; a repeated match is kept each time.
data_l = []  # one record per review, turned into the result dataframe below
for review_id, review, true_terms in zip(
    valid_df['Review ID'], valid_df['Review'], valid_df['Aspect Term']
):
    words = review.split()
    aspect_terms = []
    for n in [1, 2, 3]:
        for gram in ngrams(words, n):
            candidate = ' '.join(gram)
            if candidate in term_dictionary:
                aspect_terms.append(candidate)
    data_l.append({
        'Review ID': review_id,
        'True Aspect Term': true_terms,
        'Predicted Aspect Term List': aspect_terms,
    })
# Dataframe holding the true aspect terms next to the predicted ones
result = pd.DataFrame(data_l)
result.head(5)

Unnamed: 0,Review ID,True Aspect Term,Predicted Aspect Term List
0,2568_laptop,suite of software,"[software, works]"
1,295_laptop,speed,[speed]
2,2980_laptop,usb devices,[use]
3,2202_laptop,syncing,[]
4,2227_laptop,"30"" HD Monitor,screen","[HD, screen]"


In [None]:
# Check performance (term-level precision / recall / F1 of the dictionary lookup)
import numpy as np
true_term = list(result['True Aspect Term'])
pred_term = list(result['Predicted Aspect Term List'])
# True positives: gold terms that also appear in the prediction list of the same review
correct = sum(
    1
    for gold, predicted in zip(true_term, pred_term)
    for term in gold.split(',')
    if term in predicted
)
# Denominators are taken from the same rows as `correct`, so numerator and
# denominator can never refer to different sets of reviews.
total_true = sum(len(gold.split(',')) for gold in true_term)
total_pred = sum(len(predicted) for predicted in pred_term)
# Guard every division so an empty frame or an empty prediction set yields 0
# instead of raising ZeroDivisionError.
recall = correct / total_true if total_true else 0.0
precision = correct / total_pred if total_pred else 0.0
f1_score = 2*precision*recall/(precision+recall) if (precision + recall) else 0.0
print(f"Recall: {round(recall*100,2)}%, ", f"Precision: {round(precision*100,2)}%, ", f"F1 Score: {round(f1_score*100,2)}% ")

Recall: 46.15%,  Precision: 49.07%,  F1 Score: 47.57% 


## Approach 2: Word2Vec

In [None]:
# Download the word representation file 'glove.6B.300d.txt'
# Locate the file 
# paths to files in Google drive
word_vec = './Projects/glove.6B.300d.txt'

In [None]:
### Create a dictionary containing word representation: {word: embeddings}
import numpy as np
embeddings_index = {}
# Open with an explicit encoding (the platform default is not UTF-8 everywhere,
# e.g. on Windows) and use a context manager so the file is closed even if
# parsing a line fails.
with open(word_vec, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        word = values[0]                                   # first field: the word
        coefs = np.asarray(values[1:], dtype='float32')    # remaining fields: its vector
        embeddings_index[word] = coefs

print(f'Found {len(embeddings_index)} word vectors.')

Found 400000 word vectors.


Now, we'll need to edit the input format a little so that it better suits the preprocessing method that we'll apply to our data later.

In [None]:
# Let's see what the current data looks like:
train_df[train_df['Review ID'] == '2909_laptop']

Unnamed: 0,Review ID,Review,Aspect Term
0,2909_laptop,Have had many higher priced computers crash an...,"memory,speed,priced"


In [None]:
import re

# Work on copies so that the comma-separated originals stay available.
train_df1 = train_df.copy()
valid_df1 = valid_df.copy()

# Replace the ',' separating aspect terms with a space in both frames.
for frame in (train_df1, valid_df1):
    frame['Aspect Term'] = frame['Aspect Term'].apply(lambda terms: re.sub(',', ' ', terms))

print(len(train_df1))
print(len(valid_df1))

2812
703


In [None]:
# After
train_df1[train_df1['Review ID'] == '2909_laptop']
# So basically, we just replace the comma ',' between aspect terms with a space. We don't want it to appear in the result when we tokenize the aspect terms.

Unnamed: 0,Review ID,Review,Aspect Term
0,2909_laptop,Have had many higher priced computers crash an...,memory speed priced


In [None]:
train_df1.shape

(2812, 3)

Vectorize the input

In [None]:
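# First, let's define the preprocessing step (clean, tokenize, lowercase and stem).
# Note: `stop_words` is built below, but preprocess() does not filter stopwords out.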
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re

stop_words = list(stopwords.words('english'))
stop_words.extend(['The','I'])
stemmer = PorterStemmer()

def preprocess(review):
    """Clean, tokenize, lowercase and stem a piece of text.

    Every run of characters other than ASCII letters, digits, '.' and ',' is
    replaced by a single space. The result is split with ``word_tokenize``;
    each token is lowercased and passed through ``stemmer.stem``.

    Returns
    -------
    list of str
        The stemmed tokens, in order of appearance.
    """
    cleaned = re.sub('[^a-zA-Z0-9.,]+', ' ', review)
    stems = []
    for token in word_tokenize(cleaned):
        stems.append(stemmer.stem(token.lower()))
    return stems
# Preprocessing the input
train_df1['Review_Preprocessed']= train_df1['Review'].apply(preprocess)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
train_df1.head(3)

Unnamed: 0,Review ID,Review,Aspect Term,Review_Preprocessed
0,2909_laptop,Have had many higher priced computers crash an...,memory speed priced,"[have, had, mani, higher, price, comput, crash..."
1,2494_laptop,The big screen allows you to enjoy watching mo...,screen,"[the, big, screen, allow, you, to, enjoy, watc..."
2,2578_laptop,Additional caveat: the base installation comes...,base installation software,"[addit, caveat, the, base, instal, come, with,..."


In [None]:
# Fit token on review corpus
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=5000, lower=False) #This number can be tuned.

reviews = list(train_df1['Review_Preprocessed'])
tokenizer.fit_on_texts(reviews) # Tokenize the corpus
sequences = tokenizer.texts_to_sequences(reviews) # turn sequences of words into sequences of word_id

word_index = tokenizer.word_index # Word_index will return the unique words in our corpus
print('Found %s unique words in the corpus.' % len(word_index))
print ("Let's have a look at the word_index")
print (list(word_index.items())[:10])

Found 3818 unique words in the corpus.
Let's have a look at the word_index
[('the', 1), ('.', 2), (',', 3), ('and', 4), ('to', 5), ('a', 6), ('i', 7), ('is', 8), ('it', 9), ('of', 10)]


In [None]:
# Padding the sequence
MAX_SEQ_LENGTH = train_df1['Review_Preprocessed'].apply(lambda x: len(x)).max()
from keras.preprocessing.sequence import pad_sequences
data = pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post')
print('Shape of data tensor:', data.shape)

Shape of data tensor: (2812, 80)


In [None]:
## Set up the training targets: one 0/1 mask per review that marks aspect-term tokens
train_out = np.zeros(shape=data.shape)

review_and_terms = zip(train_df1['Review_Preprocessed'], train_df1['Aspect Term'])
for item, (r, raw_terms) in enumerate(review_and_terms):
    # Aspect-term mask for this review: 1 at aspect-term positions, 0 elsewhere
    indices = np.zeros(MAX_SEQ_LENGTH)
    # The aspect terms go through the same preprocessing as the review so the
    # stemmed tokens are comparable.
    for term in preprocess(raw_terms):
        # By right, 'term' should be in 'r', but the training data sometimes has errors.
        if term in r:
            indices[r.index(term)] = 1  # only the first occurrence is marked
    train_out[item] = indices


Reviewing an example of training data

In [None]:
train_df1.iloc[1:2]

Unnamed: 0,Review ID,Review,Aspect Term,Review_Preprocessed
1,2494_laptop,The big screen allows you to enjoy watching mo...,screen,"[the, big, screen, allow, you, to, enjoy, watc..."


In [None]:
# This is what the review looks like after being converted into a sequence of word ids
data[1]

array([  1, 307,  79, 541,  14,   5, 211, 627, 674,   3, 675,   4, 399,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0], dtype=int32)

In [None]:
# The target
train_out[1]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
#@title Preparing the Embedding layer
### Preparing the Embedding Layer
print('Preparing embedding matrix.')

# prepare embedding matrix
# Re-key the pretrained vectors by their preprocessed form so that the keys can
# be matched against the preprocessed tokens stored in word_index.
embeddings_index_processed ={}
for k, v in embeddings_index.items():
    k_processed = preprocess(k)
    if len(k_processed) > 0:  # preprocess(k) may return an empty list
        # Only the first token produced by preprocess() is used as the new key.
        k_process = k_processed[0] #[0] since the function preprocess() returns a list
        # NOTE(review): different words can map to the same processed key; the
        # assignment below overwrites, so the vector kept for a key is the one
        # of whichever colliding word is iterated last — confirm this is intended.
        embeddings_index_processed[k_process] = v

# One row per vocabulary entry plus one extra row; the row for id 0 is never
# assigned below and therefore stays all-zeros.
nb_words = len(word_index)
embedding_matrix = np.zeros((nb_words + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index_processed.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
from keras.layers import Embedding
embedding_layer = Embedding(nb_words + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LENGTH,
                            trainable=False)
print("Embedding matrix shape:", embedding_matrix.shape)

Preparing embedding matrix.
Embedding matrix shape: (3819, 300)


In [None]:
# Set up the validation data with the SAME vocabulary that was built from the training reviews.
valid_df1['Review_Preprocessed'] = valid_df1['Review'].apply(preprocess)
reviews_valid = list(valid_df1['Review_Preprocessed'])

# Do NOT re-fit the tokenizer on the validation reviews: fitting again rebuilds
# the word -> id mapping, while embedding_matrix was filled using the ids in
# `word_index`. Tokens are therefore mapped through `word_index` directly.
# Unknown tokens get id 0 (an all-zero embedding row) instead of being dropped,
# so position k of a sequence always corresponds to token k of the review and
# stays aligned with the mask built below.
sequences_valid = [
    [word_index.get(token, 0) for token in review]
    for review in reviews_valid
]
# truncating='post' keeps the first MAX_SEQ_LENGTH tokens of an over-long
# review, matching the positions used for the mask.
valid_data = pad_sequences(sequences_valid, maxlen=MAX_SEQ_LENGTH,
                           padding='post', truncating='post')
valid_out = np.zeros(shape=valid_data.shape)

for item in range(len(valid_df1)):
    r = valid_df1.loc[item, 'Review_Preprocessed']
    indices = np.zeros(MAX_SEQ_LENGTH)  # aspect-term mask: aspect terms are 1, others are 0

    aspect_term = preprocess(valid_df1.loc[item, 'Aspect Term'])
    for term in aspect_term:
        # By right, 'term' should be in 'r', but the data sometimes has errors.
        if term in r:
            position = r.index(term)  # first occurrence only
            # MAX_SEQ_LENGTH comes from the training reviews; a longer
            # validation review could otherwise index past the mask.
            if position < MAX_SEQ_LENGTH:
                indices[position] = 1

    valid_out[item] = indices

valid = (valid_data, valid_out)

In [None]:
valid_df1.iloc[0:1]

Unnamed: 0,Review ID,Review,Aspect Term,Review_Preprocessed
0,2568_laptop,I love the way the entire suite of software wo...,suite of software,"[i, love, the, way, the, entir, suit, of, soft..."


In [None]:
valid_data[0]

array([   7,  103,    1,  140,    1,  475, 2320,   10,  151,   61,  806,
          2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0], dtype=int32)

In [None]:
valid_out[0]

array([0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
#@title Convolutional Model
#### Defining and Training Model
# This cell only builds and compiles the network; fitting happens in the next cell.
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.core import Activation, Flatten, Dense, Dropout
from keras.preprocessing import sequence
# NOTE(review): the star import pulls every optimizer name into the namespace;
# the optimizer actually used below comes from tf.optimizers.
from keras.optimizers import *
from keras.regularizers import l2
print('Training model.')

# Architecture: frozen pretrained embeddings -> three 1-D convolutions (tanh)
# -> flatten -> dense(256, tanh) -> dropout -> one output unit per sequence position.
model = Sequential()
model.add(embedding_layer)
# NOTE(review): input_shape is passed to a layer that is not the first one in
# the model; presumably it has no effect here — confirm.
model.add(Convolution1D(2048, 5, input_shape=(MAX_SEQ_LENGTH-4, 300)))
model.add(Activation("tanh"))
model.add(Convolution1D(1024, 3))
model.add(Activation("tanh"))
model.add(Convolution1D(512, 3))
model.add(Activation("tanh"))
model.add(Flatten())
model.add(Dense(256))
model.add(Activation("tanh"))
model.add(Dropout(0.4))
# Output width equals the padded sequence length (train_out.shape[1]).
# NOTE(review): a relu output is fed into a softmax, and the loss below is
# categorical cross-entropy, while a target row can contain several 1s (or
# none). Presumably a sigmoid output with a binary cross-entropy loss would
# match these multi-label targets better — confirm before changing, since the
# 0.1 decision threshold used at evaluation time depends on this choice.
model.add(Dense(train_out.shape[1], activation='relu'))
model.add(Activation("softmax"))

optimizer = tf.optimizers.Adam(learning_rate=3e-5)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

Training model.


In [None]:
####  Train Model
model.fit(data, train_out,
          validation_split=0.0,
          validation_data=valid,
          batch_size=8,
          epochs=10
         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f582c299910>

In [None]:
#@title Calculate Precision and Recall
####  Calculate Precision Recall
y_pred  = model.predict(valid_data)

# A position is predicted as an aspect-term token when its score exceeds 0.1.
predicted_pos = y_pred > 0.1
processed_output = predicted_pos.astype(int).tolist()  # kept as nested 0/1 lists

# Position-level counts, computed with array operations instead of nested Python loops.
gold_pos = valid_out == 1
gold_neg = valid_out == 0

total_pos = float(gold_pos.sum())
true_pos = float((gold_pos & predicted_pos).sum())
total_neg = float(gold_neg.sum())
true_neg = float((gold_neg & ~predicted_pos).sum())

false_pos = total_neg-true_neg
false_neg = total_pos-true_pos

# Guard the divisions: with no predicted positives (or no gold positives) the
# metric is reported as 0 instead of raising ZeroDivisionError.
precision = true_pos/(true_pos+false_pos) if (true_pos+false_pos) else 0.0
recall = true_pos/total_pos if total_pos else 0.0

f1_score = 2*precision*recall/(precision+recall) if (precision+recall) else 0.0
print(f"Recall: {round(recall*100,2)}%, ", f"Precision: {round(precision*100,2)}%, ", f"F1 Score: {round(f1_score*100,2)}% ")


Recall: 36.25%,  Precision: 40.15%,  F1 Score: 38.1% 


## Approach 3: Pretrained-model BERT

The first step is to prepare the input in the format that suits our model.

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')


train_df2 = train_df.copy()
valid_df2 = valid_df.copy()

# We'll create token embeddings for aspect terms. 
# If a word in the review is an aspect term, the embedding of that word is 1.
# If a word in the review is not an aspect term, the embedding code is 0.
def create_embeddings(df):
    """Add tokenized reviews and per-token aspect labels to ``df``.

    For every row, 'Review' is split with ``word_tokenize`` and the
    comma-separated 'Aspect Term' string is split into terms. Each
    space-separated word of each term that occurs among the review tokens gets
    label 1 at its FIRST occurrence; every other position keeps label 0.

    Two columns are written onto ``df`` itself:
    'Review Tokens' (list of str) and 'Term Encoding' (int64 array of the same
    length as the token list).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two columns added.
    """
    token_lists = []
    label_arrays = []
    for _, row in df.iterrows():
        tokens = word_tokenize(row['Review'])
        labels = np.zeros(len(tokens), dtype=np.int64)

        for aspect in row['Aspect Term'].split(','):
            # Single- and multi-word terms are handled alike: every word of
            # the term is looked up on its own and labelled 1.
            for word in aspect.split(" "):
                if word in tokens:
                    labels[tokens.index(word)] = 1

        token_lists.append(tokens)
        label_arrays.append(labels)

    df['Review Tokens'] = token_lists
    df['Term Encoding'] = label_arrays
    return df

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
train_df2 = create_embeddings(train_df2)
valid_df2 = create_embeddings(valid_df2)

In [None]:
# The results are in column Term Encoding
train_df2.head(3)

Unnamed: 0,Review ID,Review,Aspect Term,Review Tokens,Term Encoding
0,2909_laptop,Have had many higher priced computers crash an...,"memory,speed,priced","[Have, had, many, higher, priced, computers, c...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2494_laptop,The big screen allows you to enjoy watching mo...,screen,"[The, big, screen, allows, you, to, enjoy, wat...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,2578_laptop,Additional caveat: the base installation comes...,"base installation,software","[Additional, caveat, :, the, base, installatio...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ..."


In [None]:
valid_df2.head(3)

Unnamed: 0,Review ID,Review,Aspect Term,Review Tokens,Term Encoding
0,2568_laptop,I love the way the entire suite of software wo...,suite of software,"[I, love, the, way, the, entire, suite, of, so...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]"
1,295_laptop,The speed is incredible and I am more than sat...,speed,"[The, speed, is, incredible, and, I, am, more,...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,2980_laptop,I can barely use any usb devices because they ...,usb devices,"[I, can, barely, use, any, usb, devices, becau...","[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [None]:
# This function will collect data into tensors
from torch.utils.data import Dataset
class dataset_ATM(Dataset):
    """Dataset yielding sub-word tokens, their ids and their aspect tags.

    ``df`` must provide the columns 'Review Tokens' and 'Term Encoding';
    ``tokenizer`` must provide ``tokenize`` and ``convert_tokens_to_ids``.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(bert_tokens, ids_tensor, tags_tensor)`` for row ``idx``.

        Each word is split by the tokenizer, and the word's tag is repeated
        once for every piece it was split into.
        """
        words = self.df.loc[idx, 'Review Tokens']
        labels = self.df.loc[idx, 'Term Encoding']

        bert_tokens = []
        bert_tags = []
        for word, label in zip(words, labels):
            pieces = self.tokenizer.tokenize(word)
            bert_tokens.extend(pieces)
            bert_tags.extend([int(label)] * len(pieces))

        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        return bert_tokens, torch.tensor(bert_ids), torch.tensor(bert_tags)

    def __len__(self):
        """Number of rows in the underlying dataframe."""
        return len(self.df)

In [None]:
#### Install transformer if needed
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import BertTokenizer, BertModel 
pretrain_model_name ="bert-base-uncased"
#We can explore more pretrained models if we would like.
#pretrain_model_name ="xlm-roberta-base"
#pretrain_model_name ="bert-large-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

train_tensor = dataset_ATM(train_df2, tokenizer)
valid_tensor = dataset_ATM(valid_df2, tokenizer)

In [None]:
# Create data loader used to load data into our model
def create_mini_batch(samples):
    """Collate dataset samples into padded batch tensors.

    Parameters
    ----------
    samples : list of tuples
        Items whose element 1 is the ids tensor and element 2 the tags tensor.

    Returns
    -------
    tuple of torch.Tensor
        ``(ids_tensors, tags_tensors, masks_tensors)``. Ids and tags are
        zero-padded to the longest sequence in the batch; the mask (dtype
        long) is 1 wherever the padded id is non-zero and 0 elsewhere.
    """
    id_seqs = []
    tag_seqs = []
    for sample in samples:
        id_seqs.append(sample[1])
        tag_seqs.append(sample[2])

    ids_tensors = pad_sequence(id_seqs, batch_first=True)
    tags_tensors = pad_sequence(tag_seqs, batch_first=True)

    # Attention mask: 1 for real tokens, 0 for padding (id 0)
    masks_tensors = (ids_tensors != 0).long()

    return ids_tensors, tags_tensors, masks_tensors
from torch.utils.data import DataLoader
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_tensor, batch_size=32, collate_fn=create_mini_batch, shuffle = True)
# Evaluation does not benefit from shuffling; a fixed order keeps the collected
# predictions aligned with the rows of valid_df2. The whole validation set is
# still served as one batch, as before.
test_loader = DataLoader(valid_tensor, batch_size=valid_df2.shape[0], collate_fn=create_mini_batch, shuffle = False)

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import time
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
#@title Utility functions
def evl_time(t):
    """Convert a duration ``t`` in seconds into whole (hours, minutes, seconds)."""
    total_minutes, seconds = divmod(t, 60)
    hours, minutes = divmod(total_minutes, 60)
    return int(hours), int(minutes), int(seconds)

def load_model(model, path):
    """Load the weights stored at ``path`` into ``model`` and return ``model``.

    The checkpoint is first mapped to the CPU, so a file saved from a GPU
    session can also be loaded on a machine without CUDA;
    ``load_state_dict`` then copies the values into the model's existing
    parameters. ``strict=False`` means keys that do not match between the
    checkpoint and the model are not treated as errors.
    """
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict, strict=False)
    return model
    
def save_model(model, name):
    """Write the state dict of ``model`` to the file ``name`` with ``torch.save``."""
    weights = model.state_dict()
    torch.save(weights, name)

In [None]:
# BERT Model
class bert_ATE(torch.nn.Module):
    """BERT encoder followed by a linear layer that scores each token for 2 classes."""

    def __init__(self, pretrain_model):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrain_model,return_dict=False)
        # One linear layer maps every hidden state to 2 class scores
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, 2)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, tags_tensors, masks_tensors):
        """Return the loss when tags are given, otherwise the per-token scores.

        With ``tags_tensors=None`` the result is the output of the linear
        layer, with a trailing dimension of size 2. Otherwise scores and tags
        are flattened and their cross-entropy loss is returned.
        """
        hidden_states, _ = self.bert(input_ids=ids_tensors, attention_mask=masks_tensors)
        logits = self.linear(hidden_states)

        if tags_tensors is None:
            return logits

        flat_logits = logits.view(-1, 2)  # 2 is the number of classes
        flat_tags = tags_tensors.view(-1)
        return self.loss_fn(flat_logits, flat_tags)

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_ATE = bert_ATE(pretrain_model_name).to(DEVICE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Train model
def train_model_ATE(loader, epochs, lr):
    """Fine-tune the global ``model_ATE`` with Adam and print progress per batch.

    Parameters
    ----------
    loader : iterable
        Yields ``(ids_tensors, tags_tensors, masks_tensors)`` batches and
        supports ``len()``.
    epochs : int
        Number of passes over ``loader``.
    lr : float
        Learning rate passed to ``torch.optim.Adam``.

    After every batch the running mean loss of the current epoch is printed
    together with an estimate of the remaining time, based on the mean batch
    duration measured so far in that epoch.
    """
    all_data = len(loader)

    optimizer_ATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)
    # Switch to training mode explicitly, in case the model was left in
    # evaluation mode by an earlier cell.
    model_ATE.train()
    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []

        for data in loader:
            t0 = time.time()
            ids_tensors, tags_tensors, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            # With tags supplied, the model returns the loss directly.
            loss = model_ATE(ids_tensors=ids_tensors, tags_tensors=tags_tensors, masks_tensors=masks_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizer_ATE.step()
            optimizer_ATE.zero_grad()

            finish_data += 1
            current_times.append(round(time.time()-t0,3))
            current = np.mean(current_times)
            # Remaining batches of this epoch plus all batches of the epochs still to come.
            # Local names avoid shadowing the builtin `min`.
            hr, minutes, sec = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
            print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", minutes," sec:", sec)

In [None]:
%time train_model_ATE(train_loader, epochs=3, lr=3e-5)

epoch: 0  batch: 1 / 88  loss: 0.538198709487915  hr: 0  min: 1  sec: 26
epoch: 0  batch: 2 / 88  loss: 0.4057324230670929  hr: 0  min: 1  sec: 15
epoch: 0  batch: 3 / 88  loss: 0.3300570597251256  hr: 0  min: 1  sec: 24
epoch: 0  batch: 4 / 88  loss: 0.32493847236037254  hr: 0  min: 1  sec: 17
epoch: 0  batch: 5 / 88  loss: 0.28296603858470915  hr: 0  min: 1  sec: 21
epoch: 0  batch: 6 / 88  loss: 0.27119701852401096  hr: 0  min: 1  sec: 19
epoch: 0  batch: 7 / 88  loss: 0.26740674461637226  hr: 0  min: 1  sec: 16
epoch: 0  batch: 8 / 88  loss: 0.26241897232830524  hr: 0  min: 1  sec: 13
epoch: 0  batch: 9 / 88  loss: 0.24742806123362648  hr: 0  min: 1  sec: 17
epoch: 0  batch: 10 / 88  loss: 0.24576722532510759  hr: 0  min: 1  sec: 13
epoch: 0  batch: 11 / 88  loss: 0.23410021987828342  hr: 0  min: 1  sec: 15
epoch: 0  batch: 12 / 88  loss: 0.22400179939965406  hr: 0  min: 1  sec: 15
epoch: 0  batch: 13 / 88  loss: 0.22298545161118874  hr: 0  min: 1  sec: 13
epoch: 0  batch: 14 / 88 

In [None]:
def test_model_ATE(loader):
    """Run the global ``model_ATE`` over ``loader`` and collect flat label lists.

    Parameters
    ----------
    loader : iterable
        Yields ``(ids_tensors, tags_tensors, masks_tensors)`` batches.

    Returns
    -------
    tuple of (list of int, list of int)
        ``(trueth, pred)``: the true tags and the predicted classes (argmax
        over the last dimension of the model output), flattened over all
        batches and positions. Padded positions are included; their true tag
        is the padding value 0.
    """
    pred = []
    trueth = []
    # Evaluate in eval mode so that dropout is disabled, then restore whatever
    # mode the model was in before the call.
    was_training = model_ATE.training
    model_ATE.eval()
    try:
        with torch.no_grad():
            for data in loader:
                ids_tensors, tags_tensors, masks_tensors = data
                ids_tensors = ids_tensors.to(DEVICE)
                tags_tensors = tags_tensors.to(DEVICE)
                masks_tensors = masks_tensors.to(DEVICE)

                # Without tags, the model returns the per-token class scores.
                outputs = model_ATE(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)
                _, predictions = torch.max(outputs, dim=2)

                pred += predictions.reshape(-1).tolist()
                trueth += tags_tensors.reshape(-1).tolist()
    finally:
        model_ATE.train(was_training)
    return trueth, pred

x, y = test_model_ATE(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(2)]))


              precision    recall  f1-score   support

           0       0.99      0.99      0.99     61122
           1       0.85      0.84      0.84      2148

    accuracy                           0.99     63270
   macro avg       0.92      0.92      0.92     63270
weighted avg       0.99      0.99      0.99     63270



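The results achieved with BERT are significantly better than those of the two previous approaches. For Class 1 (aspect-term tokens), the recall, precision and F1-score are 84%, 85% and 84%, respectively. Note that these figures are computed per (sub-word) token, whereas Approach 1 was scored per aspect term, so the numbers are not strictly comparable.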