In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib as plt
import nltk
import xml.etree.ElementTree as et
import re

# Data Cleaning, XML To DataFrame Conversion

The training XML file has missing values for 'category' and 'polarity' fields as the Opinion element doesn't exists. This is handled by explicity adding Opinion Element. It it populated with NO_ASPECT as Entity and Atrribute, and 'noaspect' as polarity.

In [2]:
# DATA PREPROCESSING

def preprocess_xml(xml_to_process, output_xml):
    # Read and format XML for missing values

    tree = et.parse(xml_to_process)
    root = tree.getroot()

    # Normalize the XML to have empty opinions where it doesn't exist
    for sentence in root.iter('sentence'):
        # Check if Opinions element exists, if not, add it
        if not sentence.findall('Opinions'):
            new_opinion = et.SubElement(sentence,'Opinions')
            opinion_cat = et.Element('Opinion')
            new_opinion.append(opinion_cat)
            opinion_cat.set('category', 'NO_ASPECT#NO_ASPECT')
            opinion_cat.set('polarity', 'noaspect')

    tree.write(output_xml, encoding="utf-8")

In [3]:
# Read XML into a dataframe, save it in a csv

def parse_xml_to_df(xml):
    # Read xml and get the root
    tree = et.parse(xml)
    root = tree.getroot()
    # create an empty list to store the data
    data = []
    # iterate over the elements in the XML file
    for review in root:
        # get review id
        review_id = review.attrib
        # create an empty dictionary to store the element's data
        elem_dict = {}
        # iterate over the sentences and add the sentence and opinion to the dictionary
        for sentences in review[0]:
            # get the review id, sentence id
            elem_dict.update(review_id)
            elem_dict['id'] = (sentences.attrib['id'])
            # get the sentence text
            elem_dict['sent_text'] = sentences[0].text
            # Add Entity#Attribute pair along with polarity
            for opinion in sentences[1]:
                row_dict = {}
                # Add review id, sentence id, and the sentence to the row
                row_dict.update(elem_dict)
                # Read and add the entity-attribute pair, polarity to the row
                entityAttrib = opinion.attrib
                row_dict.update(entityAttrib)
                # We create unique row for each Entity#Attribute pair of a sentence
                data.append(row_dict)

    # create a pandas DataFrame from the data
    df = pd.DataFrame(data)
    # Reordering columns of dataframe for better readability
    df = df[['rid', 'id', 'sent_text', 'category', 'polarity']]
    
    return df

# Structuring Dataframe for Training

The function get_training_df() takes in the dataframe with parsed xml data as input.
Based on the flags set, the function will return a dataframe with one-hot encoding of different classes that can be used for training the classifiers.

To train the entity-attribute classifier, the dataframe should have sent_text and all unique entity-attribute class pairs one-hot encoded. The columns are - sent_text, NO_ASPECT#NO_ASPECT, LAPTOP#GENERAL,BATTERY#OPERATION_PERFORMANCE,... so on.

To train the polarity classifier, dataframe with sent_text, one-hot encoded entity-attribute pairs, polarity is required

In [4]:
# Format the dataframe and one-hot encode the classes

# Create one-hot encoded dataframe with columns - sentence text, Entity#Attribute one-hot-encoded
# ENTITY#ATTRIBUTE ONE-HOT ENCODED
def get_training_df(df, classes, ea_flag = True, polarity_flag = False):
    entity_labels = ['LAPTOP', 'DISPLAY', 'KEYBOARD', 'MOUSE', 'MOTHERBOARD', 'CPU', 
                     'FANS_COOLING', 'PORTS', 'MEMORY', 'POWER_SUPPLY', 'OPTICAL_DRIVES', 
                     'BATTERY', 'GRAPHICS', 'HARD_DISK', 'MULTIMEDIA_DEVICES', 'HARDWARE', 
                     'SOFTWARE', 'OS', 'WARRANTY', 'SHIPPING', 'SUPPORT', 'COMPANY']
    attribute_labels = ['GENERAL', 'PRICE', 'QUALITY', 'DESIGN_FEATURES', 
                        'OPERATION_PERFORMANCE', 'USABILITY', 'PORTABILITY', 
                        'CONNECTIVITY', 'MISCELLANEOUS']

    if(ea_flag == True):
        # concatenate each possible combination of entity and attribute labels
        entity_attribute_combinations = classes
        #entity_attribute_combinations = [f'{entity}#{attribute}' for entity in entity_labels for attribute in attribute_labels]
        np.append(entity_attribute_combinations,['NO_ASPECT#NO_ASPECT'])

        # create a dictionary to map each entity-attribute combination to an index in the one-hot encoded vectors
        aspect_map = {aspect: i for i, aspect in enumerate(entity_attribute_combinations)}

        # load the data into a pandas DataFrame
        ea_df = df[['sent_text','category','polarity']]

        # add a new column for each possible combination of entity and attribute
        for aspect_str in entity_attribute_combinations:
            ea_df[aspect_str] = ea_df['category'].apply(lambda x: 1 if aspect_str==x else 0)

        ea_df.drop('category', axis=1, inplace=True)
        if(polarity_flag == True):
            training_df = ea_df
        else:
            ea_df.drop('polarity', axis=1, inplace=True)
            # We group the rows, so that for each sentence we have only one row
            # with all E#A marked in it.
            training_df = ea_df.groupby(ea_df['sent_text'], as_index=False).max().reindex(columns=ea_df.columns)
        
    else:
        # ENTITY ONE-HOT ENCODED
        # Used when entity and attributes are predicted seperately
        entity_df = df[['sent_text', 'category']]
        entity_df['category'] = [re.sub(r'(\#)\w*','', str(ea_pair)) for ea_pair in entity_df['category']]

        # add a new column for each entity
        for entity in entity_labels:
            entity_df[entity] = entity_df['category'].apply(lambda x: 1 if entity==x else 0)

        entity_df.drop('category', axis=1, inplace=True)
        training_df = entity_df
        training_df=entity_df.groupby(entity_df['sent_text'], as_index=False).max().reindex(columns=entity_df.columns)
    
    return training_df 

In [5]:
# Preprocessing the textual data
# Preprocessing the sentences
# Data type to string, store in lower case, remove symbols, non-words, tokenize

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import spacy

def preprocess_text(sent_list) :
    
    # Convert all text to lower case
    sent_list = [sent.lower() for sent in sent_list]
    
    # Remove Punctuation and non alpha-numeric characters
    sent_list = [re.sub(r'[^\w\s]','', sent) for sent in sent_list]
    
    # Tokenization
    sent_list = [nltk.word_tokenize(sent) for sent in sent_list]
    
    # Stop Word Removal
    # Load the stop words
    stop_words = set(stopwords.words('english'))
    for sent in sent_list:
        for word in sent:
            if word in stop_words:
                sent.remove(word)
        
    # TO-DO: SpellCheck
    
    # Stemming/Lemmatization
    load_model = spacy.load('en_core_web_sm', disable = ['parser','ner'])
    for sent in sent_list :
        sent = [word.lemma_ for word in load_model(str(sent))]
    return sent_list

In [6]:
# TRAINING DATASET

# Clean the training XML, and save the output
preprocess_xml('Laptops_Train_p1.xml', 'training_p1_output.xml')

# Parse the cleaned XML to create a dataframe with
# columns - rid, id, sent_text, category, polarity
xml_df = parse_xml_to_df('training_p1_output.xml')

# Unique Entity-Attribute pairs to use for class encoding
class_labels = xml_df['category'].unique()

# Get training dataframe
training_df = get_training_df(xml_df, class_labels, True)
training_polarity_df = get_training_df(xml_df, class_labels, True, True)

#User defined functions for text preprocessing
#training_df.sent_text = preprocess_text(training_df.sent_text)

# Save the dataframe with preprocessed text and one-hot encoding for E#A pairs, so you can directly load this and use it
training_df.to_csv('TrainingData_ABSA_P1.csv')


#TESTING DATASET

# Perform similar operations on the testing gold dataset
# Clean the training XML, and save the output
preprocess_xml('Laptops_Test_p1_gold.xml', 'testing_p1_output.xml')

# Parse the cleaned XML to create a dataframe with
# columns - rid, id, sent_text, category, polarity
xml_test_df = parse_xml_to_df('testing_p1_output.xml')

# Get training dataframe
testing_df = get_training_df(xml_test_df, class_labels, True)

testing_polarity_df = get_training_df(xml_test_df, class_labels, True, True)

# Save the dataframe with preprocessed text and one-hot encoding for E#A pairs, so you can directly load this and use it
testing_df.to_csv('TestingData_ABSA_P1.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Sentiment Prediction Approach

APPROACH 1  (Used)

Step 1: Use vectorized sentence text to predict the combined Entity#Attribute Classes

Step 2: Use vectorized sentence text and Entity#Attribute Classes to predict polarity of each Entity#Attribute pair.

APPROACH 2  (Code added at the end)

Step 1: Use vectorized sentence text to predict the Entity Classes

Step 2: Use vectorized sentence text and Entity Classes to predict the Attribute Classes

Step 3: Use vectorized sentence text, Entity Classes, Attribute Classes to predict polarity of each Aspect.


Both the Approaches were tried. For Approach 1, neural network performed quite well, but simpler multilabel classifiers had a very poor performance.

For Approach 2, Entity prediction using multilabel SVM gave really good results (0.7-0.8 accuracy). However, Attribute Prediction accuracy maxed at around 0.5-0.54 when tried with multiclass variants of Neural Network, SVM, Gaussian Naive-Bayes, Logistic Regression.

Polarity prediction in both the approaches gave high accuracy when provided with correct aspects for the sentences. A simple multiclass SVM is implemented for that.

# Numerical Vectorization of Textual Data

Different Approaches were used to vectorize the text and convert to some numerical representation.

scikit-learn library has built in classes like the Count Vectorizer (Bag-of-words), Tf-IDF Vectorizer. The performance with Count Vectorizer was much better and has been used in all the classifiers.

A word2vec model was also trained and feature vectors were extracted from that but the classifier performance was poor compared to the classifier trained with Count Vectorizer features. (Sample Code for word2vec added at the end)

In [7]:
# TRAINING DATA for ENTITY-ATTRIBUTE PREDICTION
# Vectorising and Splitting Training Data

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Split the training data into train-test sets: x_train, y_train, x_test, y_test
x_train_ea, x_test_ea, y_train_ea, y_test_ea = train_test_split(training_df['sent_text'],
                                                    training_df.drop('sent_text', axis=1), 
                                                    test_size=0.1)

# FEATURE VECTOR
# Load CountVectorizer
count_vect = CountVectorizer(max_df=1.0, stop_words='english', max_features=1000)

# Vectorize the textual data
x_train_ea = count_vect.fit_transform(x_train_ea)

# x_test to be used for validation, and so we don't use fit_transform it
x_test_ea = count_vect.transform(x_test_ea)

y_train_arr = (y_train_ea.iloc[:,1:]).to_numpy()


# GOLD TEST DATASET
# We use the same count vectorizer object to transform the gold test data
# so that we have similar sized feature matrix of X

# Read the training data into X and Y
x_test_predict = count_vect.transform(testing_df.sent_text)
y_test_predict = testing_df.drop('sent_text', axis=1)

y_ground_truth = (y_test_predict.iloc[:,1:]).to_numpy()

# The feature vector sizes are different for training and test(validation) set
# We apply padding to have same sized input feature vectors for both training and validating
if x_test_predict.shape[1] < x_train_ea.shape[1]:
    # Pad the test input features with zeros to match the number of columns in the training input features
    padding = np.zeros((x_test_predict.shape[0], x_train_ea.shape[1] - x_test_predict.shape[1]))
    x_test_predict = np.concatenate((x_test_predict, padding), axis=1)
elif x_test_predict.shape[1] > x_train_ea.shape[1]:
    # Truncate the test input features to match the number of columns in the training input features
    x_test_predict = x_test_predict[:, :x_train_ea.shape[1]]

# ENTITY-ATTRIBUTE Classifier

A Neural Network model has been used as multilabel classifier to predict all possible classes for an input sentence/review.

Other models like multilabel SVM, RandomForest, Multi-layer Perceptron were also tried but the accuracy on joint entity-attribute prediction was very low (20-40%). However, SVM performed quite well when used to predict just the entity (accuracy between 70-80% achieved)

The Sequential Neural Network model is selected due to its small training time, good accuracy on training data, and similarly high accuracy on gold test dataset.

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import regularizers

# create a neural network model
entity_classifier = Sequential()
entity_classifier.add(Dense(64, activation='relu', input_shape=(x_train_ea.shape[1],), kernel_regularizer=regularizers.l2(0.01)))
#entity_classifier.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
entity_classifier.add(Dense(81, activation='sigmoid'))

# compile the model
entity_classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# fit the model to the data
entity_classifier.fit(x_train_ea, y_train_arr, epochs=10, batch_size=32)

Train on 2243 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x2132e9de390>

## EVALUATION for ENTITY-ATTRIBUTE Classifier

While the high accuracy on validation set might indicate overfitting, it is suprising to see that around about same accuracy is achieved on the Gold Test Dataset. 

In [9]:
# Evaluate accuracy against the validation set
y_test_arr = (y_test_ea.iloc[:,1:]).to_numpy()
score_val = entity_classifier.evaluate(x_test_ea, y_test_arr, verbose=0)
print('Validation Set\nTest loss:', score_val[0])
print('Test accuracy:', score_val[1])

# Evaluate accuracy on the gold testing dataset

predicted_labels = entity_classifier.predict(x_test_predict)
score_gold_test = entity_classifier.evaluate(x_test_predict, y_ground_truth, verbose=0)

print('Testing Gold Dataset\nTest loss:', score_gold_test[0])
print('Test accuracy:', score_gold_test[1])

Validation Set
Test loss: 0.056145722836256025
Test accuracy: 0.9860247
Testing Gold Dataset
Test loss: 0.06699925952998964
Test accuracy: 0.983065


# Polarity Prediction

This is a multiclass classification problem. With only 4 classes to choose from, a simple model like SVC gives high accuracy and is fast too.

In [10]:
# POLARITY PREDICTION

# TRAINING DATA

polarity_pred_df = training_polarity_df
#X = pd.concat([entity_df.reset_index(drop=True), attribute_encoded.reset_index(drop=True)], axis=1)

polarity_labels = polarity_pred_df['polarity'].unique()
polarity_labels_dict = {polarity_labels[i]: i for i in range(len(polarity_labels))}

train_pol_vals = polarity_pred_df['polarity'].values
Y_polarity_pred = np.array([polarity_labels_dict[label] for label in train_pol_vals])

X_polarity_pred = polarity_pred_df.drop('polarity', axis=1)
# Split the training data into train-test sets: x_train, y_train, x_test, y_test
x_train_pol, x_test_pol, y_train_pol, y_test_pol = train_test_split(X_polarity_pred, Y_polarity_pred, test_size=0.1)

# FEATURE VECTOR
# Load CountVectorizer
count_vectorizer = CountVectorizer(max_df=1.0,stop_words='english', max_features=1200)

# Read the training data into X and Y
x_feat = count_vectorizer.fit_transform(x_train_pol.sent_text)
x_train_pol = x_train_pol.drop('sent_text', axis=1)
x_train_pol = pd.concat([x_train_pol.reset_index(drop=True), 
                         pd.DataFrame(x_feat.todense()).reset_index(drop=True)], axis=1)
x_train_pol = x_train_pol.reindex(columns=x_train_pol.columns)

# x_test to be used for validation, and so we don't use fit_transform it
x_test_feat = count_vectorizer.transform(x_test_pol.sent_text)
x_test_pol = x_test_pol.drop('sent_text', axis=1)
x_test_pol = pd.concat([x_test_pol.reset_index(drop=True),
                    pd.DataFrame(x_test_feat.todense()).reset_index(drop=True)], axis=1)
x_test_pol = x_test_pol.reindex(columns=x_test_pol.columns)

print(x_train_pol.shape[1])
print(x_test_pol.shape[1])
if x_test_pol.shape[1] < x_train_pol.shape[1]:
    # Pad the test input features with zeros to match the number of columns in the training input features
    padding = np.zeros((x_test_pol.shape[0], x_train_pol.shape[1] - x_test_pol.shape[1]))
    x_test_pol = np.concatenate((x_test_pol, padding), axis=1)
elif x_test_pol.shape[1] > x_train_pol.shape[1]:
    # Truncate the test input features to match the number of columns in the training input features
    x_test_pol = x_test_pol[:, :x_train_pol.shape[1]]
    
# GOLD TEST DATASET

x_pol_predict = testing_polarity_df.drop('polarity', axis=1)
x_features = count_vectorizer.transform(x_pol_predict.sent_text)
x_pol_predict = x_pol_predict.drop('sent_text', axis=1)
x_pol_predict = pd.concat([x_pol_predict.reset_index(drop=True), 
                         pd.DataFrame(x_features.todense()).reset_index(drop=True)], axis=1)
x_pol_predict = x_pol_predict.reindex(columns=x_pol_predict.columns)

train_pol_vals = testing_polarity_df['polarity'].values
y_pol_predict = np.array([polarity_labels_dict[label] for label in train_pol_vals])

if x_pol_predict.shape[1] < x_train_pol.shape[1]:
    # Pad the test input features with zeros to match the number of columns in the training input features
    padding = np.zeros((x_pol_predict.shape[0], x_train_pol.shape[1] - x_pol_predict.shape[1]))
    x_pol_predict = np.concatenate((x_pol_predict, padding), axis=1)
elif x_pol_predict.shape[1] > x_train_pol.shape[1]:
    # Truncate the test input features to match the number of columns in the training input features
    x_pol_predict = x_pol_predict[:, :x_train_pol.shape[1]]


1282
1282


In [11]:
# POLARITY CLASSIFIER

from sklearn.svm import SVC
import pickle

# create a multiclass SVM model
clf_polarity = SVC(kernel='linear', C=1, decision_function_shape='ovr')

# fit the model to the training data
clf_polarity.fit(x_train_pol, y_train_pol)

# save the model to disk
filename = 'polarityPredictionSVC.sav'
pickle.dump(clf_polarity, open(filename, 'wb'))

In [12]:
# Evaluate performance of Polarity Classifier
from sklearn.metrics import accuracy_score, f1_score   

# Evaluate the performance of the model on validation set

y_predictions = clf_polarity.predict(x_test_pol)
acc_clf = accuracy_score(y_test_pol, y_predictions)
f1_clf = f1_score(y_test_pol, y_predictions, average='micro')
print('On validation set\nAccuracy:', acc_clf)
print('F1 score:', f1_clf)

# Evaluate against Gold Test Dataset

y_predict_gold = clf_polarity.predict(x_pol_predict)
acc_clf_gold = accuracy_score(y_pol_predict, y_predict_gold)
f1_clf_gold = f1_score(y_pol_predict, y_predict_gold, average='micro')
print('On Gold Test Dataset\nAccuracy:', acc_clf_gold)
print('F1 score:', f1_clf_gold)

On validation set
Accuracy: 0.8338278931750742
F1 score: 0.8338278931750742
On Gold Test Dataset
Accuracy: 0.6803995006242197
F1 score: 0.6803995006242197


# Other models trained for ABSA

### Word2Vec

import gensim
from gensim.models import Word2Vec

#Load the preprocessed data (lowercased, stopwords removed, non-words removed, tokenized)
#training_df['sent_text'] preprocessed in user defined functions

#Train and save the word2vec model for laptop reviews
model = Word2Vec(training_df.sent_text, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")

#Create feature vectors by summing up on word embeddings for each sentence

model = Word2Vec.load("word2vec.model")

feature_vectors = []
for sentence in grouped_df.sent_text:
    embeddings = []
    for word in sentence:
        if word in model.wv.vocab:
            embeddings.append(model.wv[word])
    if len(embeddings) > 0:
        feature_vectors.append(np.mean(embeddings, axis=0))
    else:
        feature_vectors.append(np.zeros(model.vector_size))

### Multilabel Entity-Attribute Prediction

### Multilabel SVM for Entity Prediction

#Sample Code of SVM Classifier implemented in Approach 2
#PREDICTING ENTITY CLASS USING SUPPORT VECTOR MACHINES for MULTILABEL CLASSIFICATION

from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

#Create the SVM classifier with OvR strategy
svm_clf_entity = OneVsRestClassifier(SVC())

#Define the hyperparameter grid to search over
param_grid = {'estimator__C': [0.1, 1, 10],'estimator__kernel': ['linear', 'rbf']}

#Use grid search to find the optimal hyperparameters
svm_grid = GridSearchCV(svm_clf_entity, param_grid, cv=3)
svm_grid.fit(x_train, y_array)

#MAKE PREDICTIONS 

#Make predictions on the test set
y_pred = svm_grid.predict(x_test)

y_ground_truth = np.array((y_test.iloc[:,1:]).to_numpy())

#Evaluate the performance of the model
acc = accuracy_score(y_ground_truth, y_pred)
f1 = f1_score(y_ground_truth, y_pred, average='micro')
print('Accuracy:', acc)
print('F1 score:', f1)

### Multiclass Sequential NN Classifier for ATTRIBUTE Prediction

### Attribute Prediction