In [34]:
import csv
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
import re

## Initial functions and simple setup

In [35]:
# a function to load data from a file and append it to the rawData
def loadData(path, Text=None,):
    """Read the tab-separated reviews file and append its rows to `rawData`.

    Every data row is converted with `parseReview` and appended to the
    module-level `rawData` list as an (Id, Text, Label) tuple. The header
    row (first column equal to "DOC_ID") and blank rows are skipped.

    Args:
        path: Location of the tab-delimited reviews file.
        Text: Unused; kept only so that existing calls keep working.

    Returns:
        None. The result is the rows appended to the global `rawData`.
    """
    # newline='' leaves line-ending handling to the csv module, which is
    # what the csv documentation asks for when a file is given to csv.reader
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')

        for line in reader:
            # csv.reader yields [] for a blank line; indexing it would raise
            if not line:
                continue
            # skip the header
            if line[0] == "DOC_ID":
                continue
            # create a tuple with data information
            (reviewId, reviewText, reviewLabel) = parseReview(line)
            rawData.append((reviewId, reviewText, reviewLabel))

In [36]:
# a function to split the data between trainData and testData
def splitData(percentage, updated=False, ngram=1):
    """Fill `trainData` and `testData` from the global `rawData`.

    `rawData` is treated as two halves. The first `percentage` share of each
    half is converted to (featureVector, label) pairs and appended to
    `trainData`; the remainder of each half is appended to `testData`.

    Args:
        percentage: Fraction of the data to use for training (e.g. 0.8).
        updated: False uses `toFeatureVector(preProcess(text))`; True uses
            `toFeatureVector_updated(preProcess_updated(text, ngram))`.
            Any other value leaves both lists untouched.
        ngram: Forwarded to `preProcess_updated` when `updated` is True.

    Returns:
        None. The global lists are appended to in place.
    """
    total = len(rawData)
    midpoint = int(len(rawData)/2)
    perHalf = int((percentage*total)/2)

    # choose the text -> feature-vector conversion for the requested mode
    if updated == False:
        def featurise(text):
            return toFeatureVector(preProcess(text))
    elif updated == True:
        def featurise(text):
            return toFeatureVector_updated(preProcess_updated(text, ngram))
    else:
        # neither mode matched, so nothing is appended
        return

    trainRows = rawData[:perHalf] + rawData[midpoint:midpoint+perHalf]
    testRows = rawData[perHalf:midpoint] + rawData[midpoint+perHalf:]

    for (_, text, label) in trainRows:
        trainData.append((featurise(text), label))
    for (_, text, label) in testRows:
        testData.append((featurise(text), label))

In [37]:
# a function to convert a line from input file into id/text/label tuple
def parseReview(reviewLine):
    """Turn one row of the reviews file into an (id, text, label) tuple.

    Args:
        reviewLine: Sequence of column values for one review. Column 0 is
            the document id, column 1 the raw label and column 8 the
            review text.

    Returns:
        (id, text, label) where id is an int and label is 'fake' when the
        raw label equals '__label1__' and 'real' otherwise.
    """
    # the id column is stored as text, so convert it to an integer
    docId = int(reviewLine[0])
    reviewText = reviewLine[8]
    rawLabel = reviewLine[1]

    # map the raw label to either fake or real
    if rawLabel == '__label1__':
        label = 'fake'
    else:
        label = 'real'

    return (docId, reviewText, label)

In [38]:
# turn review text into a list of tokens
# includes some preprocessing methods
def preProcess(text):
    """Split review text into lower-cased tokens.

    Punctuation that touches a word character is separated from it with a
    space, the text is split on runs of whitespace and every token is
    lower-cased. Empty strings that the split produces for leading or
    trailing whitespace (or for empty input) are dropped, so '' is never
    returned as a token.

    Args:
        text: The review text as a string.

    Returns:
        A list of lower-cased token strings; [] for empty or
        whitespace-only text.
    """
    # adds space before punctuation signs
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    # adds space after punctuation signs
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    # split on runs of whitespace; re.split yields '' at either end when the
    # text starts or ends with whitespace, so those entries are filtered out
    return [t.lower() for t in re.split(r"\s+", text) if t]

In [39]:
# a function to convert tokens to feature vectors
# returns a dictionary containing features as keys
# and counts as values
def toFeatureVector(tokens):
    """Count how often each token occurs.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to the number of times it
        appears in `tokens`, in order of first appearance.
    """
    counts = {}
    for token in tokens:
        # start from 0 the first time a token is seen, then add one
        counts[token] = counts.get(token, 0) + 1
    return counts

In [40]:
# a function for training and validating the classifier
def trainClassifier(trainData):
    """Train a linear SVM on the given labelled feature vectors.

    Args:
        trainData: The training samples handed to
            `SklearnClassifier.train` (in this notebook, a list of
            (featureVector, label) pairs).

    Returns:
        The value returned by `SklearnClassifier(...).train(trainData)`.
    """
    print("Training Classifier...")
    # a single-step pipeline wrapping the linear SVM
    svmPipeline = Pipeline([('svc', LinearSVC())])
    wrapped = SklearnClassifier(svmPipeline)
    return wrapped.train(trainData)

In [41]:
# a function to cross validate
# takes the dataset as input and a value of folds to be used
# it returns 4 metric averages over all folds
def crossValidate(dataset, folds = 10):
    """Run k-fold cross-validation of the classifier on `dataset`.

    The dataset is shuffled, cut into `folds` contiguous segments, and for
    each segment a classifier is trained on the other segments (with
    `trainClassifier`) and scored on the held-out one (with
    `predictLabels`).

    Args:
        dataset: List of (featureVector, label) pairs. NOTE: the list is
            shuffled in place, so the caller's ordering changes.
        folds: Number of segments to split the dataset into (default 10).

    Returns:
        A dict with the keys 'precision', 'recall', 'f1_score' and
        'accuracy'. Each value is the mean over all folds; precision,
        recall and F1 use weighted averaging across labels.

    Raises:
        ValueError: If `folds` is smaller than 2 or larger than the number
            of samples in `dataset`.
    """
    sampleCount = len(dataset)
    if folds < 2:
        raise ValueError("folds must be at least 2")
    if sampleCount < folds:
        raise ValueError("dataset has fewer samples than folds")

    # shuffle the dataset (in place)
    shuffle(dataset)

    # segment boundaries: exactly `folds` segments whose sizes differ by at
    # most one, even when the dataset size is not a multiple of `folds`
    bounds = [(k * sampleCount) // folds for k in range(folds + 1)]

    # per-fold scores
    precisions = []
    recalls = []
    f1_scores = []
    accuracies = []

    for start, end in zip(bounds, bounds[1:]):
        # train on everything outside [start, end) of the dataset argument
        training_data = dataset[:start] + dataset[end:]
        # validate on the held-out segment
        validation_data = dataset[start:end]

        # train the model using that training data
        model = trainClassifier(training_data)
        # obtain validation labels from the validation data
        validation_labels = [sample[1] for sample in validation_data]
        # predict the validation data using trained model
        predictions = predictLabels(validation_data, model)

        # record precision, recall, f1 score and accuracy for this fold
        precisions.append(precision_score(validation_labels, predictions, average='weighted'))
        recalls.append(recall_score(validation_labels, predictions, average='weighted'))
        f1_scores.append(f1_score(validation_labels, predictions, average='weighted'))
        accuracies.append(accuracy_score(validation_labels, predictions))

    # average each score over the folds
    results = {}
    results['precision'] = sum(precisions)/len(precisions)
    results['recall'] = sum(recalls)/len(recalls)
    results['f1_score'] = sum(f1_scores)/len(f1_scores)
    results['accuracy'] = sum(accuracies)/len(accuracies)
    return results

In [42]:
# a function to predict labels
def predictLabels(reviewSamples, classifier):
    """Predict a label for every sample in a batch.

    Args:
        reviewSamples: Iterable of samples whose first element is the
            feature vector (e.g. (featureVector, label) pairs).
        classifier: Object providing `classify_many(featuresets)`.

    Returns:
        Whatever `classifier.classify_many` returns for the feature vectors.
    """
    # lazily pull the feature vector (element 0) out of each sample
    featureVectors = (sample[0] for sample in reviewSamples)
    return classifier.classify_many(featureVectors)

# a function to predict a single label
def predictLabel(reviewSample, classifier):
    """Predict the label of a single raw review text.

    The text is tokenised with `preProcess`, turned into a count-based
    feature vector with `toFeatureVector` and passed to the classifier.

    Args:
        reviewSample: The review text.
        classifier: Object providing `classify(featureset)`.

    Returns:
        Whatever `classifier.classify` returns for the feature vector.
    """
    classify = classifier.classify
    tokens = preProcess(reviewSample)
    return classify(toFeatureVector(tokens))

The functions below consider additional pre-processing methods aimed at improving the effectiveness of the classifier.

In [43]:
# NLTK supplies the stemmer and lemmatiser used by preProcess_updated
import nltk
# stop-word removal is currently disabled; the three lines below would
# build the stop_words set if it were switched back on
# from nltk.corpus import stopwords
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))
# Porter stemmer used for stemming tokens
stemmer = nltk.stem.PorterStemmer()
# WordNet lemmatiser used for lemmatising tokens
lemmatiser = nltk.stem.WordNetLemmatizer()
# download the WordNet data for the lemmatiser
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Main procedure



### Simple setup

In [44]:
# Baseline run: load the reviews, split them, cross-validate on the training part
# initialise global lists that will be appended to by the methods below
# (loadData fills rawData; splitData fills trainData and testData)
rawData = []
trainData = []
testData = []

# input file location
reviewPath = './amazon_reviews.txt'

## Run the functions
print(f"Preparing the dataset")

# load the data into rawData
loadData(reviewPath) 

# split data into 80% training and 20% test
# (default arguments: base preProcess / toFeatureVector functions)
splitData(0.8)

print(f"Number of reviews in raw data: {len(rawData)}")
print(f"Number of reviews in training data: {len(trainData)}")
print(f"Number of reviews in test data: {len(testData)}")

# cross validate on training data
# note: crossValidate shuffles the list it is given, so trainData is reordered
cv_results = crossValidate(trainData)
# print the averaged scores over all folds
print("Results of cross validation:")
print("Precision: %f" % cv_results['precision'])
print("Recall: %f" % cv_results['recall'])
print("F1 score: %f" % cv_results['f1_score'])
print("Accuracy: %f" % cv_results['accuracy'])

Preparing the dataset
Number of reviews in raw data: 21000
Number of reviews in training data: 16800
Number of reviews in test data: 4200
Training Classifier...




Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results of cross validation:
Precision: 0.614237
Recall: 0.614048
F1 score: 0.613918
Accuracy: 0.614048


#### Evaluate using the test set
We can now train the classifier on 80% of the data and evaluate its performance using the 20% of the data we left out earlier.

In [45]:
# train the classifier on the full training set (base features)
classifier = trainClassifier(trainData)
# get the ground-truth labels (second element of each sample) from the test data
testTrue = [t[1] for t in testData]
# classify the test data to get predicted labels
testPred = predictLabels(testData, classifier) 
# evaluate: weighted precision/recall/F-score plus plain accuracy
finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted')
accuracy = accuracy_score(testTrue, testPred)
print(f"Done training!")
# only the first three entries (precision, recall, F-score) are printed
print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])
print(f"Accuracy: {accuracy}")

Training Classifier...




Done training!
Precision: 0.603594
Recall: 0.603571
F Score:0.603550
Accuracy: 0.6035714285714285


### Additional pre-processing and token weights

Amended relevant functions below:

In [46]:
# turn review text into a list of tokens
# includes additional preprocessing methods:
# apostrophe removal, stopword removal, punctuation removal,
# stemming, lemmatising, n-grams
def preProcess_updated(text, ngram=1):
    """Tokenise review text, then stem, lemmatise and optionally build n-grams.

    Steps applied, in order:
      1. separate punctuation from adjacent word characters with a space;
      2. split on whitespace and lower-case every token (empty strings that
         the split produces for leading/trailing whitespace are dropped);
      3. stem each token with the module-level `stemmer`;
      4. lemmatise each stemmed token with the module-level `lemmatiser`;
      5. if `ngram` is 2 or more, wrap the tokens in one '<s>' and one
         '</s>' marker and join every run of `ngram` consecutive tokens
         with single spaces.

    Apostrophe, stop-word and punctuation removal are not applied.

    Args:
        text: The review text as a string.
        ngram: Integer size of the n-grams to return. 1 (the default)
            returns the individual tokens; 2 returns bigrams, 3 trigrams,
            and so on.

    Returns:
        A list of strings: the processed tokens, or the space-joined
        n-grams built from them.
    """
    # adds space before punctuation signs
    text = re.sub(r"(\w)([.,;:!?'\"”\)])", r"\1 \2", text)
    # adds space after punctuation signs
    text = re.sub(r"([.,;:!?'\"“\(])(\w)", r"\1 \2", text)
    # split on runs of whitespace and lower-case; re.split yields '' at
    # either end when the text starts or ends with whitespace, so those
    # entries are filtered out
    tokens = [t.lower() for t in re.split(r"\s+", text) if t]

    # stem the tokens
    tokens = [stemmer.stem(token) for token in tokens]

    # lemmatise the tokens
    tokens = [lemmatiser.lemmatize(token) for token in tokens]

    # build n-grams over the token list padded with start/end markers
    if ngram >= 2:
        padded = ['<s>'] + tokens + ['</s>']
        # a padded list shorter than ngram gives an empty range, hence []
        tokens = [' '.join(padded[i:i + ngram])
                  for i in range(len(padded) - ngram + 1)]

    # returns the produced tokens
    return tokens

In [47]:
# a function to convert tokens to feature vectors
# returns a dictionary containing features as keys
# and weights as values
# this updated function returns weights instead of counts
def toFeatureVector_updated(tokens):
    """Convert tokens into relative-frequency weights.

    Args:
        tokens: Iterable of hashable tokens.

    Returns:
        A dict mapping each distinct token to its count divided by the
        total number of tokens. An empty input gives an empty dict.
    """
    # count the occurrences of each token
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    # the total is computed once, outside the comprehension, so turning
    # counts into weights stays a single linear pass
    total = sum(counts.values())

    # for every item in the dictionary, turn counts into weights
    return {token: count/total for (token, count) in counts.items()}

#### Rerun cross-validation

In [48]:
# Second run: same data, but with the updated pre-processing and weighted features
# initialize global lists that will be appended to by the methods below
# (resetting them discards the samples built by the previous run)
rawData = []
trainData = []
testData = []

# input file location
reviewPath = './amazon_reviews.txt'

## Run the functions
print(f"Preparing the dataset")

# load the data into rawData
loadData(reviewPath) 

# split data into 80% training and 20% test
# updated=True selects preProcess_updated / toFeatureVector_updated; ngram=1 keeps single tokens
splitData(0.8, True, 1)

print(f"Number of reviews in raw data: {len(rawData)}")
print(f"Number of reviews in training data: {len(trainData)}")
print(f"Number of reviews in test data: {len(testData)}")

# cross validate on training data
# note: crossValidate shuffles the list it is given, so trainData is reordered
cv_results = crossValidate(trainData)
# print the averaged scores over all folds
print("Results of cross validation:")
print("Precision: %f" % cv_results['precision'])
print("Recall: %f" % cv_results['recall'])
print("F1 score: %f" % cv_results['f1_score'])
print("Accuracy: %f" % cv_results['accuracy'])

Preparing the dataset
Number of reviews in raw data: 21000
Number of reviews in training data: 16800
Number of reviews in test data: 4200
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results of cross validation:
Precision: 0.652599
Recall: 0.652024
F1 score: 0.651830
Accuracy: 0.652024


#### Evaluate using the test set
We can now train the classifier on 80% of the data and evaluate its performance using the 20% of the data we left out earlier.

In [49]:
# train the classifier on the full training set (updated pre-processing, weighted features)
classifier = trainClassifier(trainData)
# get the ground-truth labels (second element of each sample) from the test data
testTrue = [t[1] for t in testData]
# classify the test data to get predicted labels
testPred = predictLabels(testData, classifier) 
# evaluate: weighted precision/recall/F-score plus plain accuracy
finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted')
accuracy = accuracy_score(testTrue, testPred)
print(f"Done training!")
# only the first three entries (precision, recall, F-score) are printed
print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])
print(f"Accuracy: {accuracy}")

Training Classifier...
Done training!
Precision: 0.636256
Recall: 0.636190
F Score:0.636147
Accuracy: 0.6361904761904762


### Looking beyond textual features of the review
The review data contains additional features (rating, verified purchase, product category, product ID, product title, review title), so I want to explore whether using any of them can improve the performance of the classifier.

Amended relevant functions below:

In [50]:
# amended loadData function
def loadData(path, Text=None,):
    """Read the tab-separated reviews file and append its rows to `rawData`.

    Every data row is converted with `parseReview` and appended to the
    module-level `rawData` list as an
    (Id, Text, Label, Verified, Category, ProductID) tuple. The header row
    (first column equal to "DOC_ID") and blank rows are skipped.

    Args:
        path: Location of the tab-delimited reviews file.
        Text: Unused; kept only so that existing calls keep working.

    Returns:
        None. The result is the rows appended to the global `rawData`.
    """
    # newline='' leaves line-ending handling to the csv module, which is
    # what the csv documentation asks for when a file is given to csv.reader
    with open(path, newline='') as f:
        reader = csv.reader(f, delimiter='\t')

        for line in reader:
            # csv.reader yields [] for a blank line; indexing it would raise
            if not line:
                continue
            if line[0] == "DOC_ID":  # skip the header
                continue
            # amended to include Verified, Category and ProductID
            (reviewId, reviewText, reviewLabel,
             verified, category, productID) = parseReview(line)
            rawData.append((reviewId, reviewText, reviewLabel,
                            verified, category, productID))

In [51]:
# amended parseReview function
def parseReview(reviewLine):
    """Turn one row of the reviews file into a six-field tuple.

    Args:
        reviewLine: Sequence of column values for one review. Column 0 is
            the document id, 1 the raw label, 3 the verified-purchase flag,
            4 the product category, 5 the product id and 8 the review text.

    Returns:
        (id, text, label, verified, category, productID) where id is an
        int, label is 'fake' for the raw label '__label1__' and 'real'
        otherwise, and verified is 1 for 'Y' and 0 otherwise.
    """
    docId = int(reviewLine[0])
    reviewText = reviewLine[8]

    # map the raw label to either fake or real
    if reviewLine[1] == '__label1__':
        label = 'fake'
    else:
        label = 'real'

    # verified purchase is stored as Y or N; encode it as 1 or 0
    if reviewLine[3] == 'Y':
        verified = 1
    else:
        verified = 0

    category = reviewLine[4]
    productID = reviewLine[5]

    # six values are returned for each review
    return (docId, reviewText, label, verified, category, productID)

In [52]:
# amended toFeatureVector_updated function
def toFeatureVector_updated(tokens, verified, category, productID):
    """Build a feature dict from token weights plus three review attributes.

    Args:
        tokens: Iterable of hashable tokens.
        verified: Value stored under the 'Verified' key.
        category: Value stored under the 'Category' key.
        productID: Value stored under the 'ProductID' key.

    Returns:
        A dict mapping each distinct token to its count divided by the
        total number of tokens, with the extra keys 'Verified', 'Category'
        and 'ProductID' set to the given values.
    """
    # count the occurrences of each token
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    # the total is computed once, outside the comprehension, so turning
    # counts into weights stays a single linear pass
    total = sum(counts.values())

    # for every item in the dictionary, turn counts into weights
    features = {token: count/total for (token, count) in counts.items()}

    # add the three non-textual features
    features['Verified'] = verified
    features['Category'] = category
    features['ProductID'] = productID

    return features

In [53]:
# amended splitData function
def splitData(percentage, ngram=1):
    """Fill `trainData` and `testData` from the six-field rows in `rawData`.

    `rawData` is treated as two halves. The first `percentage` share of each
    half is appended to `trainData` and the remainder of each half to
    `testData`, each as a (featureVector, label) pair.
    NOTE(review): presumably each half of the file holds one label, which
    would keep the split balanced; confirm against the data file.

    Args:
        percentage: Fraction of the data to use for training (e.g. 0.8).
        ngram: Forwarded to `preProcess_updated`; the default 1 keeps
            single tokens, matching a call without this argument.

    Returns:
        None. The global lists are appended to in place.
    """
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)

    def toSample(row):
        # one raw row -> (feature vector with the extra attributes, label)
        (_, Text, Label, Verified, Category, ProductID) = row
        tokens = preProcess_updated(Text, ngram)
        return (toFeatureVector_updated(tokens, Verified, Category, ProductID), Label)

    for row in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append(toSample(row))
    for row in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append(toSample(row))

I found that the additional features that improved the performance were:
- verified purchase
- product category
- product id

#### Rerun cross-validation

In [54]:
# Third run: same data, now with the Verified, Category and ProductID features
# initialize global lists that will be appended to by the methods below
# (resetting them discards the samples built by the previous run)
rawData = []
trainData = []
testData = []

# input file location
reviewPath = './amazon_reviews.txt'

## Run the functions
print(f"Preparing the dataset")

# load the data into rawData (amended loadData: six fields per review)
loadData(reviewPath) 

# split data into 80% training and 20% test (amended splitData)
splitData(0.8)

print(f"Number of reviews in raw data: {len(rawData)}")
print(f"Number of reviews in training data: {len(trainData)}")
print(f"Number of reviews in test data: {len(testData)}")

# cross validate on training data
# note: crossValidate shuffles the list it is given, so trainData is reordered
cv_results = crossValidate(trainData)
# print the averaged scores over all folds
print("Results of cross validation:")
print("Precision: %f" % cv_results['precision'])
print("Recall: %f" % cv_results['recall'])
print("F1 score: %f" % cv_results['f1_score'])
print("Accuracy: %f" % cv_results['accuracy'])

Preparing the dataset
Number of reviews in raw data: 21000
Number of reviews in training data: 16800
Number of reviews in test data: 4200
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Results of cross validation:
Precision: 0.807451
Recall: 0.806905
F1 score: 0.806816
Accuracy: 0.806905


#### Evaluation on test data

In [55]:
# train the classifier on the full training set (token weights plus extra features)
classifier = trainClassifier(trainData)
# get the ground-truth labels (second element of each sample) from the test data
testTrue = [t[1] for t in testData]
# classify the test data to get predicted labels
testPred = predictLabels(testData, classifier) 
# evaluate: weighted precision/recall/F-score plus plain accuracy
finalScores = precision_recall_fscore_support(testTrue, testPred, average='weighted')
accuracy = accuracy_score(testTrue, testPred)
print(f"Done training!")
# only the first three entries (precision, recall, F-score) are printed
print("Precision: %f\nRecall: %f\nF Score:%f" % finalScores[:3])
print(f"Accuracy: {accuracy}")

Training Classifier...
Done training!
Precision: 0.818593
Recall: 0.815000
F Score:0.814477
Accuracy: 0.815


## Conclusions

Below, I present the results of running the model on the test dataset using the base functions, the amended pre-processing and the additional features. Each time the notebook is run, the results might be slightly different: the train/test split itself is fixed, but cross-validation shuffles the training data at random before the final model is trained.

| Metrics | Base functions | Additional pre-processing | Additional features |
| --- | --- | --- | --- |
| Precision | 0.6036 | 0.6363 | 0.8186 |
| Recall | 0.6036 | 0.6362 | 0.8150 |
| F1 Score | 0.6036 | 0.6361 | 0.8144 |
| Accuracy | 0.6036 | 0.6362 | 0.815 |

Overall, the improvement is significant, particularly when additional features not from the text of the review were included. The accuracy of the classifier started at around 60% and improved to over 81%, an improvement of over 21 percentage points.

The full list of improvements made:
- stemming of tokens
- lemmatisation of tokens
- feature weighting of vectorised tokens
- inclusion of 'Verified purchase' as a feature
- inclusion of 'Product category' as a feature
- inclusion of 'Product ID' as a feature