In [29]:
import sys
if sys.version_info[0] < 3:
    raise Exception("Must be using Python 3. This notebook was created on 3.6.5")

import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from random import randint
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
import numpy as np


In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        # We want to skip the first line as it is the column title
        next(reader)
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [3]:
# QUESTION 1

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    
    id = int(reviewLine[0])
    text = str(reviewLine[8])
    label = str(reviewLine[1])
    return (id, text, label)


In [1]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
import contractions
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
from nltk.tokenize import RegexpTokenizer
from bs4 import BeautifulSoup
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Input: a string of one review
def preProcess(text):
    #Initialisation steps:
    new_text = []
    
    porter = PorterStemmer()
    
    # remove html tags
    text = BeautifulSoup(text,"lxml").get_text()
    # replace i'd with i would and other similar contractions
    text = contractions.fix(text)
    
    # split by whitespace and remove non-alphanumeric characters like punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    
    for token in tokens:
        
        # Loop through words and remove capital letters
        new_token = token.lower()
        
        # If token is a stop word we don't want to include it
        if new_token in stop_words:
            continue;
        
        # Use the porter algorithm to stem the word e.g. rationalise -> rational
        new_text.append(porter.stem(new_token))
    
    # Should return a list of tokens
    return new_text

[nltk_data] Downloading package stopwords to /home/leem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    featureVec = {}
    
    for token in tokens:
        # For each token, we want to increment the global feature count
        if token in featureDict:
            featureDict[token] += 1
        else:
            featureDict[token] = 1
        # We want to add the token to the dictionary to create a simple vector
        if token in featureVec:
            featureVec[token] += 1
        else:
            featureVec[token] = 1

    return featureVec

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    #print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC(max_iter=1000,C=globalCostCoeff))])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
# QUESTION 3

def crossValidate(dataset, folds):
    # Shuffle the data so we will split the data randomly
    shuffle(dataset)
    
    # Perform preprocessing and vectorisation on the data
    data = prepData(dataset)
    
    # Initialise an empty confusion matrix
    totalConfusionMatrix = np.zeros((2,2))
    
    # Get the length of the testData slice 
    testDataLen = int(len(data)/folds)
    
    # For each slice of the data
    for i in range(0,len(data),testDataLen):
        testEnd = i+testDataLen
        
        #Slice up the data into test and training
        testingData = data[i:testEnd]
        trainingData = data[:i] + data[testEnd:]
        
        #print("Split:", i, "Len train data:",len(trainingData),"Len test data:",len(testingData),"Scores below")
        
        # Train the model then get its predictions 
        classifier = trainClassifier(trainingData)
        yPred = [predictVector(x[0], classifier) for x in testingData]
        
        # Get the true labels
        yTrue = [x[1] for x in testingData]
        
        # Get a confusion matrix to describe the performance of this fold
        confusionMatrix = confusion_matrix(yTrue,yPred)
        
        # Add to the overall confusion matrix. This will allow us to get average performance metrics later
        totalConfusionMatrix = np.add(confusionMatrix, totalConfusionMatrix)
        
    print(totalConfusionMatrix)
    
    # Get the average performance metrics from overall confusion matrix
    averagePrecision = totalConfusionMatrix[0][0] / (totalConfusionMatrix[0][0] + totalConfusionMatrix[0][1])
    averageRecall = totalConfusionMatrix[0][0] / (totalConfusionMatrix[0][0] + totalConfusionMatrix[1][0])
    averageF1Score = 2*averagePrecision*averageRecall / (averagePrecision + averageRecall)
    averageAccuracy = (totalConfusionMatrix[0][0] + totalConfusionMatrix[1][1])/ float(np.sum(totalConfusionMatrix))
    
    # Return results in a tuple
    cv_results = (averagePrecision,averageRecall,averageF1Score,averageAccuracy)
    return cv_results

def prepData(data):
    newData = []
    for (_, Text, Label) in data:
        newData.append((toFeatureVector(preProcess(Text)),Label))
    return newData

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

# Takes in a list of strings as review samples and returns list of predictions
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

# Takes in string as a review sample and returns a prediction
def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

# Easier to pass the vector to this predict func
def predictVector(textVec, classifier):
    return classifier.classify(textVec)

In [9]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
24768


In [10]:
#Cell for sense-checking the preprocessing function
example = rawData[randint(0,len(rawData))][1]
print(example)
print("After preprocessing")
print(preProcess(example))
print(len(preProcess(example)))


This book is a great source of information for those studying to take the medical administrative assistant exam, as it's incredibly knowledgeable and full of insightful resources. The author really knows her stuff! There are so many informative questions to help study from, and one of the best study guides I've ever looked into. Well done!
After preprocessing
['book', 'great', 'sourc', 'inform', 'studi', 'take', 'medic', 'administr', 'assist', 'exam', 'incred', 'knowledg', 'full', 'insight', 'resourc', 'author', 'realli', 'know', 'stuff', 'mani', 'inform', 'question', 'help', 'studi', 'one', 'best', 'studi', 'guid', 'ever', 'look', 'well', 'done']
32


In [11]:

bestF1 = 0
# intialise list of candidate cost coefficients
costCoeffs = [0.0001,0.001,0.01,0.1,1.0,1.5]

for c in costCoeffs:
    # For each candidate cost coefficient, print the confusion matrix and the performance metrics
    globalCostCoeff = c
    print('Cost coeff:', globalCostCoeff)
    myResults = crossValidate(rawData,10)
    print(f"Precision: {myResults[0]} Recall: {myResults[1]} F1Score: {myResults[2]} Accuracy: {myResults[3]}")
    
    # Determine the best cost coefficient based off of F1Score. 
    # Depending on the objectives of the business, a different metric might be better e.g. recall.
    if myResults[2] > bestF1:
        bestF1 = myResults[2]
        bestC = c

print("Best costCoeff:",bestC)
    

Cost coeff: 0.0001
[[6773. 3727.]
 [4330. 6170.]]
Precision: 0.6450476190476191 Recall: 0.6100153111771593 F1Score: 0.6270425403879091 Accuracy: 0.6163333333333333
Cost coeff: 0.001
[[7065. 3435.]
 [4191. 6309.]]
Precision: 0.6728571428571428 Recall: 0.6276652452025586 F1Score: 0.6494760066188637 Accuracy: 0.6368571428571429
Cost coeff: 0.01
[[7014. 3486.]
 [4132. 6368.]]
Precision: 0.668 Recall: 0.6292840480890005 F1Score: 0.6480643074933012 Accuracy: 0.6372380952380953
Cost coeff: 0.1
[[6943. 3557.]
 [4192. 6308.]]
Precision: 0.6612380952380953 Recall: 0.6235294117647059 F1Score: 0.6418303674601341 Accuracy: 0.631
Cost coeff: 1.0
[[6714. 3786.]
 [4407. 6093.]]
Precision: 0.6394285714285715 Recall: 0.6037226868087402 F1Score: 0.6210628555570973 Accuracy: 0.6098571428571429
Cost coeff: 1.5
[[6637. 3863.]
 [4430. 6070.]]
Precision: 0.632095238095238 Recall: 0.5997108520827686 F1Score: 0.6154773496545649 Accuracy: 0.6050952380952381
Best costCoeff: 0.001


Excercise 4:
I first tried to improve the accuracy by performing better pre-processing on the data. 
With only whitespace separating occuring in pre-processing the model achieved:
Precision: 0.62 Recall: 0.60 F1Score: 0.61 Accuracy: 0.61

With html tag removal, word stemming, removal of stop words, expansion of contractions and lowering of word case the model achieved: 
Precision: 0.64 Recall: 0.60 F1Score: 0.62 Accuracy: 0.61

The pre-processing resulted in a small increase in model precision but there is still a lot more to be done

I then tried tuning the SVC model to improve the model. I first increased the max iterations from the default 1000 to 10000 to see if there was an issue with the model not converging. This had a negligible effect on model performance and so I did not vary that parameter.

Changing the cost value caused a significant increase in model performance, to investigate this I then tried several values of the cost value to find the best. The best Cost coefficient was 0.01 which resulted in the following metrics:
Precision: 0.67 Recall: 0.63 F1Score: 0.65 Accuracy: 0.64
It should be noted that the C=0.01 and C=0.001 have very similar performance. This similarity, combined with the small random variations expected from cross-validation's data shuffle, means that repeated runs might choose C=0.001 instead of C=0.01.  

This smaller cost value than the default 1.0, means that the model learns more slowly, correcting itself with smaller increments. That can increase the number of iterations required to reach the best result but the smaller steps also mean it is less likely to overshoot a cost minima. 


In [26]:
#Exercise 5:
# I will include the stars in rating, the verified purchase and the product category in the model

def exercise5Parse(reviewLine):
    # Should return a tuple of id, the review text, the label, the star rating normalised, boolean verified purchase, product category
    id = int(reviewLine[0])
    text = str(reviewLine[8])
    label = str(reviewLine[1])
    #  normalising rating
    rating = float(int(reviewLine[2])/5)
    
    # Convert to int to ease processing
    verifiedPurchase = 1 if reviewLine[3] == 'Y' else 0
    
    # Will need to convert this to a feature later
    productCategory = str(reviewLine[5])
    
    return (id, text, label, rating, verifiedPurchase, productCategory)

def ex5PrepData(data):
    newData = []
    for (_, Text, Label, Rating, VerifiedPurchase, ProductCat) in data:
        fullFeaturesDict = toFeatureVector(preProcess(Text))
        
        # Add other features using * to avoid confusion with words
        fullFeaturesDict['*Rating'] = Rating
        fullFeaturesDict['*VerifiedPurchase'] = VerifiedPurchase
        
        # Here i add the product category in the same way as with any of the words
        fullFeaturesDict[ProductCat] = 1
                                        
        newData.append((fullFeaturesDict,Label))
    return newData

def ex5LoadData(path):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        # We want to skip the first line as it is the column title
        next(reader)
        for line in reader:
            (Id, Text, Label, Rating, VerifiedPurchase, ProductCat) = exercise5Parse(line)
            ex5RawData.append((Id, Text, Label, Rating, VerifiedPurchase, ProductCat))
            
def ex5CrossValidate(dataset, folds):
    # Shuffle the data so we will split the data randomly
    shuffle(dataset)
    
    # Perform preprocessing and vectorisation on the data
    data = ex5PrepData(dataset)
    
    # Initialise an empty confusion matrix
    totalConfusionMatrix = np.zeros((2,2))
    
    # Get the length of the testData slice 
    testDataLen = int(len(data)/folds)
    
    # For each slice of the data
    for i in range(0,len(data),testDataLen):
        testEnd = i+testDataLen
        
        #Slice up the data into test and training
        testingData = data[i:testEnd]
        trainingData = data[:i] + data[testEnd:]
        
        #print("Split:", i, "No train data:",len(trainingData),"No test data:",len(testingData),"Scores below")
        # Train the model then get its predictions 
        classifier = trainClassifier(trainingData)
        yPred = [predictVector(x[0], classifier) for x in testingData]
        
        # Use a lambda function to get the true labels
        yTrue = [x[1] for x in testingData]
        
        # Get a confusion matrix to describe the performance of this fold
        confusionMatrix = confusion_matrix(yTrue,yPred)
        
        # Add to the overall confusion matrix. This will allow us to get average performance metrics later
        totalConfusionMatrix = np.add(confusionMatrix, totalConfusionMatrix)
        
    print(totalConfusionMatrix)
    
    # Get the average performance metrics from overall confusion matrix
    averagePrecision = totalConfusionMatrix[0][0] / (totalConfusionMatrix[0][0] + totalConfusionMatrix[0][1])
    averageRecall = totalConfusionMatrix[0][0] / (totalConfusionMatrix[0][0] + totalConfusionMatrix[1][0])
    averageF1Score = 2*averagePrecision*averageRecall / (averagePrecision + averageRecall)
    averageAccuracy = (totalConfusionMatrix[0][0] + totalConfusionMatrix[1][1])/ float(np.sum(totalConfusionMatrix))
    
    # Return results in a tuple
    cv_results = (averagePrecision,averageRecall,averageF1Score,averageAccuracy)
    return cv_results            

ex5RawData = []
ex5LoadData(reviewPath)

ex5MyResults = ex5CrossValidate(ex5RawData,10)
print(f"Precision: {ex5MyResults[0]} Recall: {ex5MyResults[1]} F1Score: {ex5MyResults[2]} Accuracy: {ex5MyResults[3]}")




[[8275. 2225.]
 [2138. 8362.]]
Precision: 0.7880952380952381 Recall: 0.7946797272639969 F1Score: 0.7913737866398891 Accuracy: 0.7922380952380952


By including the three new features the performance of the model greatly increased to: 
Precision: 0.79 Recall: 0.79 F1Score: 0.79 Accuracy: 0.79

This increase is expected as additional data should always help the model. In particular, I expect that the verified purchase feature will have made the largest improvement as intuitively I think fake reviews are unlikely to make a genuine purchase of the item. 

I expect the star rating could also have helped because fake reviews would often be in the extremes. i.e. a 1 star ofr 5 star rating. linear SVM will only be able to make partial use of this, i.e. it will either determine that more stars increase the likelihood of a fake review or it will determine that fewer stars increases the likelihood of a fake review. It will not be able to say that reviews at either extreme are fake. To do that, we should consider using a different model or a non-linear svm such as a kernal SVM. 

The product category could be helping because it may be more common for people to leave fake reviews in some product categories, in particular ones where there is strong competition between manufacturers. 

                
