In [1]:
import csv                               # csv reader
import nltk
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.stem import WordNetLemmatizer
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn import metrics as Skmet #used for precision_recall_fscore_support()
from operator import itemgetter #used to unpack turples
import re

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path,encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        count = 0
        for line in reader:
            if count == 0:  #edit to remove the headers from the text data.
                count += 1
            else:
                (Id, Text, Label) = parseReview(line)
                rawData.append((Id, Text, Label))
                preprocessedData.append((Id, preProcess(Text), Label))
                count += 1
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [3]:
# QUESTION 1

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    #print(reviewLine)
    Id = reviewLine[0]
    Text = reviewLine[8]
    if reviewLine[1] == '__label2__':
        Label = 'real'
    else:
        Label = 'fake'
    return (Id, Text, Label)



In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
def padWithBlank(found):
    if found == "%":
        return " % "
    elif found == "!":
        return " ! "
    elif found == "?":
        return " ? "
    
# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    #I noticed html tags in the text. Often joining words without a space
    #so all <*> occurances are replaced with " "
    text = re.sub(r"<.*>", " ", text) # html tag removal
    
    
    #Set a minium document frequency 2
    
    
    
    
    temp = text.split(" ")
    
    return temp

In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    reviewDict = { }
    for token in tokens: #adding to the review dict
        if token in reviewDict:
            reviewDict[token] = reviewDict[token] + 1
        else:
            reviewDict[token] = 1
    
    for token in tokens:
        if token in featureDict:
            featureDict[token] = featureDict[token] + 1
        else:
            featureDict[token] = 1
    return reviewDict

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0,len(dataset),foldSize):
        tempDataSet = dataset[0:i] + dataset[i+foldSize:len(dataset)] # joins to parts of the list to form the dataset to test.
        testingSet = dataset[i:i+foldSize] #takes the fold size an i to find the current test data.
        trueLabels = list(map(itemgetter(1), testingSet)) #creates a 1D array of result labels
        
        #training
        classifier = trainClassifier(tempDataSet)
        
        #classifing
        results = predictLabels(testingSet, classifier)
        
        precision = Skmet.precision_score(trueLabels, results,pos_label="fake")
        recall = Skmet.recall_score(trueLabels, results,pos_label="fake")
        f_score = 2 * (precision * recall)/(precision + recall)
        accuracy = Skmet.accuracy_score(trueLabels,results)
        cv_results.append((precision, recall, f_score, accuracy)) #create a turple for metrics for each fold
    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [9]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

#download lemminzer data
nltk.download("wordnet", "c:/nltk_data/")


# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

crossValidationResults = crossValidate(trainData, 10);

print("Precision, Recall, Fscore, Accuracy")
for i in range(0,len(crossValidationResults)):
    print(crossValidationResults[i])

[nltk_data] Downloading package wordnet to c:/nltk_data/...
[nltk_data]   Package wordnet is already up-to-date!


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
71941
Training Classifier...




Training Classifier...
Training Classifier...
Training Classifier...




Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Precision, Recall, Fscore, Accuracy
(0.49166666666666664, 1.0, 0.659217877094972, 0.49166666666666664)
(0.4928571428571429, 1.0, 0.6602870813397129, 0.4928571428571429)
(0.5023809523809524, 1.0, 0.6687797147385103, 0.5023809523809524)
(0.0, 0.0, nan, 0.0)
(0.5261904761904762, 1.0, 0.6895475819032763, 0.5261904761904762)
(0.4886904761904762, 1.0, 0.6565373850459816, 0.4886904761904762)
(0.0, 0.0, nan, 0.0)
(0.5196428571428572, 1.0, 0.6839012925969448, 0.5196428571428572)
(0.4988095238095238, 1.0, 0.6656076250992852, 0.4988095238095238)
(0.0, 0.0, nan, 0.0)


#### Output Pre Question 4

Precision, Recall, Fscore, Accuracy
(0.0, 0.0, 0.0, 0.4875) #This result gave a convergence warning so precision, recall and fscore set to 0.
(0.4886904761904762, 1.0, 0.6565373850459816, 0.4886904761904762)
(1.0, 1.0, 1.0, 1.0)
(0.5029761904761905, 1.0, 0.6693069306930693, 0.5029761904761905)
(1.0, 1.0, 1.0, 1.0)
(0.48333333333333334, 1.0, 0.6516853932584269, 0.48333333333333334)
(0.49047619047619045, 1.0, 0.65814696485623, 0.49047619047619045)
(1.0, 1.0, 1.0, 1.0)
(0.5023809523809524, 1.0, 0.6687797147385103, 0.5023809523809524)
(0.5125, 1.0, 0.6776859504132231, 0.5125)

Not great. The accuracy is very bad you would get this from random guessing the class. Slightly better than guessin the same for everything but still poor.

#### Question 4

Improve Preprocessing
    
    ####removed html tags (mainly <br/>) 
    
    The addition of tag removal reduced the feature count by 17815 from 89756 to 71941
(1.0, 1.0, 1.0, 1.0)
(0.0, 0.0, 0.0, 0.0)
(0.48273809523809524, 1.0, 0.65114411882778, 0.48273809523809524)
(0.0, 0.0, 0.0, 0.0)
(1.0, 1.0, 1.0, 1.0)
(0.0, 0.0, 0.0, 0.0)
(0.5029761904761905, 1.0, 0.6693069306930693, 0.5029761904761905)
(0.5029761904761905, 1.0, 0.6693069306930693, 0.5029761904761905)
(0.0, 0.0, 0.0, 0.0)
(0.49642857142857144, 1.0, 0.6634844868735084, 0.49642857142857144)
    
    lemmatisation

Normalising Punctuation
    
    
    
Alternatives to Unigrams

    Results
    
Change Class Weights

    Results


#### Question 5

Is the review from a varified purchaser

    Results
    
Length of the review text

    Results
    

