In [1]:
import csv                               # csv reader
import nltk
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn import metrics as Skmet #used for precision_recall_fscore_support()
from operator import itemgetter #used to unpack turples
import re
import math

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path,encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        count = 0
        for line in reader:
            if count == 0:  #edit to remove the headers from the text data.
                count += 1
            else:
                (Id, Text, Label, Verified) = parseReview(line)
                rawData.append((Id, Text, Label, Verified))
                preprocessedData.append((Id, preProcess(Text), Label, Verified))
                count += 1
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label, Verified) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label, Verified) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [3]:
# QUESTION 1

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    #print(reviewLine)
    Id = reviewLine[0] #Assigns the ID, review text and alters the label. #ex5 extension: the edition of Verified.
    Text = reviewLine[8] #
    Verified = reviewLine[3] 
    if reviewLine[1] == '__label2__':
        Label = 'real'
    else:
        Label = 'fake'
    
    return (Id, Text, Label, Verified)



In [10]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    #I noticed html tags in the text. Often joining words without a space
    #so all <*> occurances are replaced with " "
    text = text.lower() #normalize the text 
    text = re.sub(r"<.*>", " ", text) # html tag removal
    text = re.sub(r".\'", "" , text) # to prevent 'don't' becoming 'don' 't'
    tokenizer = RegexpTokenizer(r'\w+')
    preNormTemp = tokenizer.tokenize(text) #applying regex tokenizer to remove punctuation.

    lemmatizer = WordNetLemmatizer()
    
    
    
    temp = []
    stopWords = set(stopwords.words('english')) #stop word removal.
    for w in preNormTemp:
        if (w not in stopWords):
            temp.append(lemmatizer.lemmatize(w))
    
    return temp

In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    reviewDict = {}
    for token in tokens: #adding to the review dict
        if token in reviewDict:
            reviewDict[token] = reviewDict[token] + 1
        else:
            reviewDict[token] = 1
    
    reviewDict["reviewLength"] = len(tokens) # ex5 adds the review length feature with weight = to number of tokens in review
    
    for token in tokens: #adds values to featureDict
        if token in featureDict:
            featureDict[token] = featureDict[token] + 1
        else:
            featureDict[token] = 1
    return reviewDict

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    #print((dataset[0:1000]))
    cv_results = []
    temp = []
    totalPrecision = 0
    totalRecall = 0
    totalFScore = 0
    totalAccuracy = 0
    foldSize = int(len(dataset)/folds) #the fold size
    for i in range(0,len(dataset),foldSize):
        tempDataSet = dataset[0:i] + dataset[i+foldSize:len(dataset)] # joins to parts of the list to form the dataset to test.
        testingSet = dataset[i:i+foldSize] #takes the fold size an i to find the current test data.
        trueLabels = list(map(itemgetter(1), testingSet)) #creates a 1D array of result labels
        
        #training
        classifier = trainClassifier(tempDataSet) #classifier using 9/10th of the dataset
        #classifing
        results = predictLabels(testingSet, classifier) #predictLabels using 1/10th of the dataset
        #print(results[0:10])        
        #print(trueLabels[0:10])
        
        precision = Skmet.precision_score(trueLabels, results,pos_label="fake") #finds precision score
        recall = Skmet.recall_score(trueLabels, results,pos_label="fake") #finds recall score
        f_score = 2 * (precision * recall)/(precision + recall) #calculates f_score
        accuracy = Skmet.accuracy_score(trueLabels,results) #calculate accuracy
        totalPrecision += precision
        totalRecall += recall
        if math.isnan(f_score): #if f_score not a number won't add it to the total
            totalFScore += 0
        else:
            totalFScore += f_score
        totalAccuracy += accuracy
    
    cv_results = (totalPrecision/folds,totalRecall/folds,totalFScore/folds,totalAccuracy/folds)
        
    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [11]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

#download nltk data
nltk.download("wordnet") #word set for lemmatizing
nltk.download("stopwords") #stop word dataset for stopword removal

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

crossValidationResults = crossValidate(trainData, 10);

print("Precision, Recall, Fscore, Accuracy")
print(crossValidationResults) # prints cross validation results.
    

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nickf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nickf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
67316
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...


  'precision', 'predicted', average, warn_for)


Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Precision, Recall, Fscore, Accuracy
(0.9, 0.9, 0.9, 0.9471428571428572)


#### Output Pre Question 4

Precision, Recall, Fscore, Accuracy
(0.30220238095238094, 0.5, 0.368599484952393, 0.30220238095238094)

Not great. The accuracy is very bad you would get better from randomly guessing the class. Better f_score than guessin the same for everything but still poor.

#### Question 4

Improve Preprocessing
    
   removed html tags mostly break tags
   
   The addition of tag removal reduced the feature count by 17815 from 89756 to 71941. Didn't improve the poor performance. So I decided to remove stop words.
   
        Precision, Recall, Fscore, Accuracy
        (0.3516666666666667, 0.7, 0.4680749513397764, 0.4523809523809523)

        
   I noticed it might be a good idea to make everything lowercase. Reduced the feature count from 71941 to 31528. This massively improved the classification.
   
        Precision, Recall, Fscore, Accuracy
        (0.8480952380952381, 1.0, 0.8982958400334422, 0.8480952380952381)
        
Stop word removal reduce the feature count from 31528 to 31384 and the results appeared to improve further. 
   
        Precision, Recall, Fscore, Accuracy
        (0.9480952380952381, 1.0, 0.964951768488746, 0.9480952380952381)
 
   Lemmatization
   
   I lemmatized the data this reduced the feature count further from 31528 to 27759. There was no apparent improvement. As the output was consistanly correctly classified bar +- 1.
   
        Precision, Recall, Fscore, Accuracy
        (1.0, 1.0, 1.0, 1.0)

#### Question 5

Length of the review text

    I was curious to see if the lenght of the text was a useful feature to consider. I created a feature key reviewLength that weight was the length of the review. 

Is the review from a varified purchaser

    Results
    

    

