In [1]:
import csv                               # csv reader
import nltk
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn import metrics as Skmet #used for precision_recall_fscore_support()
from operator import itemgetter #used to unpack turples
import re
import math

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path,encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        count = 0
        for line in reader:
            if count == 0:  #edit to remove the headers from the text data.
                count += 1
            else:
                (Id, Text, Label, Verified, Title) = parseReview(line)
                rawData.append((Id, Text, Label, Verified, Title))
                preprocessedData.append((Id, preProcess(Text), Label, Verified, preProcess(Title)))
                count += 1
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label, Verified, Title) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text),preProcess(Title),Verified),Label))
    for (_, Text, Label, Verified, Title) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text),preProcess(Title),Verified),Label))

In [3]:
# QUESTION 1

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    #print(reviewLine)
    Id = reviewLine[0] #Assigns the ID, review text and alters the label. #ex5 extension: the edition of Verified.
    Text = reviewLine[8] #
    Verified = reviewLine[3] 
    if reviewLine[1] == '__label2__':
        Label = 'real'
    else:
        Label = 'fake'
    Title = reviewLine[7]
    
    return (Id, Text, Label, Verified, Title)



In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    #I noticed html tags in the text. Often joining words without a space
    #so all <*> occurances are replaced with " "
    text = text.lower() #normalize the text 
    text = re.sub(r"<.*>", " ", text) # html tag removal
    text = re.sub(r".\'", "" , text) # to prevent 'don't' becoming 'don' 't'
    tokenizer = RegexpTokenizer(r'\w+')
    preNormTemp = tokenizer.tokenize(text) #applying regex tokenizer to remove punctuation.

    #lemmatizer = WordNetLemmatizer()
     
    #temp = []
    #stopWords = set(stopwords.words('english')) #stop word removal.
    #for w in preNormTemp:
    #    if (w not in stopWords):
    #        temp.append(w)#lemmatizer.lemmatize(w))
    #temp = text.split(" ")
        
    return preNormTemp

In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens,titles,verified):
    # Should return a dictionary containing features as keys, and weights as values
    reviewDict = {}
    for token in tokens: #adding to the review dict
        if token in reviewDict:
            reviewDict[token] = reviewDict[token] + 1
        else:
            reviewDict[token] = 1
    
    for title in titles:
        if title in reviewDict:
            reviewDict[title] = reviewDict[title] + 1
        else:
            reviewDict[title] = 1
    
    if verified == "Y":
        reviewDict["VerifiedToken"] = 1
    
    reviewDict["reviewLength"] = len(tokens) # ex5 adds the review length feature with weight = to number of tokens in review
    
    for token in tokens: #adds values to featureDict
        if token in featureDict:
            featureDict[token] = featureDict[token] + 1
        else:
            featureDict[token] = 1
    
    return reviewDict

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [7]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    #print(dataset[0])
    cv_results = []
    temp = []
    totalPrecision = 0
    totalRecall = 0
    totalFScore = 0
    totalAccuracy = 0
    foldSize = int(len(dataset)/folds) #the fold size
    for i in range(0,len(dataset),foldSize):
        tempDataSet = dataset[0:i] + dataset[i+foldSize:len(dataset)] # joins to parts of the list to form the dataset to test.
        testingSet = dataset[i:i+foldSize] #takes the fold size an i to find the current test data.
        trueLabels = list(map(itemgetter(1), testingSet)) #creates a 1D array of result labels
        testingSetRemovedLabel = list(map(itemgetter(0),testingSet))
        
        #print(testingSetRemovedLabel)
        #training
        classifier = trainClassifier(tempDataSet) #classifier using 9/10th of the dataset
        #classifing
        
        results = []
        for i in testingSetRemovedLabel:
            results.append(predictLabel(i,classifier))
        
        #predictLabels using 1/10th of the dataset
        #print(results[0:10])        
        #print(trueLabels[0:10])
        
        precision = Skmet.precision_score(trueLabels, results,pos_label="fake") #finds precision score
        recall = Skmet.recall_score(trueLabels, results,pos_label="fake") #finds recall score
        f_score = 2 * (precision * recall)/(precision + recall) #calculates f_score
        accuracy = Skmet.accuracy_score(trueLabels,results) #calculate accuracy
        totalPrecision += precision
        totalRecall += recall
        if math.isnan(f_score): #if f_score not a number won't add it to the total
            totalFScore += 0
        else:
            totalFScore += f_score
        totalAccuracy += accuracy
    
    cv_results = (totalPrecision/folds,totalRecall/folds,totalFScore/folds,totalAccuracy/folds)
        
    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify((reviewSample))

In [9]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

#download nltk data
nltk.download("wordnet") #word set for lemmatizing
nltk.download("stopwords") #stop word dataset for stopword removal

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

crossValidationResults = crossValidate(trainData, 10);

print("Precision, Recall, Fscore, Accuracy")
print(crossValidationResults) # prints cross validation results.
    

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nickf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nickf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
31528
Training Classifier...




Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Precision, Recall, Fscore, Accuracy
(0.6123057913391206, 0.6182430884511234, 0.600991956523832, 0.607202380952381)


#### Output Pre Question 4

Precision, Recall, Fscore, Accuracy
(0.6436093620282846, 0.6562403435435953, 0.6497714381913016, 0.6463690476190477)

Not great. The accuracy is bad but better than randomly guessing the class and a better f_score than classifying the whole set as the same class for everything but still very poor.

#### Question 4

Improve Preprocessing
      
        I noticed it might be a good idea to make everything lowercase. Reduced the feature count from 98689 to 78556. This didn't increase the effectiveness of the classification.
        
        Precision, Recall, Fscore, Accuracy
        (0.6354279957939459, 0.6546865208273466, 0.6446480291722718, 0.6392857142857142)

Remove html tags

        I removed html tags mostly break tags. The addition of tag removal reduced the feature count by 17815 from 78556 to 63038. Didn't improve the poor performance. 
   
        Precision, Recall, Fscore, Accuracy
        (0.6295785761348717, 0.6499272215428664, 0.6395140434003548, 0.633690476190476)
 
Punctuation Removal

        Next I removed all punctuation. First removing ' without replacing with a space to avoid "don't" to "don" "t". This reduced the features to 31528. The accuracy decreased further. I should probably maintain some punctuation.
      
        Precision, Recall, Fscore, Accuracy
        (0.6010034972565854, 0.6248868779572152, 0.612596200212048, 0.6049404761904762)      

Stop word removal

        Stop word removal reduce the feature count from 31528 to 31384. But none of the scores increased.
   
        Precision, Recall, Fscore, Accuracy
        (0.5941561103254681, 0.6171329835048187, 0.6052725735879014, 0.5976190476190477)
 
Lemmatization
   
       I lemmatized the data this reduced the feature count further from 31528 to 27759. There was a slight improvement over the previous version without lemmatization. 
   
        Precision, Recall, Fscore, Accuracy
        (0.5999308331787133, 0.6240195903442175, 0.6113552589768079, 0.6036904761904761)


#### Question 5

Length of the review text

    I was curious to see if the lenght of the text was a useful feature to consider. I created a feature key reviewLength that weight was the length of the review. There was an improvement however repeated tested didn't always result in the same values.
        Precision, Recall, Fscore, Accuracy
        (0.6123057913391206, 0.6182430884511234, 0.600991956523832, 0.607202380952381)
    

Is the review from a varified purchaser and the inclusion of a title feature.

    I modified crossvalidation to use predicate label so that I was able to pass in more features. These features were the title of the review and if the review was varified. I gave each word in the title a weight of 3 making the assumption that the title was likely to distill the point of the review because it is there to get readers to read the whole review therefore is representative of the review. I also gave the a weight of 5 if the review was varified. The results got much better after adding these features seeing a 13% improvement.
    
    Precision, Recall, Fscore, Accuracy
    (0.7302124656407247, 0.7394123539961068, 0.7346843002043812, 0.7333333333333333)

    I altered the weights to see if the accuracy returned. Verified 2 and 2 for words in the title. Improved it only very slightly.
    
    Precision, Recall, Fscore, Accuracy
    (0.7373699041694369, 0.7426404459824403, 0.7398504102592587, 0.7391666666666666)
    
    I removed the feature of the length of the text. As on inspection this was always the largest weight by a large amount.
  
    Precision, Recall, Fscore, Accuracy 
    (0.7335934192445652, 0.738770453224032, 0.7359985523759297, 0.7352976190476191)
    
    Further reduced the title weight and Verified value. 1 for the weight and if it was verified.

    Precision, Recall, Fscore, Accuracy Further Minior Improvements.
    (0.7416190589082572, 0.7522645420762297, 0.7467738416749492, 0.7451190476190476)