In [24]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn import metrics as Skmet

In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path,encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            #print(line)
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [15]:
# QUESTION 1

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    #print(reviewLine)
    Id = reviewLine[0]
    Text = reviewLine[8]
    if reviewLine[1] == '__label2__':
        Label = 'real'
    else:
        Label = 'fake'
    return (Id, Text, Label)



In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION

# Input: a string of one review
def preProcess(text):
    # Should return a list of tokens
    temp = text.split(" ")
    #do more normalisation when I feel like it.
    
    return temp

In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    reviewDict = { }
    for token in tokens: #adding to the review dict
        if token in reviewDict:
            reviewDict[token] = reviewDict[token] + 1
        else:
            reviewDict[token] = 1
    
    for token in tokens:
        if token in featureDict:
            featureDict[token] = featureDict[token] + 1
        else:
            featureDict[token] = 1
    return reviewDict

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [28]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0,len(dataset),foldSize):
        tempDataSet = dataset[0:i] + dataset[i+foldSize:len(dataset)] # joins to parts of the list to form the dataset to test.
        # Replace by code that trains and tests on the 10 folds of data in the dataset
        print(tempDataSet[0:10]) 
        classifier = trainClassifier(tempDataSet)
        results = predictLabels(tempDataSet, classifier)
        precision, recall, f_score, support = Skmet.precision_recall_fscore_support(actualLabels,results);
        
        
    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [29]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

crossValidationResults = crossValidate(trainData, 10);

print(len(crossValidationResults[0]))

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21001 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21001 rawData, 16800 trainData, 4201 testData
Training Samples: 
16800
Features: 
89046
[({"I'm": 1, 'kind': 1, 'of': 2, 'a': 2, 'swimsuit': 1, 'addict.': 1, '': 1, 'So': 1, 'I': 1, 'use': 2, 'one': 1, 'half': 1, 'this': 1, 'to': 2, 'hang': 1, 'up': 1, 'my': 2, 'suits.<br': 1, '/>I': 2, 'the': 1, 'other': 1, 'side': 1, 'put': 1, 'away': 1, 'winter': 1, 'gear,': 1, 'like': 1, 'scarfs,': 1, 'mittens,': 1, 'gloves': 1, 'and': 1, 'few': 1, 'hats.<br': 1, "/>It's": 1, 'not': 1, 'even': 1, 'full': 1, 'yet,': 1, 'but': 1, 'it': 1, 'holds': 1, 'SO': 1, 'MUCH': 1, 'STUFF!!!<br': 1, 'love': 1, 'it!<br': 1, '/><br': 1, '/>-Liz<br': 1, '/>(writing': 1, 'from': 1, 'husbands': 1, 'account)': 1}, 'real'), ({'Amazing': 1, 'detail..Ken': 1, 'as': 3, 'a': 3, 'Coca': 2, 'Cola': 2, 'soda': 2, 'jerk': 1, 'the': 6, "50's": 2, 'referred': 1, 'to': 2, 'person': 1, 

NameError: name 'actualLabels' is not defined