In [38]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
import numpy as np


In [19]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        # We want to skip the first line as it is the column title
        next(reader)
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [3]:
# QUESTION 1

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    
    id = int(reviewLine[0])
    text = str(reviewLine[8])
    label = str(reviewLine[1])
    #print(id,text,label)
    #print(type(id),type(text),type(label))
    return (id, text, label)


In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
import contractions
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')

# Input: a string of one review
def preProcess(text):
    # Initialisation steps:
    #print(text)
    new_text = []
    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    
    # replace i'd with i would and other similar contractions
    text = contractions.fix(text) 
    # split by whitespace
    tokens = text.split()
    for token in tokens:
        # Loop through words and remove capital letters
        new_token = token.lower()
        # If token is a stop word we don't want to include it
        if new_token in stop_words:
            continue;
        # Use the porter algorithm to stem the word e.g. rationalise -> rational
        new_text.append(porter.stem(new_token))
        
    # Should return a list of tokens
    return new_text

[nltk_data] Downloading package stopwords to /home/leem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    featureVec = {}
    #TODO add more complex weighting system
    for token in tokens:
        # For each token, we want to increment the global feature count
        #print('token:',token)
        if token in featureDict:
            featureDict[token] += 1
        else:
            featureDict[token] = 1
        # We want to add the token to the dictionary to create a simple vector
        if token in featureVec:
            featureVec[token] += 1
        else:
            featureVec[token] = 1

    return featureVec

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [54]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    data = prepData(dataset)
    # Initialise an empty confusion matrix
    totalConfusionMatrix = np.zeros((2,2))
    
    testDataLen = int(len(data)/folds)
    for i in range(0,len(data),testDataLen):
        testEnd = i+testDataLen
        testingData = data[i:testEnd]
        trainingData = data[:i] + data[testEnd:]
        #print("Split:", i, "No train data:",len(trainingData),"No test data:",len(testingData),"Scores below")
        classifier = trainClassifier(trainingData)
        yPred = [predictVector(x[0], classifier) for x in testingData]
        yTrue = [x[1] for x in testingData]
        #results = precision_recall_fscore_support(yTrue, yPred)
        confusionMatrix = confusion_matrix(yTrue,yPred)
        totalConfusionMatrix = np.add(confusionMatrix, totalConfusionMatrix)
        
    #print(totalConfusionMatrix)
    averagePrecision = totalConfusionMatrix[0][0] / (totalConfusionMatrix[0][0] + totalConfusionMatrix[0][1])
    averageRecall = totalConfusionMatrix[0][0] / (totalConfusionMatrix[0][0] + totalConfusionMatrix[1][0])
    averageF1Score = 2*averagePrecision*averageRecall / (averagePrecision + averageRecall)
    averageAccuracy = (totalConfusionMatrix[0][0] + totalConfusionMatrix[1][1])/ float(np.sum(totalConfusionMatrix))
    cv_results = (averagePrecision,averageRecall,averageF1Score,averageAccuracy)
    
    return cv_results

def prepData(data):
    newData = []
    for (_, Text, Label) in data:
        newData.append((toFeatureVector(preProcess(Text)),Label))
    return newData

In [24]:
# PREDICTING LABELS GIVEN A CLASSIFIER

# Takes in a list of strings as review samples and returns list of predictions
def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

# Takes in string as a review sample and returns a prediction
def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

# More efficient to use lambda function and simply pass the vector to this predict func
def predictVector(textVec, classifier):
    return classifier.classify(textVec)

In [20]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
68481


In [23]:
#TODO fix the classifier as it's classifying everything as label2
testThing = trainData[-1:] + trainData[:1]
#testThing.append(trainData[:1000])
#simpleClassifier = trainClassifier(trainData)
print(testThing[0][0])
#yTest = testThing
#rawthing = [parseReview(x) for x in rawData]
#simpleClassifier = trainClassifier(rawthing)
#predictLabel(trainData[0][0], simpleClassifier)
#predictLabel(rawData[0][1],simpleClassifier)


{'realiz': 1, 'hen': 1, 'compani': 1, 'come': 1, 'look': 1, 'orient': 1, 'fit': 1, 'would': 1, 'differ': 1, 'ethnic': 1, 'bodi': 1, 'type,': 1, 'even': 1, 'though': 1, 'order': 1, 'medium': 2, 'larg': 1, 'may': 1, 'small': 1, 'american': 1, 'standards--': 1, 'small.': 1, 'need': 1, 'send': 1, 'back': 1}


In [55]:
myResults = crossValidate(rawData,10)
print(f"Precision: {myResults[0]} Recall: {myResults[1]} F1Score: {myResults[2]} Accuracy: {myResults[3]}")

Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Training Classifier...
Precision: 0.6148571428571429 Recall: 0.5927830318611698 F1Score: 0.6036183441634332 Accuracy: 0.5962380952380952


Without stemming words, removal of stop words and lowering of word case, the model achieved:

With stemming, removal of stop words and lowering of word case the model achieved: 
Precision: 0.6148571428571429 Recall: 0.5927830318611698 F1Score: 0.6036183441634332 Accuracy: 0.5962380952380952