In [26]:
import csv                               # csv reader
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
import numpy as np


In [2]:
# load data from a file and append it to the rawData
def loadData(path, Text=None):
    with open(path) as f:
        reader = csv.reader(f, delimiter='\t')
        # We want to skip the first line as it is the column title
        next(reader)
        for line in reader:
            (Id, Text, Label) = parseReview(line)
            rawData.append((Id, Text, Label))
            preprocessedData.append((Id, preProcess(Text), Label))
        
def splitData(percentage):
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (_, Text, Label) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainData.append((toFeatureVector(preProcess(Text)),Label))
    for (_, Text, Label) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testData.append((toFeatureVector(preProcess(Text)),Label))

In [3]:
# QUESTION 1

# Convert line from input file into an id/text/label tuple
def parseReview(reviewLine):
    # Should return a triple of an integer, a string containing the review, and a string indicating the label
    
    id = int(reviewLine[0])
    text = str(reviewLine[8])
    label = str(reviewLine[1])
    #print(id,text,label)
    #print(type(id),type(text),type(label))
    return (id, text, label)


In [4]:
# TEXT PREPROCESSING AND FEATURE VECTORIZATION
import contractions
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')

# Input: a string of one review
def preProcess(text):
    # Initialisation steps:
    new_text = []
    stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    
    # replace i'd with i would and other similar contractions
    text = contractions.fix(text) 
    # split by whitespace
    tokens = text.split()
    for token in tokens:
        # Loop through words and remove capital letters
        new_token = token.lower()
        # If token is a stop word we don't want to include it
        if new_token in stop_words:
            continue;
        # Use the porter algorithm to stem the word e.g. rationalise -> rational
        new_text.append(porter.stem(new_token))
        
    # Should return a list of tokens
    return new_text

[nltk_data] Downloading package stopwords to /home/leem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# QUESTION 2
featureDict = {} # A global dictionary of features

def toFeatureVector(tokens):
    # Should return a dictionary containing features as keys, and weights as values
    featureVec = {}
    #TODO add more complex weighting system
    for token in tokens:
        # For each token, we want to increment the global feature count
        #print('token:',token)
        if token in featureDict:
            featureDict[token] += 1
        else:
            featureDict[token] = 1
        # We want to add the token to the dictionary to create a simple vector
        if token in featureVec:
            featureVec[token] += 1
        else:
            featureVec[token] = 1

    return featureVec

In [6]:
# TRAINING AND VALIDATING OUR CLASSIFIER
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

In [42]:
# QUESTION 3

def crossValidate(dataset, folds):
    shuffle(dataset)
    cv_results = []
    foldSize = int(len(dataset)/folds)
    for i in range(0,len(dataset),foldSize):
        testingData = dataset[i:i+foldSize]
        trainingData = dataset[:i] + dataset[i+foldSize:]
        classifier = trainClassifier(trainingData)
        yPred = predictLabels(testingData, classifier)
        yTrue = [x[1] for x in testingData]
        print(yPred)
        print('*******************************')
        print(yTrue)
        cv_results.append(precision_recall_fscore_support(yTrue, yPred))
    return cv_results

In [8]:
# PREDICTING LABELS GIVEN A CLASSIFIER

def predictLabels(reviewSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), reviewSamples))

def predictLabel(reviewSample, classifier):
    return classifier.classify(toFeatureVector(preProcess(reviewSample)))

In [9]:
# MAIN

# loading reviews
rawData = []          # the filtered data from the dataset file (should be 21000 samples)
preprocessedData = [] # the preprocessed reviews (just to see how your preprocessing is doing)
trainData = []        # the training data as a percentage of the total dataset (currently 80%, or 16800 samples)
testData = []         # the test data as a percentage of the total dataset (currently 20%, or 4200 samples)

# the output classes
fakeLabel = 'fake'
realLabel = 'real'

# references to the data files
reviewPath = 'amazon_reviews.txt'

## Do the actual stuff
# We parse the dataset and put it in a raw data list
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing the dataset...",sep='\n')
loadData(reviewPath) 
# We split the raw dataset into a set of training data and a set of test data (80/20)
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Preparing training and test data...",sep='\n')
splitData(0.8)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainData), len(testData)),
      "Training Samples: ", len(trainData), "Features: ", len(featureDict), sep='\n')

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Now 21000 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 21000 rawData, 16800 trainData, 4200 testData
Training Samples: 
16800
Features: 
68481


In [None]:
#TODO fix the classifier as it's classifying everything as label2

In [43]:
crossValidate(trainData,10)

Training Classifier...
['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2_

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


['__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__label2__', '__la

  'precision', 'predicted', average, warn_for)


[(array([0.        , 0.50416667]),
  array([0., 1.]),
  array([0.        , 0.67036011]),
  array([833, 847])),
 (array([0.        , 0.50952381]),
  array([0., 1.]),
  array([0.        , 0.67507886]),
  array([824, 856])),
 (array([0.        , 0.48333333]),
  array([0., 1.]),
  array([0.        , 0.65168539]),
  array([868, 812])),
 (array([0.    , 0.4875]),
  array([0., 1.]),
  array([0.        , 0.65546218]),
  array([861, 819])),
 (array([0.        , 0.50119048]),
  array([0., 1.]),
  array([0.        , 0.66772403]),
  array([838, 842])),
 (array([0.       , 0.5047619]),
  array([0., 1.]),
  array([0.        , 0.67088608]),
  array([832, 848])),
 (array([0.        , 0.48869048]),
  array([0., 1.]),
  array([0.        , 0.65653739]),
  array([859, 821])),
 (array([0.        , 0.49583333]),
  array([0., 1.]),
  array([0.        , 0.66295265]),
  array([847, 833])),
 (array([0.        , 0.50297619]),
  array([0., 1.]),
  array([0.        , 0.66930693]),
  array([835, 845])),
 (array([0.

NameError: name 'yTrue' is not defined