In [1]:
import nltk.classify.util, nltk.metrics
import pandas as pd
import random
import collections
from nltk.classify import NaiveBayesClassifier
from nltk import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.metrics import *
from nltk.metrics.scores import accuracy, precision, recall, f_measure
                                     


Reading in the data with Pandas

In [2]:
path = ("C:\\Users\\Myles\\Documents\\Napier Data Science\\Wranging Assment\\Submission\\training.txt")
df = pd.read_table(path,header=None)

Rename the colums to something more meaningfull

In [4]:
df.columns = ['sentiment', 'text']

Function to format and clean the data

In [5]:
def extract_word_feats(sentence):
    sentence = sentence.lower() 
    words = nltk.word_tokenize(sentence)
    stemmer = nltk.PorterStemmer()
    words = [word for word in words if word not in stopwords.words('english')]
    words = [stemmer.stem(word) for word in words]
    return dict([(word, True) for word in words])


Construct a very simple dictionary which maps a feature name (word) to True if the word exists in the data. Test the function on sample text

In [6]:
extract_word_feats("blab and if swimming Test sentence")

{'blab': True, 'sentenc': True, 'swim': True, 'test': True}

Naive Bayes is a generative classifier, i.e. it builds a model of each class and given an observation, it returns the class most likely to have generated the observation. 



In [7]:
neg = df.loc[df['sentiment'] == 0]
pos = df.loc[df['sentiment'] == 1]

The Naive Bayes classifier training method expects to be given a list of tokens in the form of [(feats, label)] where feats is a feature dictionary and label is the classification label. In our case, feats will be of the form {word: True} and label will be one of ‘pos’ or ‘neg’.Creating feature-label pairs where the features will be a feature dictionary in the form of {word: True} and the label is either a "pos" or a "neg" label. 

In [8]:
negreviews = [(extract_word_feats(f), 'neg') for f in neg['text']]
posreviews = [(extract_word_feats(f), 'pos') for f in pos['text']]

Combine the reviews into one dataset

In [9]:
data = negreviews + posreviews

Randomise the data

In [10]:
random.shuffle(data)

Split data into training and test with a 80%training 20% test split

In [12]:
len(data)* 0.8 # multipling the length by 0.8 to get 80% of data


5534.400000000001

In [80]:
training = data[:5534]
test = data[5535:]

Applying Naive Bayes classifier and create classifer variable

In [81]:
classifier = NaiveBayesClassifier.train(training)


Using Accuracy metric

In [82]:
nltk.classify.util.accuracy(classifier, test)

0.9768618944323934

Pre-proscessing for alternative metrics

In [83]:
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

In [84]:
for i, (feats, label) in enumerate(test):
    refsets[label].add(i)
    predicted = classifier.classify(feats)
    testsets[predicted].add(i)

Applying alternate accuracy measures within the NLTK package Precision, recall and the combined F-measure score.

In [85]:
print('pos precision:', nltk.precision(refsets['pos'], testsets['pos']))
print('pos recall:', nltk.recall(refsets['pos'], testsets['pos']))
print('pos F-measure:', nltk.f_measure(refsets['pos'], testsets['pos']))
print('neg precision:', nltk.precision(refsets['neg'], testsets['neg']))
print('neg recall:', nltk.recall(refsets['neg'], testsets['neg']))
print('neg F-measure:', nltk.f_measure(refsets['neg'], testsets['neg']))


pos precision: 0.9744245524296675
pos recall: 0.9844961240310077
pos F-measure: 0.9794344473007711
neg precision: 0.9800332778702163
neg recall: 0.9671592775041051
neg F-measure: 0.9735537190082646
