In [5]:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

import matplotlib.pyplot as plt
import scikitplot as skplt
import numpy
from sklearn.metrics import *
import time
import sys


In [6]:
def read_files(train_path=None, test_path=None):
    """Read the training and test corpora into lists of lines.

    Parameters
    ----------
    train_path : str, optional
        Path of the training file. When omitted, ``sys.argv[1]`` is used.
    test_path : str, optional
        Path of the test file. When omitted, ``sys.argv[2]`` is used.

    Returns
    -------
    tuple of (list of str, list of str)
        The lines of the training file and of the test file, in file order,
        each line keeping its trailing newline.

    Raises
    ------
    IndexError
        If a path is omitted and the matching ``sys.argv`` entry is missing.
    OSError
        If a file cannot be opened.
    """
    # Explicit paths let the function be called from a notebook cell, where
    # sys.argv holds the kernel's own arguments rather than data files.
    if train_path is None:
        train_path = sys.argv[1]
    if test_path is None:
        test_path = sys.argv[2]

    # readlines() takes an optional integer size hint, not a file name; the
    # file to read is already fixed by open().
    with open(train_path, 'r', encoding='utf-8') as train:
        trainData = train.readlines()

    with open(test_path, 'r', encoding='utf-8') as test:
        testData = test.readlines()

    return trainData, testData


NameError: name 'trainData' is not defined

In [7]:
def apply_stemmer(doc):
    """Stem every item of ``doc`` with a ``PorterStemmer``.

    Parameters
    ----------
    doc : iterable
        The items to stem; each one is passed to ``PorterStemmer.stem``.

    Returns
    -------
    list
        The stems, in the same order as the items of ``doc``.
    """
    porter = PorterStemmer()

    stems = []
    for token in doc:
        stems.append(porter.stem(token))
    return stems


In [8]:
def modify_corpus(data, use_sentiment):
    """Split corpus lines into stemmed token lists and their class labels.

    Each non-blank line is split on whitespace. Token 0 is the topic label,
    token 1 the sentiment label, and the document text starts at the 4th
    token (index 3).

    Parameters
    ----------
    data : iterable of str
        The raw corpus lines.
    use_sentiment : bool
        If truthy, labels are taken from token 1 (2-class problem:
        positive vs negative); otherwise from token 0 (6-class problem:
        books, camera, dvd, health, music, software).

    Returns
    -------
    tuple of (list, list of str)
        The stemmed documents (one ``apply_stemmer`` result per non-blank
        line) and the matching labels, in input order.

    Raises
    ------
    IndexError
        If a non-blank line has too few tokens to contain the label.
    """
    # Column that holds the label for the chosen classification problem.
    label_index = 1 if use_sentiment else 0

    documents = []
    labels = []

    for line in data:
        tokens = line.strip().split()  # tokenize the line

        if not tokens:
            # Blank line (e.g. a trailing empty line in the file): there is
            # no label to read, so skip it instead of failing on the lookup.
            continue

        documents.append(tokens[3:])  # the text starts from the 4th token
        labels.append(tokens[label_index])

    stemmed_documents = [apply_stemmer(doc) for doc in documents]

    return stemmed_documents, labels

In [11]:
# Show Distribution of Data
def distribution(trainClass, testClass):
    """Print the number of training and test labels per topic class.

    Only the six topic names listed below are counted; any other label
    value is ignored.

    Parameters
    ----------
    trainClass : iterable
        Labels of the training documents.
    testClass : iterable
        Labels of the test documents.

    Returns
    -------
    None
        The counts are written to standard output only.
    """
    topics = ["books", "camera", "dvd", "health", "music", "software"]

    def tally(classes):
        # One pass over the labels per topic, keeping the order of `topics`.
        return [sum(1 for cls in classes if cls == topic) for topic in topics]

    train_counts = tally(trainClass)
    test_counts = tally(testClass)

    sections = (
        ("Distribution of classes in Training Set:", train_counts),
        ("\nDistribution of classes in Testing Set:", test_counts),
    )
    for heading, counts in sections:
        print(heading)
        print(topics)
        print(counts)


In [12]:
def identity(x):
    """Return ``x`` unchanged (a pass-through placeholder).

    ``tf_idf_func`` in this file passes it as both the ``preprocessor`` and
    the ``tokenizer`` of the vectorizer it builds.
    """
    return x


# Tokenizer callable built on the NLTK WordNet lemmatizer
class LemmaTokenizer:
    """Callable that tokenizes a text and lemmatizes every token.

    Calling an instance with a document runs ``word_tokenize`` on it and
    returns the list of ``WordNetLemmatizer.lemmatize`` results, in token
    order.
    """

    def __init__(self):
        # One lemmatizer per tokenizer instance, reused across calls.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemmatized tokens of ``doc`` as a list."""
        lemmas = []
        for token in word_tokenize(doc):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas


In [14]:
def tf_idf_func(tfidf, stopwords_bn):
    """Create the vectorizer that turns token lists into feature vectors.

    Parameters
    ----------
    tfidf : bool
        If truthy a ``TfidfVectorizer`` is built, otherwise a
        ``CountVectorizer``.
    stopwords_bn
        Not used by the current configuration. An earlier variant passed it
        as ``stop_words`` together with ``ngram_range=(1, 3)``.

    Returns
    -------
    TfidfVectorizer or CountVectorizer
        A new vectorizer instance.
    """
    # The texts are already preprocessed and tokenized, so the pass-through
    # `identity` function stands in for both the preprocessor and tokenizer.
    # Note from the original author: using a lemmatizer doesn't improve
    # performance.
    vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer

    return vectorizer_cls(preprocessor=identity, tokenizer=identity)


In [16]:
def NB_classifier(trainDoc, trainClass, testDoc, testClass, stopwords_bn, tfIdf, use_sentiment):
    """Train a Multinomial Naive Bayes pipeline, predict the test set and report on it.

    The function builds a vectorizer with ``tf_idf_func``, chains it with
    ``MultinomialNB`` in a ``Pipeline``, fits the pipeline on the training
    data, predicts labels for the test documents, and prints a banner plus
    the wall-clock training and prediction times. Evaluation is delegated to
    ``calculate_measures`` and ``calculate_probabilities``, which are defined
    outside this section (their behaviour is not visible here).

    Parameters
    ----------
    trainDoc, trainClass
        Training documents and their class labels, passed to ``fit``.
    testDoc, testClass
        Test documents (passed to ``predict``) and their class labels
        (passed on to the evaluation helpers).
    stopwords_bn
        Forwarded unchanged to ``tf_idf_func``.
    tfIdf
        Truthy selects the "TfidfVectorizer" variant, falsy the
        "CountVectorizer" one; also used for the printed banner.
    use_sentiment
        Only affects the banner text here ("Sentiment Class" vs
        "Topic Class").

    NOTE(review): no ``return`` statement appears in the visible body.
    """

    # tf_idf_func picks TfidfVectorizer (tfIdf truthy) or CountVectorizer (tfIdf falsy)
    vec = tf_idf_func(tfIdf, stopwords_bn)

    # combine the vectorizer with a Naive Bayes classifier
    classifier = Pipeline( [('vec', vec),
                            ('cls', MultinomialNB())] )

    t0 = time.time()
    # Fit/train the Multinomial Naive Bayes pipeline on trainDoc and trainClass.
    # trainDoc holds the training documents, trainClass their class labels.
    classifier.fit(trainDoc, trainClass)

    # wall-clock seconds spent in fit()
    train_time = time.time() - t0

    t1 = time.time()
    # Predict a class for every document in the test set testDoc
    # and keep the predicted labels in testGuess.
    testGuess = classifier.predict(testDoc)

    # wall-clock seconds spent in predict()
    test_time = time.time() - t1

    # Human-readable name of the label set, used only in the banner below
    classType = "Topic Class"
    if use_sentiment:
        classType = "Sentiment Class"

    # Human-readable name of the vectorizer variant, used only in the banner below
    tfIDF_type = "TfidfVectorizer" if(tfIdf) else "CountVectorizer"     # conditional expression

    print("\n########### Naive Bayes Classifier For ", classType, " (", tfIDF_type, ") ###########")

    # Evaluation helper defined outside this section; receives the fitted
    # pipeline, the true test labels and the predicted labels.
    calculate_measures(classifier, testClass, testGuess)

    # Disabled: 10-fold cross-validation score (cv = number of folds).
    # NOTE(review): as written it would cross-validate on the test split.
    # print("Cross Validation:\n", cross_val_score(classifier, testDoc, testClass, cv=10))
    print()
    print("Training Time: ", train_time)
    print("Testing Time: ", test_time)

    # Helper defined outside this section; receives the fitted pipeline and
    # the test and training labels (in that order).
    calculate_probabilities(classifier, testClass, trainClass)