# Text-Classification with NLTK

## Loading Data

In [1]:
import nltk
import random
import pickle
from nltk.corpus import movie_reviews
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True

In [0]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append([movie_reviews.words(fileid), category])

In [0]:
random.shuffle(documents)

In [4]:
print(documents)



In [5]:
print(documents[0])

[['note', 'to', 'screenwriters', 'and', 'self', ':', ...], 'neg']


In [6]:
print(documents[0][0])
print(documents[0][1])

['note', 'to', 'screenwriters', 'and', 'self', ':', ...]
neg


In [0]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

In [8]:
print(len(all_words))

1583820


In [9]:
all_words_freq = nltk.FreqDist(all_words)
print(all_words_freq.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [10]:
print(all_words_freq['good'])
print(all_words_freq['bad'])

2411
1395


## Training Data

In [11]:
print(documents)



In [0]:
word_features = list(all_words_freq.keys())
word_features = word_features[:3000]

In [13]:
print(word_features)



In [0]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        if(w in words):
            features[w] = True
        else:
            features[w] = False

    return features


In [15]:
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))



In [0]:
feature_sets = []
for document, category in documents:
    # print(document)
    # print(rev)
    # break

    features = find_features(document)
    feature_sets.append([features, category])

In [17]:
print(feature_sets[0])



In [0]:
train_set = feature_sets[:1900]
test_set = feature_sets[1900:] 

# NLTK Baseline

In [0]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [20]:
print("Naive Bayes Algo accuracy: %0.2f " %(nltk.classify.accuracy(classifier, train_set) * 100))

Naive Bayes Algo accuracy: 89.95 


In [21]:
classifier.show_most_informative_features(15)

Most Informative Features
                   sucks = True              neg : pos    =      9.8 : 1.0
                 frances = True              pos : neg    =      9.0 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
              schumacher = True              neg : pos    =      7.4 : 1.0
               atrocious = True              neg : pos    =      7.0 : 1.0
                 martian = True              neg : pos    =      7.0 : 1.0
                    nest = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
                obstacle = True              pos : neg    =      6.3 : 1.0
                  shoddy = True              neg : pos    =      6.3 : 1.0
                  suvari = True              neg : pos    =      6.3 : 1.0

# Saving the Classifier

In [22]:
print(classifier)

<nltk.classify.naivebayes.NaiveBayesClassifier object at 0x7f4acd7a5fd0>


In [0]:
 with open ("naivebayes.pickle", "wb") as f:
     pickle.dump(classifier, f)

In [0]:
with open ("naivebayes.pickle", "rb") as f:
    classifier_l2 = pickle.load(f)

# Using Scikit-Learn API

In [0]:
from nltk.classify.scikitlearn import SklearnClassifier

In [0]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

In [28]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(train_set)
print("Test Accuracy of MNB Classifier: %0.2f " %(nltk.classify.accuracy(MNB_classifier, test_set)*100))

Test Accuracy of MNB Classifier: 79.00 


In [0]:
# GNB_classifier = SklearnClassifier(GaussianNB())
# GNB_classifier.train(train_set)
# print("Test Accuracy of MNB Classifier: %0.2f " %(nltk.classify.accuracy(GNB_classifier, test_set)*100))

In [30]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_set)
print("Test Accuracy of MNB Classifier: %0.2f " %(nltk.classify.accuracy(LogisticRegression_classifier, test_set)*100))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Test Accuracy of MNB Classifier: 84.00 


In [31]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(train_set)
print("Test Accuracy of MNB Classifier: %0.2f " %(nltk.classify.accuracy(SVC_classifier, test_set)*100))

Test Accuracy of MNB Classifier: 80.00 


In [32]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(train_set)
print("Test Accuracy of MNB Classifier: %0.2f " %(nltk.classify.accuracy(SGDClassifier_classifier, test_set)*100))

Test Accuracy of MNB Classifier: 79.00 


# Combining Algorithms with NLTK

In [0]:
from nltk.classify import ClassifierI
from statistics import mode

In [0]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for cfr in self._classifiers:
            v = cfr.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for cfr in self._classifiers:
            v = cfr.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes) 
        return conf


In [42]:
voted_classifier = VoteClassifier(classifier, MNB_classifier, SVC_classifier, SGDClassifier_classifier, LogisticRegression_classifier)
print("Votes classifier accuracy = %0.2f "%(nltk.classify.accuracy(voted_classifier, train_set) * 100))
print("Classification: ", voted_classifier.classify(test_set[0][0]))
print("Confidence", voted_classifier.confidence(test_set[0][0]))

Votes classifier accuracy = 98.74 
Classification:  pos
Confidence 1.0
