# Text classification

***Null Space Research***

# Text Classification with Sklearn

In [1]:
# We work with the '20newsgroups' dataset, a collection of 20000 news documents.
# Every document is labelled with one of 20 categories; to keep things simple
# we restrict ourselves to 4 of them.
#
# A Naive Bayes algorithm is trained on the existing labels and then used to
# classify new documents into these classes.

categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [2]:
# Load only the training documents that belong to the four categories listed in `categories`.
# The fixed random_state makes the shuffled order repeatable.

from sklearn.datasets import fetch_20newsgroups

twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:
# the 'target_names' attribute (not a method) lists the class labels that were loaded

twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [4]:
# number of documents (files) in the loaded training subset

len(twenty_train.data)

2257

In [5]:
# the class labels are stored as integer codes; show the first ten of them

twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [6]:
# each integer code is an index into 'target_names';
# print the category name behind each of the first ten labels

for label_id in twenty_train.target[:10]:
    print(twenty_train.target_names[label_id])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [7]:
# turn the textual data into numerical feature vectors
# CountVectorizer learns a vocabulary from the documents and counts word occurrences;
# the resulting matrix has one row per document and one column per vocabulary term

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 35788)

In [8]:
# 'vocabulary_' maps each term to its column index in the count matrix;
# here we look up the column index of the word 'algorithm'
# (the vocabulary size itself would be len(count_vect.vocabulary_))

count_vect.vocabulary_.get(u'algorithm')

4690

In [9]:
# next we re-weight the raw counts with TF-IDF
# TfidfTransformer works on the count matrix produced above (it is not a separate
# vectorizer), so the shape of the result is unchanged

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [10]:
# train a multinomial Naive Bayes classifier on the TF-IDF features and the integer labels

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [12]:
# before the model can classify new documents/sentences they must be vectorized
# with the already fitted vectorizer and TF-IDF transformer
# (transform only - calling fit again would change the vocabulary the model was trained on)

docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

In [13]:
# show each new document next to the name of the category predicted for it
for doc, category in zip(docs_new, predicted):
    label = twenty_train.target_names[category]
    print('%r => %s' % (doc, label))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


# Text Classification with NLTK

In [15]:
# Build one (word_list, category) pair for every review in the movie_reviews corpus.
# The list is shuffled because the train/test split below simply slices it.
# A dedicated generator with a fixed seed keeps the shuffle - and therefore the
# split and the reported accuracy - reproducible after a kernel restart, without
# touching the global random state. Recorded outputs from an unseeded run may differ.

import random
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.Random(42).shuffle(documents)

In [17]:
# A word feature records whether a document contains a given word.
# Only the 2000 most frequent words of the whole corpus are used as features.
# most_common() states the frequency ordering explicitly instead of relying on
# the order in which FreqDist happens to iterate over its keys.

import nltk
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [word for word, _count in all_words.most_common(2000)]

In [18]:
def document_features(document, vocabulary=None):
    """Describe a document by which vocabulary words it contains.

    Args:
        document: iterable of word tokens that make up one document.
        vocabulary: iterable of words to test for. When omitted, the
            notebook-level ``word_features`` list is used.

    Returns:
        dict mapping ``'contains(<word>)'`` to ``True`` or ``False`` for
        every word in the vocabulary, in vocabulary order.
    """
    if vocabulary is None:
        # Looked up at call time so the function follows the current word_features.
        vocabulary = word_features
    # A set gives constant-time membership tests for each vocabulary word.
    document_words = set(document)
    return {'contains(%s)' % word: (word in document_words) for word in vocabulary}

In [28]:
# preview the first 20 features extracted for the review file 'pos/cv957_8737.txt'
list(document_features(movie_reviews.words('pos/cv957_8737.txt')).items())[:20]

[('contains(,)', True),
 ('contains(the)', True),
 ('contains(.)', True),
 ('contains(a)', True),
 ('contains(and)', True),
 ('contains(of)', True),
 ('contains(to)', True),
 ("contains(')", True),
 ('contains(is)', True),
 ('contains(in)', True),
 ('contains(s)', True),
 ('contains(")', True),
 ('contains(it)', True),
 ('contains(that)', True),
 ('contains(-)', True),
 ('contains())', True),
 ('contains(()', True),
 ('contains(as)', True),
 ('contains(with)', True),
 ('contains(for)', True)]

In [24]:
# Convert every (words, category) pair into (feature_dict, category),
# hold out the first 100 entries for testing and train Naive Bayes on the rest.
featuresets = [(document_features(words), label) for words, label in documents]
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [25]:
# accuracy of the classifier on the 100 held-out feature sets
print(nltk.classify.accuracy(classifier, test_set))

0.84


In [26]:
# list the 5 features the classifier found most informative
classifier.show_most_informative_features(5)

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.0 : 1.0
         contains(mulan) = True              pos : neg    =      9.0 : 1.0
        contains(seagal) = True              neg : pos    =      6.6 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.6 : 1.0
         contains(damon) = True              pos : neg    =      6.3 : 1.0


In [None]:
# how to read the output above:

# the feature contains(mulan) = True occurs about 9 times more often in positive reviews
# than in negative ones, so a review that mentions 'mulan' is far more likely to be positive