# Classification of 20 newsgroups dataset 

http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

## Prepare text data

In [35]:
import numpy as np
import pandas as pd

In [72]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'talk.politics.guns', 'sci.space', 'talk.politics.misc']
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', 
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)
from pprint import pprint
pprint(list(newsgroups_train.target_names)) 

['alt.atheism',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.misc',
 'talk.religion.misc']


In [2]:
newsgroups_train.filenames.shape

(2461,)

In [3]:
newsgroups_train.target.shape

(2461,)

In [4]:
newsgroups_train.target[:10] #  The target attribute is the integer index of the category

array([2, 3, 3, 1, 4, 0, 2, 2, 1, 2])

## Prepare feature vectors

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=0.01)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [88]:
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors

<2461x1465 sparse matrix of type '<class 'numpy.int64'>'
	with 93788 stored elements in Compressed Sparse Row format>

In [7]:
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (['text', 'document', 'analyze'])

True

In [8]:
len(vectorizer.get_feature_names())

1465

In [9]:
vectorizer.get_feature_names()[500:510]

['father',
 'fault',
 'fbi',
 'fear',
 'federal',
 'feds',
 'feel',
 'feet',
 'field',
 'fight']

In [10]:
vectorizer.vocabulary_.get('attitude')

144

In [73]:
feature_names = np.array(vectorizer.get_feature_names())
sparse_array_of_features = vectorizer.transform(['Something completely new.'])
feature_names[sparse_array_of_features.tocoo().col]

array(['completely', 'new'], 
      dtype='<U15')

In [74]:
vectors.nnz / float(vectors.shape[0])

38.10971149939049

## Define classification vars and tools

In [89]:
dense_vectors = vectors.toarray()
num_classes = len(newsgroups_train.target_names)

test_vectors = vectorizer.transform(newsgroups_test.data)
test_dense_vectors = test_vectors.toarray()
test_targets = newsgroups_test.target

In [91]:
newsgroups_test.target.shape

(1638,)

In [92]:
def measure_quality(samples, targets):
    num_samples = targets.shape[0]
    errors = 0
    prediction_matrix = [[class_num2 for class_num2 in range(num_classes)] for class_num1 in range(num_classes)]
    for index, sample in enumerate(samples):
        predicted_class = clf.predict(sample.reshape(1, -1))[0]
        real_class = targets[index]
        if predicted_class != real_class:
            errors +=1 
            prediction_matrix[real_class][predicted_class] += 1
    precision = int(round((1 - (errors / num_samples)) * 100))
    return precision, prediction_matrix

## Try Naive Bayes classifier

In [93]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(dense_vectors, newsgroups_train.target)

GaussianNB(priors=None)

In [94]:
precision, prediction_matrix = measure_quality(dense_vectors, newsgroups_train.target)
print("Train data precision: {}%".format(precision))

Train data precision: 65%


### Test on train data

In [95]:
target_names = newsgroups_train.target_names
results_df = pd.DataFrame(prediction_matrix, columns=target_names, index=target_names)
print("Real class (row) -> Predicted class (column)")
results_df

Real class (row) -> Predicted class (column)


Unnamed: 0,alt.atheism,sci.space,talk.politics.guns,talk.politics.misc,talk.religion.misc
alt.atheism,0,5,2,4,157
sci.space,31,1,2,4,118
talk.politics.guns,46,30,2,8,178
talk.politics.misc,77,43,31,3,139
talk.religion.misc,16,5,2,5,4


### Test on test data

In [96]:
precision, prediction_matrix = measure_quality(test_dense_vectors, newsgroups_test.target)
print("Test data precision: {}%".format(precision))

Test data precision: 48%


In [97]:
target_names = newsgroups_train.target_names
results_df = pd.DataFrame(prediction_matrix, columns=target_names, index=target_names)
print("Real class (row) -> Predicted class (column)")
results_df

Real class (row) -> Predicted class (column)


Unnamed: 0,alt.atheism,sci.space,talk.politics.guns,talk.politics.misc,talk.religion.misc
alt.atheism,0,9,8,12,147
sci.space,23,1,7,15,79
talk.politics.guns,26,21,2,17,172
talk.politics.misc,44,24,63,3,132
talk.religion.misc,55,11,20,10,4


## Try SVM classifier

In [101]:
from sklearn.svm import SVC
clf = SVC()
clf.fit(dense_vectors, newsgroups_train.target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [102]:
precision, prediction_matrix = measure_quality(dense_vectors, newsgroups_train.target)
print("Train data precision: {}%".format(precision))

Train data precision: 43%


### Test on train data

In [103]:
target_names = newsgroups_train.target_names
results_df = pd.DataFrame(prediction_matrix, columns=target_names, index=target_names)
print("Real class (row) -> Predicted class (column)")
results_df

Real class (row) -> Predicted class (column)


Unnamed: 0,alt.atheism,sci.space,talk.politics.guns,talk.politics.misc,talk.religion.misc
alt.atheism,0,349,10,3,4
sci.space,0,1,3,4,4
talk.politics.guns,0,332,2,4,4
talk.politics.misc,5,334,44,3,4
talk.religion.misc,45,264,16,3,4


### Test on test data

In [104]:
precision, prediction_matrix = measure_quality(test_dense_vectors, newsgroups_test.target)
print("Test data precision: {}%".format(precision))

Test data precision: 38%


In [105]:
target_names = newsgroups_train.target_names
results_df = pd.DataFrame(prediction_matrix, columns=target_names, index=target_names)
print("Real class (row) -> Predicted class (column)")
results_df

Real class (row) -> Predicted class (column)


Unnamed: 0,alt.atheism,sci.space,talk.politics.guns,talk.politics.misc,talk.religion.misc
alt.atheism,0,222,7,4,11
sci.space,1,1,5,6,4
talk.politics.guns,0,259,2,5,5
talk.politics.misc,3,215,59,3,4
talk.religion.misc,33,186,18,6,4
