# Classification of 20 newsgroups dataset 

http://scikit-learn.org/stable/datasets/twenty_newsgroups.html

## Prepare text data

In [108]:
import numpy as np
import pandas as pd

In [109]:
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'talk.politics.guns', 'sci.space', 'talk.politics.misc']
newsgroups_train = fetch_20newsgroups(subset='train', 
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', 
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)
from pprint import pprint
pprint(list(newsgroups_train.target_names)) 

['alt.atheism',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.misc',
 'talk.religion.misc']


In [110]:
newsgroups_train.filenames.shape

(2461,)

In [111]:
newsgroups_train.target.shape

(2461,)

In [112]:
newsgroups_train.target[:10] #  The target attribute is the integer index of the category

array([2, 3, 3, 1, 4, 0, 2, 2, 1, 2])

## Prepare feature vectors

In [113]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english', max_df=0.9, min_df=0.01)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=0.01,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [114]:
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors

<2461x1465 sparse matrix of type '<class 'numpy.int64'>'
	with 93788 stored elements in Compressed Sparse Row format>

In [115]:
analyze = vectorizer.build_analyzer()
analyze("This is a text document to analyze.") == (['text', 'document', 'analyze'])

True

In [116]:
len(vectorizer.get_feature_names())

1465

In [117]:
vectorizer.get_feature_names()[500:510]

['father',
 'fault',
 'fbi',
 'fear',
 'federal',
 'feds',
 'feel',
 'feet',
 'field',
 'fight']

In [118]:
vectorizer.vocabulary_.get('attitude')

144

In [119]:
feature_names = np.array(vectorizer.get_feature_names())
sparse_array_of_features = vectorizer.transform(['Something completely new.'])
feature_names[sparse_array_of_features.tocoo().col]

array(['completely', 'new'], 
      dtype='<U15')

In [120]:
vectors.nnz / float(vectors.shape[0])

38.10971149939049

## Define classification vars and tools

In [121]:
dense_vectors = vectors.toarray()
num_classes = len(newsgroups_train.target_names)

test_vectors = vectorizer.transform(newsgroups_test.data)
test_dense_vectors = test_vectors.toarray()
test_targets = newsgroups_test.target

In [122]:
newsgroups_test.target.shape

(1638,)

In [175]:
from sklearn.metrics import confusion_matrix

def measure_quality(model, samples, targets):
    cmatrix = confusion_matrix(y_pred=model.predict(samples), y_true=newsgroups_train.target)
    precision = sum(cmatrix.diagonal()) / cmatrix.sum()
    return precision, cmatrix

In [176]:
def try_classifer(model, data, targets, target_names):
    model.fit(data, targets)
    precision, cmatrix = measure_quality(model, data, targets)
    print("Precision: {}%".format(precision))
    results_df = pd.DataFrame(cmatrix, columns=target_names, index=target_names)
    print("Real class (row) -> Predicted class (column)")
    results_df

## Try Naive Bayes classifier

In [181]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
try_classifer(clf, dense_vectors, newsgroups_train.target, newsgroups_train.target_names)

Precision: 0.6493295408370581%
Real class (row) -> Predicted class (column)
                    alt.atheism  sci.space  talk.politics.guns  \
alt.atheism                 322          4                   0   
sci.space                    31        447                   0   
talk.politics.guns           46         29                 292   
talk.politics.misc           77         42                  29   
talk.religion.misc           16          4                   0   

                    talk.politics.misc  talk.religion.misc  
alt.atheism                          1                 153  
sci.space                            1                 114  
talk.politics.guns                   5                 174  
talk.politics.misc                 182                 135  
talk.religion.misc                   2                 355  


In [179]:
try_classifer(clf, test_dense_vectors, newsgroups_test.target, newsgroups_test.target_names)

ValueError: Found input variables with inconsistent numbers of samples: [2461, 1638]

In [134]:
precision, confusion_matrix = measure_quality(dense_vectors, newsgroups_train.target)
print("Train data precision: {}%".format(precision))
confusion_matrix

Train data precision: 65%


[[322, 5, 2, 4, 157],
 [31, 448, 2, 4, 118],
 [46, 30, 294, 8, 178],
 [77, 43, 31, 185, 139],
 [16, 5, 2, 5, 359]]

### Test on train data

In [143]:
target_names = newsgroups_train.target_names
results_df = pd.DataFrame(cmatrix, columns=target_names, index=target_names)
print("Real class (row) -> Predicted class (column)")
results_df

Real class (row) -> Predicted class (column)


Unnamed: 0,alt.atheism,sci.space,talk.politics.guns,talk.politics.misc,talk.religion.misc
alt.atheism,322,4,0,1,153
sci.space,31,447,0,1,114
talk.politics.guns,46,29,292,5,174
talk.politics.misc,77,42,29,182,135
talk.religion.misc,16,4,0,2,355


### Test on test data

In [144]:
precision, prediction_matrix = measure_quality(test_dense_vectors, newsgroups_test.target)
print("Test data precision: {}%".format(precision))

Test data precision: 48%


In [146]:
target_names = newsgroups_train.target_names
results_df = pd.DataFrame(cmatrix, columns=target_names, index=target_names)
print("Real class (row) -> Predicted class (column)")
results_df

Real class (row) -> Predicted class (column)


Unnamed: 0,alt.atheism,sci.space,talk.politics.guns,talk.politics.misc,talk.religion.misc
alt.atheism,322,4,0,1,153
sci.space,31,447,0,1,114
talk.politics.guns,46,29,292,5,174
talk.politics.misc,77,42,29,182,135
talk.religion.misc,16,4,0,2,355


## Try SVM classifier

In [148]:
from sklearn.preprocessing import scale

normalized_vectors = scale(dense_vectors)



In [156]:
from sklearn.svm import SVC
clf = SVC(decision_function_shape='ovo')
clf.fit(normalized_vectors, newsgroups_train.target)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovo', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [160]:
precision, confusion_matrix = measure_quality(clf, normalized_vectors, newsgroups_train.target)
print("Train data precision: {}%".format(precision))

Train data precision: 0.7200325071109305%


### Test on train data

In [161]:
target_names = newsgroups_train.target_names
results_df = pd.DataFrame(confusion_matrix, columns=target_names, index=target_names)
print("Real class (row) -> Predicted class (column)")
results_df

Real class (row) -> Predicted class (column)


Unnamed: 0,alt.atheism,sci.space,talk.politics.guns,talk.politics.misc,talk.religion.misc
alt.atheism,327,150,3,0,0
sci.space,0,593,0,0,0
talk.politics.guns,0,150,395,1,0
talk.politics.misc,8,159,8,290,0
talk.religion.misc,36,146,27,1,167


### Test on test data

In [104]:
precision, prediction_matrix = measure_quality(clf, test_dense_vectors, newsgroups_test.target)
print("Test data precision: {}%".format(precision))

Test data precision: 38%


In [105]:
target_names = newsgroups_train.target_names
results_df = pd.DataFrame(prediction_matrix, columns=target_names, index=target_names)
print("Real class (row) -> Predicted class (column)")
results_df

Real class (row) -> Predicted class (column)


Unnamed: 0,alt.atheism,sci.space,talk.politics.guns,talk.politics.misc,talk.religion.misc
alt.atheism,0,222,7,4,11
sci.space,1,1,5,6,4
talk.politics.guns,0,259,2,5,5
talk.politics.misc,3,215,59,3,4
talk.religion.misc,33,186,18,6,4
