# Twenty Newsgroups Analysis

Author: Paul Sheridan

Goal: Run a multinomial Naive Bayes classifier on the Twenty Newsgroups dataset using TF, TF-IDF, and hypergeometric-test-derived features, respectively.

## Load modules

In [6]:
!pip install scikit-learn
import sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
import numpy as np
from scipy.sparse import csr_matrix
from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin

Collecting scikit-learn
  Downloading scikit_learn-1.2.2-cp311-cp311-macosx_12_0_arm64.whl (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hCollecting numpy>=1.17.3
  Downloading numpy-1.24.3-cp311-cp311-macosx_11_0_arm64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scipy>=1.3.2
  Downloading scipy-1.10.1-cp311-cp311-macosx_12_0_arm64.whl (28.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.7/28.7 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting joblib>=1.1.1
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, numpy, joblib, scipy, scikit-learn
Successfully installed job

## Custom TF-IDF Transformer

In [7]:
class CanonicalTfidfTransformer(BaseEstimator, TransformerMixin):
    """Weight a term-count matrix by ``count * log(d / document_frequency)``.

    ``d`` is the number of rows (documents) of the matrix handed to
    ``transform`` and the document frequency of a term is the number of rows
    in which its count is nonzero.

    NOTE(review): ``fit`` learns nothing, so the IDF weights are recomputed
    from whatever matrix is passed to ``transform`` (e.g. separately for the
    training and the test matrix inside a ``Pipeline``). Confirm this is the
    intended definition before relying on train/test comparability.
    """

    def fit(self, X, y=None):
        """No-op fit; present only to satisfy the transformer interface."""
        return self

    def transform(self, X, y=None):
        """Return the TF-IDF weighted matrix as a float32 CSR matrix.

        Parameters
        ----------
        X : sparse matrix (or array-like) of shape (d, m)
            Term counts, one row per document and one column per term.
        y : ignored

        Returns
        -------
        scipy.sparse.csr_matrix of shape (d, m), dtype float32
            Entries whose weight is zero are not stored.
        """
        # Work on a private canonical CSR copy so that the caller's matrix is
        # never modified and duplicate entries are merged before weighting.
        counts = sparse.csr_matrix(X, copy=True)
        counts.sum_duplicates()
        d, m = counts.shape

        # Document frequency: number of rows with a nonzero count per column.
        doc_freq = np.bincount(counts.nonzero()[1], minlength=m).astype(np.float64)
        # Columns that never occur get a frequency of 1 to avoid dividing by
        # zero; their counts are all zero, so the weight is irrelevant.
        doc_freq[doc_freq == 0] = 1
        idf = np.log(d / doc_freq)

        # Scale only the stored entries instead of materialising a dense
        # (d, m) array. The product is formed in float64 and then narrowed to
        # float32, matching the dtype of the returned matrix.
        weighted = (counts.data * idf[counts.indices]).astype(np.float32)
        tfidf = sparse.csr_matrix(
            (weighted, counts.indices, counts.indptr), shape=(d, m)
        )
        # Terms present in every document have idf == 0; drop those explicit
        # zeros so the sparsity pattern only holds nonzero weights.
        tfidf.eliminate_zeros()
        return tfidf

## Custom Hypergeometric Test Transformer

In [8]:
from scipy.stats import hypergeom
import itertools

class HypergeomTransformer(BaseEstimator, TransformerMixin):
    """Score each nonzero term count by a hypergeometric tail probability.

    For a document/term pair with count ``x`` the score is
    ``-log(max(P(X >= x), 1e-250))`` where ``X`` follows a hypergeometric
    distribution with population size equal to the sum of all counts, number
    of successes equal to the term's column sum, and number of draws equal to
    the document's row sum. All totals are taken from the matrix passed to
    ``transform``; ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """No-op fit; present only to satisfy the transformer interface."""
        return self

    def transform(self, X, y=None):
        """Return the matrix of hypergeometric test scores.

        Parameters
        ----------
        X : sparse matrix (or array-like) of shape (d, m)
            Term counts, one row per document and one column per term.
        y : ignored

        Returns
        -------
        scipy.sparse.csr_matrix of shape (d, m)
            ``-log`` tail probabilities at the positions where ``X`` is
            nonzero. The shape always matches ``X``, including when trailing
            rows or columns contain no nonzero entry.

        Raises
        ------
        ValueError
            If a tail probability evaluates to infinity.
        """
        min_tail_prob = 1e-250   # floor that keeps -log() finite
        chunk_size = 250000      # entries scored per progress message

        # Private canonical copy: merged duplicates and no explicit zeros, so
        # the stored entries are exactly the nonzero counts and the caller's
        # matrix is left untouched.
        X = sparse.csr_matrix(X, copy=True)
        X.sum_duplicates()
        X.eliminate_zeros()

        d, m = X.shape
        print('Corpus size:', d, '\n')
        print('Vocabulary size:', m, '\n')
        print('Nonzero element count:', X.count_nonzero(), '\n')

        # Base statistics: grand total, per-document totals, per-term totals.
        N = X.sum()
        n = np.asarray(X.sum(axis=1)).ravel()
        K = np.asarray(X.sum(axis=0)).ravel()

        coo = X.tocoo()
        doc_ids, term_ids, counts = coo.row, coo.col, coo.data
        total = counts.shape[0]

        # Score the entries chunk by chunk: one array-valued call per chunk
        # replaces a scalar scipy call plus a sparse element lookup per entry,
        # while the progress messages stay the same as a per-entry counter
        # reporting every `chunk_size` entries.
        hgeom_arr = np.empty(total)
        for start in range(0, total, chunk_size):
            print('count =', start)
            stop = start + chunk_size
            tail_prob = hypergeom.sf(
                k=counts[start:stop] - 1,
                M=N,
                n=K[term_ids[start:stop]],
                N=n[doc_ids[start:stop]],
            )
            bad = np.flatnonzero(np.isinf(tail_prob))
            if bad.size:
                # Raise instead of calling exit(), which would shut down the
                # whole interpreter / notebook kernel.
                where = start + bad[0]
                raise ValueError(
                    f'Infinity at doc_id {doc_ids[where]} '
                    f'and term_id {term_ids[where]}'
                )
            hgeom_arr[start:stop] = -np.log(np.maximum(tail_prob, min_tail_prob))

        # An explicit shape keeps all d rows and m columns even when the last
        # documents or terms have no nonzero entry, and also covers the case
        # of a matrix with no nonzero entries at all.
        hgeom_coo = sparse.coo_matrix(
            (hgeom_arr, (doc_ids, term_ids)), shape=(d, m)
        )
        return hgeom_coo.tocsr()

## Prepare the data

In [2]:
# Fetch the predefined train/test splits of the Twenty Newsgroups corpus.
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Raw documents are the features; integer newsgroup ids are the labels.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

## TF Analysis

In [None]:
# Term-frequency features only (IDF weighting switched off).
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.85      0.24      0.37       319
           1       0.71      0.60      0.65       389
           2       0.79      0.65      0.71       394
           3       0.63      0.75      0.69       392
           4       0.86      0.68      0.76       385
           5       0.88      0.68      0.77       395
           6       0.90      0.72      0.80       390
           7       0.71      0.92      0.80       396
           8       0.84      0.91      0.87       398
           9       0.86      0.85      0.86       397
          10       0.90      0.93      0.91       399
          11       0.52      0.96      0.67       396
          12       0.78      0.52      0.63       393
          13       0.82      0.76      0.79       396
          14       0.83      0.81      0.82       394
          15       0.34      0.98      0.51       398
          16       0.66      0.80      0.73       364
          17       0.96    

## TF-IDF Analysis

In [None]:
# scikit-learn's TF-IDF weighting with IDF smoothing turned off.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(smooth_idf=False)),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.80      0.53      0.64       319
           1       0.81      0.65      0.72       389
           2       0.82      0.65      0.72       394
           3       0.67      0.78      0.72       392
           4       0.86      0.77      0.81       385
           5       0.89      0.75      0.82       395
           6       0.93      0.68      0.79       390
           7       0.85      0.92      0.88       396
           8       0.93      0.93      0.93       398
           9       0.92      0.90      0.91       397
          10       0.89      0.97      0.93       399
          11       0.59      0.97      0.73       396
          12       0.84      0.60      0.70       393
          13       0.92      0.73      0.82       396
          14       0.84      0.89      0.87       394
          15       0.44      0.98      0.61       398
          16       0.64      0.93      0.76       364
          17       0.93    

## Canonical TF-IDF Analysis

In [None]:
# Custom CanonicalTfidfTransformer features feeding the same classifier.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', CanonicalTfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.81      0.82      0.82       319
           1       0.62      0.77      0.69       389
           2       0.74      0.04      0.08       394
           3       0.53      0.78      0.63       392
           4       0.73      0.85      0.79       385
           5       0.78      0.76      0.77       395
           6       0.80      0.76      0.78       390
           7       0.87      0.92      0.89       396
           8       0.93      0.96      0.94       398
           9       0.95      0.94      0.94       397
          10       0.96      0.97      0.96       399
          11       0.87      0.92      0.89       396
          12       0.77      0.76      0.76       393
          13       0.90      0.82      0.86       396
          14       0.87      0.90      0.89       394
          15       0.86      0.94      0.90       398
          16       0.82      0.90      0.85       364
          17       0.96    

## Hypergeometric Test of Statistical Significance Analysis

In [None]:
# Custom HypergeomTransformer features feeding the same classifier.
# The step keeps the name 'tfidf' so the pipeline layout matches the others.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', HypergeomTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(X_train, y_train)

# Evaluate on the held-out test split.
predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

Corpus size: 11314 

Vocabulary size: 130107 

Nonzero element count: 1787565 

count = 0
count = 250000
count = 500000
count = 750000
count = 1000000
count = 1250000
count = 1500000
count = 1750000
Corpus size: 7532 

Vocabulary size: 130107 

Nonzero element count: 1107956 

count = 0
count = 250000
count = 500000
count = 750000
count = 1000000
              precision    recall  f1-score   support

           0       0.80      0.81      0.81       319
           1       0.65      0.77      0.70       389
           2       0.88      0.23      0.36       394
           3       0.56      0.78      0.65       392
           4       0.76      0.85      0.80       385
           5       0.81      0.76      0.79       395
           6       0.80      0.78      0.79       390
           7       0.87      0.92      0.89       396
           8       0.93      0.96      0.94       398
           9       0.95      0.94      0.95       397
          10       0.96      0.97      0.96       399
  