# Newsgroups
## 1. word frequency

Step 1: Import Packages

In [1]:
import numpy as np
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

Step 2: Prepare Data

In [2]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print('target_names:', twenty_train.target_names)
print('training_dataset size:', len(twenty_train.data))

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

# See the contents
# for article in twenty_train.data[:3]:
#     myindex = twenty_train.data.index(article)
#     print("\n*** Article #{} Label: {} ***\n\n".format(myindex, \
#                                                        twenty_train.target_names[twenty_train.target[myindex]]))
#     print(article)
    
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(twenty_train.data)
print('count vector shape:', x_train_counts.shape)
y_train_counts = twenty_train.target
# print('x_train_counts[0]', x_train_counts[0])

x_test_counts = count_vect.transform(twenty_test.data) # get counts on the words in twenty_test data
print('x_train_counts.shape:', x_train_counts.shape)
y_test_counts = twenty_test.target

target_names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
training_dataset size: 11314
count vector shape: (11314, 130107)
x_train_counts.shape: (11314, 130107)


In [3]:
# X_train_counts[0].data – 
# X_train_counts[0].indptr – Tells us where the article starts and ends. In our case this
# would be 0 and 89, because there are 89 words in this article.
# X_train_counts[0].indices – Tells us which words were actually used.

A = x_train_counts[0]
print(A.data) # The actual word count
print(A.indptr) # Tells us where the article starts and ends. In our case this
# would be 0 and 89, because there are 89 words in this article.
print(A.indices[A.indptr[0]:A.indptr[1]]) # Tells us which words were actually used.

# for index in A.indices[A.indptr[0]:A.indptr[1]]:
#     print(count_vect.get_feature_names()[index]) # a list of integer
    
print(count_vect.get_feature_names()[A.indices[88]])

[1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 2 1 1 1 2 1 1 6 1 2 1 1 1 1 1 2 2 1 4 1 1 1 1 1 3 1 1 1 1
 1 1 5 3 5 1 1 1 1 2 2 2 2 2 3]
[ 0 89]
[ 86580 128420  35983  35187  66098 114428  78955  94362  76722  57308
  62221 128402  67156 123989  90252  63363  78784  96144 128026 109271
  51730  86001  83256 113986  37565  73201  27436  34181 101378 106116
  35612  56989  26073  66608 108252  99822 123796  48620  34995  37433
  18299  50111  16574  74693  32311 115475  76718 109581  48618  68766
  45295  90686 114455 104813  89860  80638  51793  42876 114579  90774
  28615  65798 124931 123292   4605  76032  92081  40998  79666  89362
 118983  90379  98949  64095  95162  87620 114731  68532  37780 123984
 111322 114688  85354 124031  50527 118280 123162  75358  56979]
from


Step 3: Build Model

In [4]:
mn = MultinomialNB()

Step 4: Train Model


In [5]:
mn.fit(x_train_counts, y_train_counts)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Step 5: Evaluate Model

In [6]:
y_predicted = mn.predict(x_test_counts)
score = accuracy_score(y_predicted, y_test_counts)
print(score)

0.7728359001593202


## 2. TF-IDF

Step 1: Import Packages

In [7]:
# same

Step 2: Prepare Data

In [8]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print('target_names:', twenty_train.target_names)
print('training_dataset size:', len(twenty_train.data))

twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts) # we use counts to calculate their tfidf
y_train_tfidf = twenty_train.target

x_test_tfidf = tfidf_transformer.transform(x_test_counts)
y_test_tfidf = twenty_test.target

target_names: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
training_dataset size: 11314


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Step 3: Build Model

In [9]:
mn_tfidf = MultinomialNB()

Step 4: Train Model


In [10]:
mn_tfidf.fit(x_train_tfidf, y_train_tfidf)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Step 5: Evaluate Model

In [11]:
y_predicted = mn_tfidf.predict(x_test_tfidf)
score = accuracy_score(y_predicted, y_test_tfidf)
print(score)

0.7738980350504514


## 3. TF-IDF without stopwords

Step 1: Import Packages

In [12]:
# same

Step 2: Prepare Data

In [13]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

from sklearn.feature_extraction.text import TfidfTransformer
count_vect_nostopwords = CountVectorizer(stop_words='english')

x_train_counts_nostopwords = count_vect_nostopwords.fit_transform(twenty_train.data)
x_test_counts_nostopwords = count_vect_nostopwords.transform(twenty_test.data) # get counts on the words in twenty_test data

tfidf_transformer = TfidfTransformer()
x_train_tfidf_nostopwords = tfidf_transformer.fit_transform(x_train_counts_nostopwords) # we use counts to calculate their tfidf
y_train_tfidf_nostopwords = twenty_train.target

x_test_tfidf_nostopwords = tfidf_transformer.transform(x_test_counts_nostopwords)
y_test_tfidf_nostopwords = twenty_test.target

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Step 3: Build Model

In [14]:
mn_tfidf_nostopwords = MultinomialNB()

Step 4: Train Model


In [15]:
mn_tfidf_nostopwords.fit(x_train_tfidf_nostopwords, y_train_tfidf_nostopwords)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Step 5: Evaluate Model

In [16]:
y_predicted = mn_tfidf_nostopwords.predict(x_test_tfidf_nostopwords)
score = accuracy_score(y_predicted, y_test_tfidf_nostopwords)
print(score)

0.8169144981412639


## 4. SVM

In [17]:
from sklearn.linear_model import SGDClassifier
svm = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)
svm.fit(x_train_tfidf_nostopwords, y_train_tfidf_nostopwords)
y_predicted = svm.predict(x_test_tfidf_nostopwords)
score = accuracy_score(y_predicted, y_test_tfidf_nostopwords)
print(score)

0.8224907063197026
