# Load the news dataset

In [16]:
from sklearn.datasets import fetch_20newsgroups

In [17]:
# Newsgroup names used as the `categories` filter when fetching the dataset.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [18]:
# Fetch the train and test splits, restricted to the selected `categories`.
# Both calls spell out the same shuffle settings so the two splits are loaded
# consistently and reproducibly.
twenty_train = fetch_20newsgroups(
    subset='train', categories=categories, shuffle=True, random_state=42
)
twenty_test = fetch_20newsgroups(
    subset='test', categories=categories, shuffle=True, random_state=42
)


In [19]:
# Short aliases for the raw documents and their integer targets, per split.
train = twenty_train.data
train_labels = twenty_train.target
test = twenty_test.data
test_labels = twenty_test.target

In [20]:
# Report how many training documents were loaded.
n_train_docs = len(twenty_train.data)
print("No of rows:", n_train_docs)

No of rows: 2257


In [21]:
# Preview the first three lines of the first training document and its label.
first_doc_lines = twenty_train.data[0].split("\n")[:3]
print(*first_doc_lines, sep="\n")

first_label_name = twenty_train.target_names[twenty_train.target[0]]
print("Label:", first_label_name)

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Label: comp.graphics


# Convert Text to numbers
CountVectorizer: converts a collection of text documents to a matrix of token counts. <br>
By calling `count_vect.fit_transform(train)` (where `train` is `twenty_train.data`), we learn the vocabulary dictionary and get back a document-term matrix of shape `[n_samples, n_features]`.


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training documents and build the
# document-term count matrix in one step.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=train)

# Last expression of the cell: display (n_documents, n_vocabulary_terms).
X_train_counts.shape

(2257, 35788)

# TFIDF
TF: Just counting the number of words in each document has one issue: it gives more weight to longer documents than to shorter ones. To avoid this, we can use term frequencies (TF), i.e. count(word) / total words, computed for each document.

TF-IDF: Finally, we can also reduce the weight of very common words (the, is, an, etc.) that occur in nearly all documents. This is called TF-IDF, i.e. term frequency times inverse document frequency.

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the TF-IDF weighting on the training counts and re-weight them.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

# Last expression of the cell: display the shape of the re-weighted matrix.
X_train_tfidf.shape

(2257, 35788)

# Train the model

In [24]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the TF-IDF features.
# The fit result is assigned to `clf` so the cell produces no displayed output.
nb_estimator = MultinomialNB()
clf = nb_estimator.fit(X_train_tfidf, train_labels)

# Manual text prediction

In [25]:
# Two hand-written documents to classify with the trained model.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

# Reuse the vectorizer and TF-IDF transformer fitted on the training data
# (transform only, no re-fitting) so the feature columns line up.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Map each predicted target index back to its newsgroup name.
label_names = twenty_train.target_names
for text, label_idx in zip(docs_new, predicted):
    print('%r => %s' % (text, label_names[label_idx]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


# Accuracy testing of the Naive Bayes model

In [26]:
# Imports for this cell; the metric functions are also used by the later
# SGD evaluation cell, which does not import them itself.
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score

# Fit inside this cell so it always scores Naive Bayes, even when `clf`
# has been rebound to another model by a different cell.
clf = MultinomialNB().fit(X_train_tfidf, train_labels)

# Transform the test documents with the vectorizer/transformer fitted on the
# training data. Dedicated `X_test_*` names are used so the manual-example
# features stored in `X_new_counts` / `X_new_tfidf` are not overwritten.
X_test_counts = count_vect.transform(test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

y_pred = clf.predict(X_test_tfidf)

# confusion_matrix is called as (true labels, predicted labels).
cm = confusion_matrix(test_labels, y_pred)
ac = accuracy_score(test_labels, y_pred)
print(ac)
print(cm)

0.8348868175765646
[[192   2   6 119]
 [  2 347   4  36]
 [  2  11 322  61]
 [  2   2   1 393]]


# Trying other algorithms

Let's try SGDClassifier <br><br>
SGDClassifier is a linear classifier (by default in sklearn it is a linear SVM) that uses SGD for training (that is, it looks for the minimum of the loss using stochastic gradient descent). According to the documentation, SGDClassifier implements linear classifiers (SVM, logistic regression, among others) with SGD training.

SGDClassifier supports multi-class classification by combining multiple binary classifiers in a “one versus all” (OVA) scheme. For each of the classes, a binary classifier is learned that discriminates between that class and all other classes.

In [27]:
from sklearn.linear_model import SGDClassifier

In [28]:
# Linear classifier trained with stochastic gradient descent.
# loss='hinge' selects the linear-SVM objective, with an L2 penalty of
# strength alpha. max_iter=5 with tol=None runs a fixed five passes over the
# data (no tolerance-based stopping); random_state pins the shuffling.
sgd_model = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)

# Train on the same `train_labels` alias used for the Naive Bayes fit
# (it refers to twenty_train.target). Note: this rebinds `clf`, so later
# cells that call clf.predict use the SGD model.
clf = sgd_model.fit(X_train_tfidf, train_labels)


In [29]:

# Build test-set features with the already-fitted vectorizer and TF-IDF
# transformer, then score whatever model `clf` currently refers to.
X_new_counts = count_vect.transform(test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
y_pred = clf.predict(X_new_tfidf)

# Both metrics are called as (true labels, predicted labels).
ac = accuracy_score(test_labels, y_pred)
cm = confusion_matrix(test_labels, y_pred)

# Accuracy first, then the confusion matrix, each on its own line.
print(ac, cm, sep="\n")

0.9101198402130493
[[256  11  16  36]
 [  4 380   3   2]
 [  5  35 353   3]
 [  5  11   4 378]]


Note: You can check out other models to try at https://scikit-learn.org/stable/supervised_learning.html