In [18]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import metrics
import numpy as np

In [19]:
# Load the training split of the 20 newsgroups dataset. No `categories`
# filter is passed here, so all newsgroups are present in `target_names`.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

# List the newsgroup names with 1-based positions. enumerate() replaces the
# hand-maintained counter, so the numbering cannot drift from the iteration.
for position, cat in enumerate(twenty_train.target_names, start=1):
    print("Category[%d]:" % position, cat)

Category[1]: alt.atheism
Category[2]: comp.graphics
Category[3]: comp.os.ms-windows.misc
Category[4]: comp.sys.ibm.pc.hardware
Category[5]: comp.sys.mac.hardware
Category[6]: comp.windows.x
Category[7]: misc.forsale
Category[8]: rec.autos
Category[9]: rec.motorcycles
Category[10]: rec.sport.baseball
Category[11]: rec.sport.hockey
Category[12]: sci.crypt
Category[13]: sci.electronics
Category[14]: sci.med
Category[15]: sci.space
Category[16]: soc.religion.christian
Category[17]: talk.politics.guns
Category[18]: talk.politics.mideast
Category[19]: talk.politics.misc
Category[20]: talk.religion.misc


In [20]:
# Narrow the experiment to four newsgroups and reload the training split
# with only those categories (this rebinds `twenty_train`).
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [21]:
# Text-classification pipeline: raw documents -> token counts -> TF-IDF
# weighting -> multinomial naive Bayes classifier.
text_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('transformer', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [22]:
# Train the whole pipeline on the four-category training split.
text_pipeline.fit(twenty_train.data, twenty_train.target)

# Sanity check on two hand-written sentences: predict() returns integer class
# ids, which are mapped back to newsgroup names through `target_names`.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
predictions = text_pipeline.predict(docs_new)
for doc, class_id in zip(docs_new, predictions):
    newsgroup = twenty_train.target_names[class_id]
    print('{0} => {1}'.format(doc, newsgroup))

God is love => soc.religion.christian
OpenGL on the GPU is fast => comp.graphics


In [23]:
# Load the test split for the same four categories and classify every
# document in it (this rebinds `predictions`).
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predictions = text_pipeline.predict(docs_test)

# Fraction of test documents whose predicted class equals the true class:
# the mean of the element-wise comparison.
correct_mask = predictions == twenty_test.target
mean = np.mean(correct_mask)
print(f'\nMean: {mean}\n\n')


Mean: 0.8348868175765646




In [24]:
# Per-class precision / recall / F1 on the test split, with rows labelled by
# newsgroup name rather than by integer class id.
print(
    metrics.classification_report(
        twenty_test.target,
        predictions,
        target_names=twenty_test.target_names,
    )
)


                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

              accuracy                           0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502



In [27]:
# Quick look at the raw training data: number of documents, then the full
# text of the first one (message headers included).
training_docs = twenty_train.data
print("\nLength of training data is", len(training_docs))
print("\nThe content/data of first file is:\n")
print(training_docs[0])

# To preview more documents, loop over training_docs[:10] and print each one.


Length of training data is 2257

The content/data of first file is:

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.

