# Text Classification Using scikit-learn

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
import seaborn as sns
# Apply seaborn's default plot styling to any figures drawn later.
sns.set()

# Download (or load from the local cache) the 20 Newsgroups corpus.
data = fetch_20newsgroups()

In [4]:
# Newsgroup names; a predicted label is used below as an index into this list.
categories = data.target_names
categories

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [5]:
# Load the dataset's predefined train and test subsets, keeping every category.
train = fetch_20newsgroups(subset='train', categories=categories)
test = fetch_20newsgroups(subset='test', categories=categories)



In [8]:
# Number of documents in the training subset.
len(train.data)

11314

In [9]:
# Number of documents in the test subset.
len(test.data)

7532

In [11]:
# Number of test labels; this should match the number of test documents.
len(test.target)

7532

# Building the Model

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [21]:
# Chain TF-IDF feature extraction into a multinomial naive Bayes classifier,
# so raw text can be passed straight to fit() and predict().
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [22]:
# Fit the vectorizer and the classifier on the training documents and labels.
model.fit(train.data, train.target)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('multinomialnb', MultinomialNB())])

In [23]:
# Predicted class index for every document in the test subset.
labels = model.predict(test.data)

In [26]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
print(confusion_matrix(test.target, labels))

[[166   0   0   1   0   1   0   0   1   1   1   3   0   6   3 123   4   8
    0   1]
 [  1 252  15  12   9  18   1   2   1   5   2  41   4   0   6  15   4   1
    0   0]
 [  0  14 258  45   3   9   0   2   1   3   2  25   1   0   6  23   2   0
    0   0]
 [  0   5  11 305  17   1   3   6   1   0   2  19  13   0   5   3   1   0
    0   0]
 [  0   3   8  23 298   0   3   8   1   3   1  16   8   0   2   8   3   0
    0   0]
 [  1  21  17  13   2 298   1   0   1   1   0  23   0   1   4  10   2   0
    0   0]
 [  0   1   3  31  12   1 271  19   4   4   6   5  12   6   3   9   3   0
    0   0]
 [  0   1   0   3   0   0   4 364   3   2   2   4   1   1   3   3   4   0
    1   0]
 [  0   0   0   1   0   0   2  10 371   0   0   4   0   0   0   8   2   0
    0   0]
 [  0   0   0   0   1   0   0   4   0 357  22   0   0   0   2   9   1   1
    0   0]
 [  0   0   0   0   0   0   0   1   0   4 387   1   0   0   1   5   0   0
    0   0]
 [  0   2   1   0   0   1   1   3   0   0   0 383   1   0   0   3

In [29]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the test subset.
print(classification_report(test.target, labels))

              precision    recall  f1-score   support

           0       0.80      0.52      0.63       319
           1       0.81      0.65      0.72       389
           2       0.82      0.65      0.73       394
           3       0.67      0.78      0.72       392
           4       0.86      0.77      0.81       385
           5       0.89      0.75      0.82       395
           6       0.93      0.69      0.80       390
           7       0.85      0.92      0.88       396
           8       0.94      0.93      0.93       398
           9       0.92      0.90      0.91       397
          10       0.89      0.97      0.93       399
          11       0.59      0.97      0.74       396
          12       0.84      0.60      0.70       393
          13       0.92      0.74      0.82       396
          14       0.84      0.89      0.87       394
          15       0.44      0.98      0.61       398
          16       0.64      0.94      0.76       364
          17       0.93    

# Prediction Results

In [38]:
# Classify one ad-hoc sentence and map the predicted index to its newsgroup name.
ans = model.predict(['Narendra Modi is the best Prime Minister of World'])
categories[ans[0]]

'talk.politics.mideast'

In [39]:
# Classify one ad-hoc sentence and map the predicted index to its newsgroup name.
ans = model.predict(['Bmw is the best car brand'])
categories[ans[0]]

'rec.autos'

In [44]:
# Classify one ad-hoc sentence and map the predicted index to its newsgroup name.
ans = model.predict(['Which game you play in Games Period'])
categories[ans[0]]

'rec.sport.hockey'

In [45]:
# Classify one ad-hoc sentence and map the predicted index to its newsgroup name.
ans = model.predict(['Nasa is about to find a way to settle in Mars'])
categories[ans[0]]

'sci.space'

In [47]:
# Classify one ad-hoc sentence and map the predicted index to its newsgroup name.
ans = model.predict(['In our Country there is a lot of debate on Hindu and Muslim wars'])
categories[ans[0]]

'talk.politics.mideast'

In [51]:
# Classify one ad-hoc sentence and map the predicted index to its newsgroup name.
ans = model.predict(['Absence of Doctors'])
categories[ans[0]]

'sci.med'

### Exporting the model

In [52]:
import pickle

In [53]:
# Serialize the fitted pipeline to disk so it can be reused without retraining.
with open('Text_Classification_model', 'wb') as f:
    pickle.dump(model, f)
    

In [None]:
# Reload the saved pipeline from disk into `model1`.
# The file must be opened in binary *read* mode ('rb'): opening it with 'wb'
# truncates the model that was just saved and makes pickle.load() raise
# io.UnsupportedOperation because the handle is not readable.
# NOTE: pickle.load() can execute arbitrary code; only unpickle files you
# created yourself or otherwise trust.
with open('Text_Classification_model', 'rb') as f:
    model1 = pickle.load(f)