# Text classification
### Dataset used - 20newsgroups

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups, making this a 4-class problem.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the train split first, then the test split, with identical settings;
# unpacking the generator binds them to news_train and news_test in that order.
news_train, news_test = (
    fetch_20newsgroups(subset=split, categories=categories, shuffle=True)
    for split in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
INFO:sklearn.datasets.twenty_newsgroups:Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)
INFO:sklearn.datasets.twenty_newsgroups:Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
# Show the label names of the loaded training subset. Note that the displayed
# order differs from the order of the `categories` list passed to the loader.
news_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

##### CountVectorizer is used for word counts
##### It assigns a unique integer index to each word; the vocabulary (features) is learned by CountVectorizer.fit()
##### We can see those features with get_feature_names() (get_feature_names_out() in scikit-learn 1.0 and later)
##### CountVectorizer.transform() counts the number of occurrences of each word (encoding) and returns the feature matrix

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Build a vectorizer with default settings, then learn the vocabulary from the
# training documents and encode them as a document-term count matrix in one call.
count_vect = CountVectorizer()
x_train_tf = count_vect.fit_transform(news_train.data)
# Displays (n_documents, n_vocabulary_terms). Despite the `_tf` suffix these are
# raw word counts; TF-IDF weighting is applied in a later cell.
x_train_tf.shape

(2257, 35788)

##### Term Frequency (TF) - tells how often a given word appears within a document
##### Inverse Document Frequency (IDF) - downscales words that appear in many documents (e.g. the, is, this, etc.)
##### The IDF weights are computed from the count matrix obtained from CountVectorizer

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the IDF weights on the training count matrix and re-weight that matrix
# to TF-IDF. The fitted transformer is kept so the test data can be transformed
# with the same weights later.
tfidf_trans = TfidfTransformer()
x_train_tfidf = tfidf_trans.fit_transform(x_train_tf)
# Re-weighting leaves the matrix dimensions unchanged.
x_train_tfidf.shape

(2257, 35788)

#### Naive Bayes Classifier

In [7]:
from sklearn.naive_bayes import MultinomialNB
# Create the multinomial Naive Bayes estimator, then train it on the TF-IDF
# features against the labels in news_train.target.
model = MultinomialNB()
# fit() returns the estimator itself; the trailing ';' keeps its repr out of
# the cell output.
model.fit(x_train_tfidf, news_train.target);

In [8]:
# Encode the test documents with the vocabulary and IDF weights learned from
# the training set (transform only — nothing is re-fitted on the test data).
x_test_tf = count_vect.transform(news_test.data)
x_test_tfidf = tfidf_trans.transform(x_test_tf)
# One predicted class label per test document.
predicted = model.predict(x_test_tfidf)

In [11]:
from sklearn import metrics
from sklearn.metrics import accuracy_score

# Overall fraction of test documents whose predicted label matches the true one.
print('Accuracy: ', metrics.accuracy_score(news_test.target, predicted))

# Per-class precision / recall / F1, labelled with the newsgroup names.
print(
    metrics.classification_report(
        news_test.target,
        predicted,
        target_names=news_test.target_names,
    )
)

# Last expression of the cell, so the matrix is shown as the cell's result.
# Rows are the true classes and columns the predicted classes.
metrics.confusion_matrix(news_test.target, predicted)

Accuracy:  0.834886817577
                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

             micro avg       0.83      0.83      0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502



array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]], dtype=int64)