In [1]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [2]:
# Import Dataset
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Newsgroup categories the classifier should learn to distinguish
categories = [
    'rec.motorcycles',
    'sci.electronics',
    'comp.graphics',
    'sci.med',
]

In [4]:
# Echo the list to confirm which categories were selected
categories

['rec.motorcycles', 'sci.electronics', 'comp.graphics', 'sci.med']

In [5]:
# Load the predefined 'train' split of the 20 newsgroups dataset, restricted to the
# selected categories. Shuffling with a fixed random_state keeps the document order
# reproducible between runs.
train_data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [7]:
# Inspect the container type returned by fetch_20newsgroups
type(train_data)

sklearn.utils.Bunch

In [8]:
# Category names as stored on the dataset. Note that this order is not the order used
# in `categories`; the integer targets are indices into this list, so always look names
# up through train_data.target_names rather than through `categories`.
print(train_data.target_names)

['comp.graphics', 'rec.motorcycles', 'sci.electronics', 'sci.med']


In [13]:
# Show the category name of each of the first ten training documents
for label_id in train_data.target[:10]:
    category_name = train_data.target_names[label_id]
    print(category_name)

comp.graphics
comp.graphics
rec.motorcycles
comp.graphics
sci.med
sci.electronics
sci.electronics
comp.graphics
rec.motorcycles
sci.electronics


In [30]:
# Report how many documents the training split contains
n_train_docs = len(train_data.data)
print("There are {0} documents present in the training dataset".format(n_train_docs))

There are 2367 documents present in the training dataset


In [16]:
# Preview the first three lines of the first document, followed by its category name
first_doc_lines = train_data.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))
first_doc_label = train_data.target[0]
print(train_data.target_names[first_doc_label])

From: kreyling@lds.loral.com (Ed Kreyling 6966)
Subject: Sun-os and 8bit ASCII graphics
Organization: Loral Data Systems
comp.graphics


In [35]:
# CountVectorizer builds a dictionary of features (the vocabulary) from the training
# documents and converts the text into a matrix of token counts.
# fit_transform learns the vocabulary and returns the counts for the training documents
# in a single call; the fitted `count_vect` is reused later for new documents.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data.data)

In [39]:
# Report the vocabulary size learned by the vectorizer.
# `vocabulary_` is the fitted term -> column-index mapping, so its length is the number
# of features. It is used instead of get_feature_names(), which was deprecated in
# scikit-learn 1.0 and removed in 1.2 (this cell would raise AttributeError there).
print("There are {0} words/features present".format(len(count_vect.vocabulary_)))

There are 35653 words/features present


In [42]:
# Print the container type and the shape of the count matrix.
# The shape is (number of training documents, number of vocabulary features).
print(type(X_train_counts))
print(X_train_counts.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(2367, 35653)


In [43]:
# Transform the count matrix into a normalized tf-idf representation.
# fit_transform fits the transformer on the training counts and returns the transformed
# matrix; the fitted `tfidf_transformer` is reused later for new documents.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [45]:
# k-nearest-neighbours classifier configured to use 7 neighbours
knn = KNeighborsClassifier(n_neighbors=7)

# Train the classifier on the tf-idf features. train_data.target holds the integer
# label of each training document (an index into train_data.target_names).
# NOTE(review): fit() is expected to return the estimator itself, so `clf` and `knn`
# presumably refer to the same object — confirm before mutating either one.
clf = knn.fit(X_train_tfidf, train_data.target)

In [46]:
# New, unseen sentences whose category we want the classifier to predict
docs_new = [
    'I have a Harley Davidson and Yamaha.',
    'I have a GTX 1050 GPU',
]

In [47]:
# Build the count feature vectors for the new documents using the vectorizer that was
# already fitted on the training data (transform only, no re-fitting).
X_new_counts = count_vect.transform(docs_new)

In [48]:
# Apply the tf-idf weighting to the new counts. transform is called instead of
# fit_transform because the transformer has already been fit on the training counts.
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [49]:
# Predict the category of each new document. The result holds integer labels, which
# are mapped to names through train_data.target_names when printed.
predicted = clf.predict(X_new_tfidf)

In [50]:
# Print each input document next to the name of its predicted category
for text, label_id in zip(docs_new, predicted):
    predicted_name = train_data.target_names[label_id]
    print('%r => %s' % (text, predicted_name))

'I have a Harley Davidson and Yamaha.' => rec.motorcycles
'I have a GTX 1050 GPU' => sci.med


In [None]:
# A Pipeline chains vectorizer -> transformer -> classifier into one compound estimator
# that accepts raw text directly.
# The classifier step is a fresh, unfitted KNeighborsClassifier built from knn's
# hyperparameters rather than the `knn` object itself: Pipeline does not copy its steps,
# so embedding `knn` would make fitting the pipeline refit the already-trained
# `knn`/`clf` in place.
# The pipeline still has to be fitted (e.g. on train_data.data / train_data.target)
# before it can predict, because its vectorizer and tf-idf steps are new instances.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(**knn.get_params())),
])