# Importing Libraries

In [36]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Data

In [2]:
categories = ['rec.motorcycles', 'sci.electronics',
              'comp.graphics', 'sci.med']

# sklearn provides us with subset data for training and testing
train_data = fetch_20newsgroups(subset='train',
                                categories=categories, shuffle=True, random_state=42)

print(train_data.target_names)

print("\n".join(train_data.data[0].split("\n")[:3]))
print(train_data.target_names[train_data.target[0]])

# Let's look at categories of our first ten training data
for t in train_data.target[:10]:
    print(train_data.target_names[t])

['comp.graphics', 'rec.motorcycles', 'sci.electronics', 'sci.med']
From: kreyling@lds.loral.com (Ed Kreyling 6966)
Subject: Sun-os and 8bit ASCII graphics
Organization: Loral Data Systems
comp.graphics
comp.graphics
comp.graphics
rec.motorcycles
comp.graphics
sci.med
sci.electronics
sci.electronics
comp.graphics
rec.motorcycles
sci.electronics


# Converting text data into numerical data

In [6]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_data.data)

# transform a count matrix to a normalized tf-idf representation (tf-idf transformer)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# KNN Implementation

In [7]:
knn = KNeighborsClassifier(n_neighbors=7)

clf = knn.fit(X_train_tfidf, train_data.target)

docs_new = ['I have a Harley Davidson and Yamaha.', 'I have a GTX 1050 GPU']

X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [8]:
predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, train_data.target_names[category]))

'I have a Harley Davidson and Yamaha.' => rec.motorcycles
'I have a GTX 1050 GPU' => sci.med


# pipelining KNN

In [13]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', knn),
])
# Fitting our train data to the pipeline
text_clf.fit(train_data.data, train_data.target)

# Test data 
test_data = fetch_20newsgroups(subset='test',
                               categories=categories, shuffle=True, random_state=42)
docs_test = test_data.data
# Predicting our test data
predicted = text_clf.predict(docs_test)
print('We got an accuracy of',np.mean(predicted == test_data.target)*100, '% over the test data.')

We got an accuracy of 82.67766497461929 % over the test data.


# Naive Bayes Implementation


In [24]:
count_vect1 = CountVectorizer()
X_train_counts1 = count_vect.fit_transform(train_data.data)

# transform a count matrix to a normalized tf-idf representation (tf-idf transformer)
tfidf_transformer1 = TfidfTransformer()
X_train_tfidf1 = tfidf_transformer1.fit_transform(X_train_counts1).toarray()

In [28]:
gnb = GaussianNB()
glf = gnb.fit(X_train_tfidf1.toarray(), train_data.target)

docs_new = ['I have a Harley Davidson and Yamaha.', 'I have a GTX 1050 GPU']

X_new_counts1 = count_vect.transform(docs_new)
X_new_tfidf1 = tfidf_transformer1.transform(X_new_counts1).toarray()

In [29]:
predicted1 = gnb.predict(X_new_tfidf1)

for doc, category in zip(docs_new, predicted1):
    print('%r => %s' % (doc, train_data.target_names[category]))

'I have a Harley Davidson and Yamaha.' => rec.motorcycles
'I have a GTX 1050 GPU' => sci.med


In [40]:
text_clf1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('glf', gnb),
])
# Fitting our train data to the pipeline
text_clf1.fit(train_data.data, train_data.target)

# Test data 
test_data1 = fetch_20newsgroups(subset='test',
                               categories=categories, shuffle=True, random_state=42)
docs_test = test_data1.data
# Predicting our test data
predicted1 = text_clf.predict(docs_test)
print('We got an accuracy of',np.mean(predicted1 == test_data.target)*100, '% over the test data.')

We got an accuracy of 79.75838 % over the test data.


# Multinomial Naive Bayes Implementation

In [37]:
clfa = MultinomialNB()
c = clfa.fit(X_train_tfidf, train_data.target)

docs_new = ['I have a Harley Davidson and Yamaha.', 'I have a GTX 1050 GPU']

X_new_counts2 = count_vect.transform(docs_new)
X_new_tfidf2 = tfidf_transformer.transform(X_new_counts2)

In [39]:
predicted2 = gnb.predict(X_new_tfidf2.toarray())

for doc, category in zip(docs_new, predicted2):
    print('%r => %s' % (doc, train_data.target_names[category]))

'I have a Harley Davidson and Yamaha.' => rec.motorcycles
'I have a GTX 1050 GPU' => sci.med


In [42]:
text_clf2 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('c', clfa),
])
# Fitting our train data to the pipeline
text_clf2.fit(train_data.data, train_data.target)

# Test data 
test_data2 = fetch_20newsgroups(subset='test',
                               categories=categories, shuffle=True, random_state=42)
docs_test = test_data2.data
# Predicting our test data
predicted2 = text_clf.predict(docs_test)
print('We got an accuracy of',np.mean(predicted2 == test_data.target)*100, '% over the test data.')

We got an accuracy of 82.46782 % over the test data.
