In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups



In [22]:
categories = [
            'alt.atheism',
            'soc.religion.christian',
            'comp.graphics',
            'sci.med'
            ]


In [23]:
train_data = fetch_20newsgroups(
        subset='train',
        categories=categories,
        shuffle=True,
        random_state=42
        )

test_data = fetch_20newsgroups(
        subset='test',
        categories=categories,
        shuffle=True,
        random_state=42
        )

train_df = pd.DataFrame(
        {'Text':train_data.data, 'Category':train_data.target})
test_df = pd.DataFrame(
        {'Text':train_data.data, 'Category':train_data.target})



In [24]:
class_names = train_df['Category'].unique()
print(class_names)

[1 3 2 0]


In [25]:
print(f"Train Length: {len(train_df)}")

Train Length: 2257


In [26]:
print(f"Test Length: {len(test_df)}")

Test Length: 2257


In [27]:
full_df = train_df.append(test_df)
print(f"Full dataset length: {len(full_df)}")

Full dataset length: 4514


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_df['Text'])
X_train_counts.shape

(2257, 35788)

In [43]:
count_vect.vocabulary_.get(u'20')

1341

In [30]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [31]:
from sklearn.naive_bayes import MultinomialNB
mnb_clf = MultinomialNB().fit(X_train_tfidf, train_df['Category'])

In [32]:
# docs_new = ['the stocks are going up', 'mobile phones are growing fast']
docs_new = test_df['Text'].to_list()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

mnb_predicted = mnb_clf.predict(X_new_tfidf)

In [33]:
# print(predicted[:5])
# print(test_df['Category'].to_list()[:5])

In [34]:
from sklearn.metrics import classification_report
print(accuracy_score(mnb_predicted, test_df['Category'].to_list()))

0.9636685866194062


In [35]:
from sklearn import svm

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_tfidf,train_df['Category'])
svm_predicted = SVM.predict(X_new_tfidf)
print(accuracy_score(test_df['Category'].to_list(), svm_predicted))

0.999113867966327


In [36]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1, max_iter=20).fit(X_train_tfidf, train_df['Category'])

In [38]:
predicted = clf.predict(X_new_tfidf)
print(accuracy_score(test_df['Category'].to_list(), predicted))

1.0
