In [32]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

RANDOM_SEED = 42

import time

# Smoke-test the wall-clock timing pattern reused by the training cells below:
# take a start timestamp, then report the seconds elapsed since it.
start = time.time()
elapsed = time.time() - start
print(f'Time: {elapsed}')

Time: 3.9577484130859375e-05


In [33]:
# Category labels kept for this experiment; training rows carrying any other
# label are filtered out further down.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]


In [34]:
# Locations of the pre-split train/test CSV files, relative to the notebook's
# working directory.
TRAIN_CSV = "dataset/new_train.csv"
TEST_CSV = "dataset/test.csv"

train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [35]:
# Keep only the training rows whose label is one of the selected categories.
keep_mask = train_df['categories'].isin(categories)
train_df = train_df.loc[keep_mask]

In [40]:
# Report how many training rows remain after the category filter.
print(f"Train dataset length: {train_df.shape[0]}")

Train dataset length: 2104


In [37]:
from sklearn.model_selection import train_test_split

# The data is loaded from separate train/test CSV files, so no random hold-out
# is carved out here. To evaluate on a hold-out taken from the training file
# instead, use:
# train_feature, test_features, train_labels, test_labels = train_test_split(
#     train_df['text'], train_df['categories'], test_size=0.30, random_state=RANDOM_SEED)

train_feature, train_labels = train_df['text'], train_df['categories']

# Test labels must come from the same frame as the test features. Taking them
# from train_df pairs predictions for test rows with unrelated training labels
# (and makes the metric calls fail when the two frames differ in length).
# NOTE(review): assumes test.csv has a 'categories' column like the training
# file, and that it only contains the selected categories — confirm.
test_features, test_labels = test_df['text'], test_df['categories']

## Feature Engineering

### We will convert a collection of text documents to a matrix of token counts

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the training text only and encode that text as a
# document-by-token count matrix. The fitted `count_vect` is reused later to
# encode the test text.
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_feature)
# Displayed as the cell output: (number of training documents, number of tokens).
train_counts.shape

(2104, 37363)

In [39]:
# Sanity check: the label vector should have one entry per row of train_counts.
train_labels.shape

(2104,)

In [41]:
from sklearn.feature_extraction.text import TfidfTransformer
# Fit the TF-IDF weighting on the training counts and re-weight them; the
# fitted `tfidf_transformer` is reused later for the test counts.
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

In [42]:
# Encode the test text with the already-fitted vectorizer and TF-IDF
# transformer. Only `transform` is called (no refitting), so the test features
# use the same columns and weighting as the training features.
test_counts = count_vect.transform(test_features)
test_tfidf = tfidf_transformer.transform(test_counts)

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the TF-IDF features. The timer covers
# both constructing and fitting the estimator.
start = time.time()
mnb_clf = MultinomialNB()
mnb_clf.fit(train_tfidf, train_labels)
elapsed = time.time() - start
print(f'Time: {elapsed}')

Time: 0.01473093032836914


In [23]:
# Score the Naive Bayes model on the test features.
mnb_predicted = mnb_clf.predict(test_tfidf)
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the number
# is the same either way, but this order matches the other model cells and stays
# correct if the metric is swapped for an asymmetric one (precision, recall,
# classification_report).
print(accuracy_score(test_labels, mnb_predicted))

0.9077757685352622


In [24]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier. `degree` only applies to the 'poly'
# kernel and `gamma` only to the 'rbf'/'poly'/'sigmoid' kernels, so neither
# affects a linear kernel; they are left out so the cell does not suggest they
# were tuned.
SVM = SVC(C=1.0, kernel='linear')

# Time the fit only.
start = time.time()
SVM.fit(train_tfidf, train_labels)
print(f'Time: {time.time() - start}')

# Score on the test features.
svm_predicted = SVM.predict(test_tfidf)
print(accuracy_score(test_labels, svm_predicted))

Time: 2.9731314182281494
0.9620253164556962


In [25]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the TF-IDF features.
# - random_state=RANDOM_SEED makes the bootstrap samples and feature choices
#   reproducible across Restart & Run All.
# - warm_start is dropped: it only matters when fit() is called repeatedly on
#   the same estimator, and this cell fits once.
# - oob_score=True also computes an out-of-bag accuracy estimate, available
#   afterwards as rf_clf.oob_score_.
# Model-specific names (rf_clf / rf_predicted) keep this model available after
# the later MLP cell assigns `clf` and `predicted`.
start = time.time()
rf_clf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=RANDOM_SEED,
).fit(train_tfidf, train_labels)
print(f'Time: {time.time() - start}')

rf_predicted = rf_clf.predict(test_tfidf)

print(accuracy_score(test_labels, rf_predicted))

Time: 1.6352012157440186
0.8933092224231465


In [26]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron on the TF-IDF features, with a fixed seed
# (random_state=1) and training capped at max_iter=5 iterations.
# NOTE(review): such a low cap probably stops before convergence — confirm it
# is a deliberate time budget.
# The timer covers both constructing and fitting the estimator.
start = time.time()
clf = MLPClassifier(max_iter=5, random_state=1)
clf.fit(train_tfidf, train_labels)
elapsed = time.time() - start
print(f'Time: {elapsed}')

Time: 6.757193565368652




In [27]:
# Score the most recently fitted `clf` on the test features.
predicted = clf.predict(test_tfidf)
test_accuracy = accuracy_score(test_labels, predicted)

print(test_accuracy)

0.9584086799276673
