In [1]:
from pprint import pprint
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [3]:
categories = [
  'sci.electronics',
    'comp.graphics',
    'soc.religion.christian'
]

In [4]:
dataset = fetch_20newsgroups(subset='train', categories=categories)
len(dataset.data)

1774

In [5]:
#defining training and test data
my_train = fetch_20newsgroups(subset='train', categories=categories)
my_test = fetch_20newsgroups(subset='test', categories=categories)

In [6]:
X_train = my_train.data
y_train = my_train.target

X_test = my_test.data
y_test = my_test.target

In [7]:
results = []
classifiers = {
    'MultinomialNB': MultinomialNB(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'DecisionTree': DecisionTreeClassifier(),
}

Countvectorizer Extractor

In [8]:
vect = CountVectorizer()
X_train_counts = vect.fit_transform(X_train)
x_test_counts = vect.transform(X_test)


tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
x_test_tfidf = tfidf_transformer.transform(x_test_counts)

for name, classifier in classifiers.items():
    classifier.fit(x_train_tfidf, y_train)
    y_pred = classifier.predict(x_test_tfidf)

    accuracy =  accuracy_score(y_test, y_pred)
    results.append({'Feature Extractor':'CountVectorizer', 'Algorithm': name, 'Accuracy': accuracy})

pprint(results)

[{'Accuracy': 0.8957627118644068,
  'Algorithm': 'MultinomialNB',
  'Feature Extractor': 'CountVectorizer'},
 {'Accuracy': 0.9271186440677966,
  'Algorithm': 'LogisticRegression',
  'Feature Extractor': 'CountVectorizer'},
 {'Accuracy': 0.923728813559322,
  'Algorithm': 'SVC',
  'Feature Extractor': 'CountVectorizer'},
 {'Accuracy': 0.7754237288135594,
  'Algorithm': 'DecisionTree',
  'Feature Extractor': 'CountVectorizer'}]


TF-IDF Vectorizer

In [9]:
tfidf_vectorizer = TfidfVectorizer()
X_train_vect = tfidf_vectorizer.fit_transform(X_train)
X_test_vect = tfidf_vectorizer.transform(X_test)

for name, classifier in classifiers.items():
    classifier.fit(X_train_vect, y_train)
    y_pred = classifier.predict(X_test_vect)

    accuracy =  accuracy_score(y_test, y_pred)
    results.append({'Feature Extractor':'TfidfVectorizer', 'Algorithm': name, 'Accuracy': accuracy})


pprint(results)

[{'Accuracy': 0.8957627118644068,
  'Algorithm': 'MultinomialNB',
  'Feature Extractor': 'CountVectorizer'},
 {'Accuracy': 0.9271186440677966,
  'Algorithm': 'LogisticRegression',
  'Feature Extractor': 'CountVectorizer'},
 {'Accuracy': 0.923728813559322,
  'Algorithm': 'SVC',
  'Feature Extractor': 'CountVectorizer'},
 {'Accuracy': 0.7754237288135594,
  'Algorithm': 'DecisionTree',
  'Feature Extractor': 'CountVectorizer'},
 {'Accuracy': 0.8957627118644068,
  'Algorithm': 'MultinomialNB',
  'Feature Extractor': 'TfidfVectorizer'},
 {'Accuracy': 0.9271186440677966,
  'Algorithm': 'LogisticRegression',
  'Feature Extractor': 'TfidfVectorizer'},
 {'Accuracy': 0.923728813559322,
  'Algorithm': 'SVC',
  'Feature Extractor': 'TfidfVectorizer'},
 {'Accuracy': 0.7796610169491526,
  'Algorithm': 'DecisionTree',
  'Feature Extractor': 'TfidfVectorizer'}]


Word2vec Extractor

In [10]:
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MinMaxScaler

train_tokens  =  [word_tokenize(doc.lower()) for doc in X_train]
test_tokens = [word_tokenize(doc.lower()) for doc in X_test]

word2vec_model = Word2Vec(sentences=train_tokens, vector_size=100, window=5,  workers=4, seed=42)

def  document_vector(word2vec_model, doc_tokens):
      words  = [word for word in doc_tokens if word in word2vec_model.wv.key_to_index]
      if len(words) == 0:
        return  np.zeros(word2vec_model.wv.vector_size)
      return  np.mean(word2vec_model.wv[words], axis=0)


X_train_w2v = np.array([document_vector(word2vec_model,doc) for doc in train_tokens])
X_test_w2v  = np.array([document_vector(word2vec_model,doc)for doc in test_tokens])

scaler = MinMaxScaler()
X_train_w2v_scaled = scaler.fit_transform(X_train_w2v)
X_test_w2v_scaled = scaler.transform(X_test_w2v)

for name, classifier in classifiers.items():
    classifier.fit(X_train_w2v_scaled, y_train)
    y_pred = classifier.predict(X_test_w2v_scaled)

    accuracy =  accuracy_score(y_test, y_pred)
    results.append({'Feature Extractor':'Word2Vec', 'Algorithm': name, 'Accuracy': accuracy})

#pprint(results)

In [11]:

results_df = pd.DataFrame(results)
results_df['Accuracy'] = results_df['Accuracy'].round(3)

#pprint(results_df)


best_accuracy = 0
best_config = None

for row in results:
    if row['Accuracy'] > best_accuracy:
        best_accuracy = row['Accuracy']
        best = row


print("\nBest Extractor:")
pprint(best)


Best Extractor:
{'Accuracy': 0.9271186440677966,
 'Algorithm': 'LogisticRegression',
 'Feature Extractor': 'CountVectorizer'}


In [12]:
best_df = pd.DataFrame([best])

In [13]:
best_df.to_string('Joy_Task0_Text_Classification.txt', index=False)