# NLP_Text_Classification
## This notebook outlines the usage of NLP Feature extraction (CountVectorizer, Word2Vec, Doc2Vec, TfidfVectorizer) in classification of text documents

## Algorithms: Multinomial Naïve Bayes, Logistic Regression, Support Vector Machines, Decision Trees


### Import all the necessary libraries

In [None]:
from pprint import pprint
import logging
import time

from sklearn.model_selection import train_test_split,RepeatedStratifiedKFold
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, classification_report, roc_curve, roc_auc_score, f1_score
import warnings
from matplotlib import pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

### Choose a few categories from the full set of 20 categories

In [None]:
# Newsgroup names to keep; they are passed to fetch_20newsgroups in a later cell
categories = ['alt.atheism', 'talk.religion.misc']

In [None]:
# Announce which newsgroups are about to be loaded (header on one line, list on the next)
print("Loading 20 newsgroups dataset for categories:", categories, sep="\n")

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc']


### Fetch documents for these 2 categories

In [None]:
# Load the training subset restricted to the selected newsgroups
data = fetch_20newsgroups(categories=categories, subset='train')

# Report corpus size and number of classes, followed by a blank line
print(
    f"{len(data.filenames)} documents",
    f"{len(data.target_names)} categories",
    "",
    sep="\n",
)

857 documents
2 categories



In [None]:
# Show the class names of the loaded dataset (data.target_names)
print("Target names:", data.target_names)

Target names: ['alt.atheism', 'talk.religion.misc']


In [None]:
# Show the file paths of the loaded documents (data.filenames)
print("Filenames of the documents:", data.filenames)

## Splitting the data

In [None]:
# X: the raw documents (features); y: their target labels
X, y = data.data, data.target

In [None]:
# Sanity check: show the first raw document together with its target label
print("First document:", X[0], sep="\n")
print("Target label:", y[0])

First document:
From: mangoe@cs.umd.edu (Charley Wingate)
Subject: Benediktine Metaphysics
Lines: 24

Benedikt Rosenau writes, with great authority:

>     IF IT IS CONTRADICTORY IT CANNOT EXIST.

"Contradictory" is a property of language.  If I correct this to


      THINGS DEFINED BY CONTRADICTORY LANGUAGE DO NOT EXIST

I will object to definitions as reality.  If you then amend it to

      THINGS DESCRIBED BY CONTRADICTORY LANGUAGE DO NOT EXIST

then we've come to something which is plainly false.  Failures in
description are merely failures in description.

(I'm not an objectivist, remember.)


-- 
C. Wingate        + "The peace of God, it is no peace,
                  +    but strife closed in the sod.
mangoe@cs.umd.edu +  Yet, brothers, pray for but one thing:
tove!mangoe       +    the marv'lous peace of God."

Target label: 0


In [None]:
# Display the target label array (data.target) as the cell's output
y

array([0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [None]:
# Hold out 30% of the documents for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## CountVectorizer

In [None]:
# Bag-of-words vectorizer with default settings; it is fitted on the training split in a later cell
vect = CountVectorizer()

In [None]:
# One shared instance per algorithm; the instances are re-fitted in later cells.
# random_state is pinned wherever the estimator accepts it.
nb_classifier = MultinomialNB()
lr_classifier = LogisticRegression(
    class_weight="balanced",
    random_state=42,
)
svm_classifier = SVC(random_state=42)
dt_classifier = DecisionTreeClassifier(random_state=42)

### CountVectorizer with Naive Bayes model

In [None]:
# Learn the vocabulary on the training split only and build its document-term matrix
X_train_dtm = vect.fit_transform(X_train)

# Re-use that vocabulary for the held-out split (transform only, no re-fitting)
X_test_dtm = vect.transform(X_test)

# Train Multinomial Naive Bayes on the term counts, then label the test documents
# (fit returns the estimator itself, so the calls can be chained)
y_pred_class = nb_classifier.fit(X_train_dtm, y_train).predict(X_test_dtm)

# Fraction of test documents whose predicted label matches the true one
print("Accuracy:", accuracy_score(y_test, y_pred_class))


Accuracy: 0.9069767441860465


### CountVectorizer with Logistic Regression model

In [None]:
# Logistic regression on the count features: fit on train, predict on test, report accuracy
y_pred_class = lr_classifier.fit(X_train_dtm, y_train).predict(X_test_dtm)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

### CountVectorizer with SVM

In [None]:
# Support vector classifier on the count features: fit on train, predict on test, report accuracy
y_pred_class = svm_classifier.fit(X_train_dtm, y_train).predict(X_test_dtm)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.6744186046511628


### CountVectorizer with Decision Tree model

In [None]:
# Decision tree on the count features: fit on train, predict on test, report accuracy
y_pred_class = dt_classifier.fit(X_train_dtm, y_train).predict(X_test_dtm)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.872093023255814


## TfidfVectorizer

In [None]:
# TF-IDF vectorizer with default settings; it is fitted on the training split in a later cell
tfidf_vectorizer = TfidfVectorizer()

### TfidfVectorizer with Naive Bayes model

In [None]:
# Fit TF-IDF weights on the training documents, then apply the same mapping to the test documents
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Re-fit the shared Multinomial Naive Bayes instance on the TF-IDF features and predict
y_pred_class = nb_classifier.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Fraction of correctly classified test documents
accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)

Accuracy: 0.7635658914728682


### TfidfVectorizer with Logistic Regression model

In [None]:
# Logistic regression on the TF-IDF features: fit on train, predict on test, report accuracy
y_pred_class = lr_classifier.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.9186046511627907


### TfidfVectorizer with SVM

In [None]:
# Support vector classifier on the TF-IDF features: fit on train, predict on test, report accuracy
y_pred_class = svm_classifier.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.9341085271317829


### TfidfVectorizer with Decision Trees

In [None]:
# Decision tree on the TF-IDF features: fit on train, predict on test, report accuracy
y_pred_class = dt_classifier.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.8372093023255814


## Word2Vec

In [None]:
# Untrained Word2Vec model; its vocabulary is built and the model is trained in a later cell.
#   vector_size=100 -> dimensionality of each word embedding
#   window=5        -> context window size used during training
#   min_count=1     -> keep every token, even those seen only once
#   workers=4       -> number of training threads
# NOTE(review): no seed is set and multi-threaded training is not deterministic, so the
# downstream accuracies may vary between runs — confirm whether reproducibility matters here.
w2v = Word2Vec(vector_size=100, window=5, min_count=1, workers=4)

### Tokenize the docs

In [None]:
# word_tokenize comes from the notebook's import cell; the duplicate in-cell imports of
# nltk / word_tokenize were removed so that all dependencies live in one place.
# NOTE(review): word_tokenize relies on NLTK's "punkt" tokenizer data being installed —
# confirm the environment provides it before a Restart & Run All.

# X_train is a list of raw documents -> one token list per document
tokenized_train = [word_tokenize(doc) for doc in X_train]

# Show the tokens of the first training document as an example
print(tokenized_train[0])

# X_test is a list of raw documents -> one token list per document
tokenized_test = [word_tokenize(doc) for doc in X_test]

# Show the tokens of the first test document as an example
print(tokenized_test[0])


['From', ':', 'eczcaw', '@', 'mips.nott.ac.uk', '(', 'A.Wainwright', ')', 'Subject', ':', 'Re', ':', 'some', 'thoughts', '.', 'Keywords', ':', 'Dan', 'Bissell', 'Reply-To', ':', 'eczcaw', '@', 'mips.nott.ac.uk', '(', 'A.Wainwright', ')', 'Organization', ':', 'Nottingham', 'University', 'Lines', ':', '28', 'In', 'article', '<', 'healta.145.734928689', '@', 'saturn.wwc.edu', '>', ',', 'healta', '@', 'saturn.wwc.edu', '(', 'Tammy', 'R', 'Healy', ')', 'writes', ':', '|', '>', 'I', 'hope', 'you', "'re", 'not', 'going', 'to', 'flame', 'him', '.', 'Please', 'give', 'him', 'the', 'same', 'coutesy', "you'", '|', '>', 've', 'given', 'me', '.', '|', '>', '|', '>', 'Tammy', 'If', 'a', 'person', 'gives', 'a', 'well-balanced', 'reasoned', 'argument', ',', 'Tammy', ',', 'then', 'all', 'are', 'happy', 'to', 'discuss', 'it', 'with', 'him', '.', 'If', 'he', 'makes', 'astounding', 'claims', ',', 'which', 'are', 'not', 'backed', 'up', 'with', 'any', 'evidence', 'then', 'he', 'must', 'be', 'expected', 'to'

### Word2Vec with Gaussian Naive Bayes model

In [None]:
import numpy as np


def _mean_word_vector(tokens, keyed_vectors):
    """Average the embeddings of the in-vocabulary tokens of one document.

    Parameters:
        tokens: iterable of string tokens for a single document.
        keyed_vectors: trained word vectors supporting `token in kv`, `kv[token]`
            and `kv.vector_size` (here: `w2v.wv`).

    Returns:
        A 1-D array of length `keyed_vectors.vector_size`. When none of the tokens
        is in the vocabulary (e.g. an empty document, or a test document made only of
        unseen words) a zero vector is returned, so every document still maps to a
        fixed-length feature row instead of a NaN scalar.
    """
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)


# Train the Word2Vec model on the training tokens only
w2v.build_vocab(tokenized_train)
w2v.train(tokenized_train, total_examples=w2v.corpus_count, epochs=10)

# Represent each document as the mean of its word vectors
X_train_w2v = [_mean_word_vector(doc, w2v.wv) for doc in tokenized_train]
X_test_w2v = [_mean_word_vector(doc, w2v.wv) for doc in tokenized_test]

# Create and fit the model on the Word2Vec features.
# (Fix: this previously fitted on X_train_vectors, the Doc2Vec features that are only
# defined further down, so it failed on a fresh kernel or silently mixed feature spaces.)
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train_w2v, y_train)

# Predict
y_pred_class = gnb_classifier.predict(X_test_w2v)

# Calculate accuracy (or other metrics)
accuracy = metrics.accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)

Accuracy: 0.46124031007751937


### Word2Vec with Logistic Regression model

In [None]:
# Logistic regression on the averaged Word2Vec features: fit on train, predict on test, report accuracy
y_pred_class = lr_classifier.fit(X_train_w2v, y_train).predict(X_test_w2v)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.7093023255813954


### Word2Vec with SVM

In [None]:
# Support vector classifier on the averaged Word2Vec features: fit on train, predict on test, report accuracy
y_pred_class = svm_classifier.fit(X_train_w2v, y_train).predict(X_test_w2v)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.5930232558139535


### Word2Vec with Decision Trees

In [None]:
# Decision tree on the averaged Word2Vec features.
# (Fix: this cell previously used X_train_vectors / X_test_vectors, which are the Doc2Vec
# features defined only in the Doc2Vec section, so it either raised NameError on a fresh
# kernel or silently evaluated Doc2Vec instead of Word2Vec.)
dt_classifier.fit(X_train_w2v, y_train)
y_pred_class = dt_classifier.predict(X_test_w2v)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_class))

Accuracy: 0.6046511627906976


## Doc2Vec

In [None]:
# Untrained Doc2Vec model; its vocabulary is built and the model is trained in a later cell.
#   vector_size=100 -> dimensionality of each document vector
#   window=5        -> context window size used during training
#   min_count=1     -> keep every token, even those seen only once
#   workers=4       -> number of training threads
#   epochs=10       -> training passes; read back later as d2v.epochs
# NOTE(review): no seed is set and multi-threaded training is not deterministic, so the
# downstream accuracies may vary between runs — confirm whether reproducibility matters here.
d2v = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=10)

### Doc2Vec with Gaussian Naive Bayes model

In [None]:
# 1) Wrap each whitespace-tokenised document in a TaggedDocument whose tag is its position
X_train_tagged = [
    TaggedDocument(words=text.split(), tags=[str(idx)])
    for idx, text in enumerate(X_train)
]
X_test_tagged = [
    TaggedDocument(words=text.split(), tags=[str(idx)])
    for idx, text in enumerate(X_test)
]

# 2) Build the vocabulary from, and train on, the training documents only
d2v.build_vocab(X_train_tagged)
d2v.train(X_train_tagged, total_examples=d2v.corpus_count, epochs=d2v.epochs)

# 3) Infer one vector per document for both splits (train first, then test)
X_train_vectors = [d2v.infer_vector(tagged.words) for tagged in X_train_tagged]
X_test_vectors = [d2v.infer_vector(tagged.words) for tagged in X_test_tagged]

# 4) Fit a fresh Gaussian Naive Bayes model on the inferred training vectors
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train_vectors, y_train)

# 5) Label the test vectors and measure the fraction classified correctly
y_pred_class = gnb_classifier.predict(X_test_vectors)
accuracy = accuracy_score(y_test, y_pred_class)
print("Accuracy:", accuracy)

Accuracy: 0.6395348837209303


### Doc2Vec with Logistic Regression Model

In [None]:
# Logistic regression on the inferred Doc2Vec vectors: fit on train, predict on test, report accuracy
y_pred_class = lr_classifier.fit(X_train_vectors, y_train).predict(X_test_vectors)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.6937984496124031


### Doc2Vec with SVM

In [None]:
# Support vector classifier on the inferred Doc2Vec vectors: fit on train, predict on test, report accuracy
y_pred_class = svm_classifier.fit(X_train_vectors, y_train).predict(X_test_vectors)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.6511627906976745


### Doc2Vec with Decision Trees

In [None]:
# Decision tree on the inferred Doc2Vec vectors: fit on train, predict on test, report accuracy
y_pred_class = dt_classifier.fit(X_train_vectors, y_train).predict(X_test_vectors)
print("Accuracy:", accuracy_score(y_test, y_pred_class))

Accuracy: 0.6046511627906976
