# Text Classification
This task is going to solve a real-world problem using the concepts learned in Text Classification module.
We are going to create a benchmark analysis with different algorithms and feature extractors.

### Dataset: 
Fetch 20 Newsgroups
### Algorithms: 
Logistic Regression, Support Vector Machines, Decision Trees
### Feature Extractors: 
CountVectorizer, Word2Vec, Doc2Vec and so on

In [37]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.model_selection import train_test_split

from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [38]:
# Define the categories
categories = ['alt.atheism', 'talk.religion.misc']

# Load the 20 newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'), categories= categories)

X, y = newsgroups.data, newsgroups.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
Models= {"SGD": SGDClassifier(), "LR": LogisticRegression(), "SVM": SVC(), "DT": DecisionTreeClassifier()}

In [46]:
parameters= {
    'loss': ['hinge', 'log_loss', 'log'],
    'penalty': ['l2', 'l1'],
    'alpha': [0.0001, 0.001, 0.01],
    'warm_start': [True, False],
    'average': [True, False],
    
    'force_alpha': [True, False],
    'fit_prior': [True, False],
    
    'dual': [True, False],
    'C': [0.5, 1, 2],
    'fit_intercept': [True, False],
    'solver': ['lbfgs', 'liblinear', 'newton-cg'],
    
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [3, 5, 7],
    'gamma': ['scale', 'auto'],
    'probability': [True, False],
    
    'criterion': ["gini", "entropy", "log_loss"],
    'splitter': ["best", "random"],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 3, 5],
}

In [47]:
# CountVectorizer:
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)


# Word2Vec:
# Tokenize the text and train a Word2Vec model
tokenized_text = [text.split() for text in X_train]
word2vec_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, sg=1)
def document_vector(word2vec_model, doc):
    # Create an array for the document vectors
    doc_vec = []
    for word in doc:
        if word in word2vec_model.wv:
            doc_vec.append(word2vec_model.wv[word])
    if not doc_vec:
        # If the document is empty, return a zero vector
        return np.zeros(word2vec_model.vector_size)
    # Calculate the mean of the word vectors
    return np.mean(doc_vec, axis=0)
X_train_word2vec = [document_vector(word2vec_model, text.split()) for text in X_train]
X_test_word2vec = [document_vector(word2vec_model, text.split()) for text in X_test]


# Doc2Vec:
# Tag documents with unique labels
tagged_data = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(X_train)]
# Train a Doc2Vec model
doc2vec_model = Doc2Vec(vector_size=100, window=5, min_count=1, workers=4, epochs=10)
doc2vec_model.build_vocab(tagged_data)
doc2vec_model.train(tagged_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

# Infer vector representations for the training and testing data
X_train_doc2vec = [doc2vec_model.infer_vector(text.split()) for text in X_train]
X_test_doc2vec = [doc2vec_model.infer_vector(text.split()) for text in X_test]

In [48]:
X_train_dict= {'count': X_train_count, 'word2vec': X_train_word2vec, 'doc2vec': X_train_doc2vec}
X_test_dict= {'count': X_test_count, 'word2vec': X_test_word2vec, 'doc2vec': X_test_doc2vec}

In [49]:
Results= pd.DataFrame(columns= ['ML_model', 'Exr_count', 'Exr_word2vec', 'Exr_doc2vec'])

In [50]:
results_list = []

for m_name, model in Models.items():
    # Setting parameters
    #paras = {pa: val for pa, val in parameters.items() if pa in model.get_params()}
    paras= {}
    for pa , val in parameters.items():
            if pa in model.get_params():
                paras[pa]= val

    # Grid Search
    grid_search = GridSearchCV(estimator=model, param_grid=paras, scoring='accuracy', cv=5, n_jobs=-1, verbose=1)

    record = {'ML_model': m_name}

    for name, X_train_data in X_train_dict.items():
        # Fit the model
        grid_search.fit(X= X_train_data, y= y_train)

        # Best Score
        print(f"Best score: {round(grid_search.best_score_, 3)}")

        # Recording the results
        record['Exr_' + name] = round(grid_search.best_score_, 3)

    results_list.append(record)

# Convert the list of dictionaries to a DataFrame
Results = pd.DataFrame(results_list)


Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.715
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.639
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.636
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.706
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.623
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.624
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.681
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.623
Fitting 5 folds for each of 144 candidates, totalling 720 fits
Best score: 0.623
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best score: 0.627
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best score: 0.614
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Best score: 0.632


In [51]:
Results

Unnamed: 0,ML_model,Exr_count,Exr_word2vec,Exr_doc2vec
0,SGD,0.715,0.639,0.636
1,LR,0.706,0.623,0.624
2,SVM,0.681,0.623,0.623
3,DT,0.627,0.614,0.632
