Benchmarking Classifiers and Feature Extractors on the 20 Newsgroups Dataset (fetch_20newsgroups)

1. Import Libraries and Load Dataset

In [12]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

# Download the NLTK resources used by the preprocessing step: the Punkt
# tokenizer models (word_tokenize), the English stop-word list and WordNet
# (WordNetLemmatizer). Newer NLTK releases load the tokenizer from
# 'punkt_tab' instead of 'punkt'; both are requested so the notebook runs
# after a kernel restart on either.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Step 1: Load dataset
# Only two of the newsgroups are fetched; headers, footers and quoted replies
# are stripped from every post.
categories = ['alt.atheism', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
print(f"{len(newsgroups.filenames)} documents")
print(f"{len(newsgroups.target_names)} categories")
print()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/millenasiqueiraguimaraes/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/millenasiqueiraguimaraes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/millenasiqueiraguimaraes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1427 documents
2 categories



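2. Pre-process the text: `clean` is a custom function that lowercases and tokenizes a document, keeps alphabetic tokens only, lemmatizes them and removes stop words. Feature extraction happens later, in step 4.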

In [2]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean(text):
    """Normalise a raw document into a list of content-word lemmas.

    The text is lower-cased and tokenised, non-alphabetic tokens are dropped,
    stop words are removed and the remaining tokens are lemmatised.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatised tokens, in document order.
    """
    cleaned = []
    for token in word_tokenize(text.lower()):
        # Stop words are checked on the surface form *before* lemmatising:
        # the default (noun) lemmatiser turns e.g. "was" into "wa" and "has"
        # into "ha", which would no longer match the stop-word list.
        if not token.isalpha() or token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Also drop tokens whose lemma is itself a stop word.
        if lemma not in stop_words:
            cleaned.append(lemma)
    return cleaned

Apply preprocessing 

In [3]:
# Clean every post and re-join its tokens with single spaces, so each
# document is stored as one normalised string; labels are kept alongside.
X_processed = [' '.join(tokens) for tokens in map(clean, newsgroups.data)]
y = newsgroups.target

3. Split dataset 

In [4]:
# Hold out 20% of the documents for testing; the fixed random_state keeps
# the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)

Function to replace negative values with zero 

In [5]:
def replace_negatives(X):
    """Return ``X`` with every negative entry replaced by zero.

    Non-negative entries are passed through unchanged; the comparison is
    applied element-wise.
    """
    floor = 0
    clipped = np.maximum(X, floor)
    return clipped

4. Extract features 

Word2vec

In [6]:
def word2vec_transform(X):
    """Embed each document as the mean of its Word2Vec word vectors.

    A Word2Vec model is trained on ``X`` itself and every document is mapped
    to the average of the vectors of its tokens.

    Parameters
    ----------
    X : iterable of str or iterable of list of str
        Documents, either as whitespace-joined strings or as token lists.

    Returns
    -------
    numpy.ndarray
        One row per document; a document without any known token gets an
        all-zero row.
    """
    # Word2Vec expects each sentence to be a list of tokens. Passing plain
    # strings makes it iterate over characters, so strings are split here.
    tokenized = [doc.split() if isinstance(doc, str) else list(doc) for doc in X]

    # NOTE(review): the model is refit on whatever is passed in, so when this
    # runs inside a FunctionTransformer the training and test documents are
    # embedded by two different models. A fit/transform estimator that learns
    # the embedding once on the training set would remove that mismatch.
    word2vec_model = Word2Vec(sentences=tokenized, vector_size=100, window=5, min_count=1, workers=4)
    keyed_vectors = word2vec_model.wv
    dim = keyed_vectors.vector_size  # keeps the zero fallback in sync with vector_size

    doc_vectors = []
    for tokens in tokenized:
        known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        doc_vectors.append(np.mean(known, axis=0) if known else np.zeros(dim))
    return np.array(doc_vectors)


Doc2vec

In [7]:
# Doc2Vec training corpus: one TaggedDocument per training document, tagged
# with its position in X_train. `words` must be a list of tokens; a plain
# string would be consumed character by character, so strings are split on
# whitespace first.
tagged_data = [
    TaggedDocument(words=doc.split() if isinstance(doc, str) else list(doc), tags=[str(i)])
    for i, doc in enumerate(X_train)
]

In [8]:
def doc2vec_transform(X):
    """Embed each document with a Doc2Vec model trained on ``tagged_data``.

    Parameters
    ----------
    X : iterable of str or iterable of list of str
        Documents, either as whitespace-joined strings or as token lists.

    Returns
    -------
    numpy.ndarray
        One inferred document vector per row.
    """
    # NOTE(review): the model is retrained from the module-level `tagged_data`
    # on every call, so the fit and predict passes of a pipeline use two
    # separately trained models. Consider training once and reusing it.
    doc2vec_model = Doc2Vec(tagged_data, vector_size=100, window=5, min_count=1, workers=4)

    # infer_vector needs a token list; split strings, pass token lists through.
    token_lists = (doc.split() if isinstance(doc, str) else list(doc) for doc in X)
    return np.array([doc2vec_model.infer_vector(doc_words=tokens) for tokens in token_lists])

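5. Define the classification pipelines (one per feature extractor and classifier pair; they are trained in the evaluation step below)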

In [9]:
def _make_pipeline(step_name, transform, classifier, scale):
    """Assemble extractor -> (optional StandardScaler) -> classifier."""
    steps = [(step_name, FunctionTransformer(transform, validate=False))]
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('clf', classifier))
    return Pipeline(steps)


# Every feature extractor is paired with every classifier. The classifiers are
# given as factories so each pipeline gets its own fresh estimator instance.
# Logistic regression and the SVC are preceded by a StandardScaler; the
# decision tree is fed the raw embeddings.
pipelines = {
    f'{extractor} + {clf_label}': _make_pipeline(step_name, transform, make_clf(), scale)
    for extractor, step_name, transform in (
        ('Word2Vec', 'w2v', word2vec_transform),
        ('Doc2Vec', 'd2v', doc2vec_transform),
    )
    for clf_label, make_clf, scale in (
        ('LogisticRegression', lambda: LogisticRegression(max_iter=1000), True),
        ('SVC', SVC, True),
        ('DecisionTree', DecisionTreeClassifier, False),
    )
}

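Evaluate the classifiers: fit each pipeline on the training set and collect its classification report on the test set (no grid search is performed; every estimator uses the hyperparameters set above)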

In [10]:
# Maps pipeline name -> classification report (as a nested dict)
classification_reports = {}

# Train each pipeline on the training split and score it on the test split.
# No hyperparameter search happens here: the estimators run as configured.
for pipeline_name, pipeline in pipelines.items():
    # fit() returns the fitted pipeline, so prediction can be chained onto it
    y_pred = pipeline.fit(X_train, y_train).predict(X_test)

    # output_dict=True yields per-class and averaged metrics as a dict
    report = classification_report(y_test, y_pred, output_dict=True)
    classification_reports[pipeline_name] = report


Save report as txt

In [11]:
def _format_report(report):
    """Render a ``classification_report(..., output_dict=True)`` dict as text.

    Each dict-valued entry (a class label or an average) becomes one row with
    precision, recall, f1-score and support; a scalar entry such as
    ``'accuracy'`` is printed in the f1-score column.
    """
    metrics = ('precision', 'recall', 'f1-score', 'support')
    width = max((len(str(label)) for label in report), default=0)
    lines = [' ' * width + ''.join(f"{metric:>12}" for metric in metrics), '']
    for label, scores in report.items():
        if isinstance(scores, dict):
            row = ''.join(f"{scores[metric]:>12.2f}" for metric in metrics[:3])
            row += f"{int(scores['support']):>12}"
        else:
            # Scalar entry: skip the precision and recall columns.
            row = ' ' * 24 + f"{scores:>12.2f}"
        lines.append(f"{str(label):>{width}}" + row)
    return '\n'.join(lines) + '\n'


home_dir = os.path.expanduser('/Users/millenasiqueiraguimaraes/Library/Mobile Documents/com~apple~CloudDocs/Courses/GBC material/Machine Learning II')
if not os.path.isdir(home_dir):
    # The directory above only exists on the original author's machine; write
    # next to the notebook instead of failing elsewhere.
    home_dir = os.getcwd()
output_file = os.path.join(home_dir, "classification_reports.txt")

with open(output_file, "w", encoding="utf-8") as file:
    for pipeline_name, report in classification_reports.items():
        file.write(f"Pipeline: {pipeline_name}\n")
        # Write the report stored for *this* pipeline. Recomputing it from
        # y_pred would repeat the last pipeline's predictions in every section.
        file.write(_format_report(report))
        file.write("\n\n")

print(f"Classification reports saved to {output_file}")

Classification reports saved to /Users/millenasiqueiraguimaraes/Library/Mobile Documents/com~apple~CloudDocs/Courses/GBC material/Machine Learning II/classification_reports.txt
