# 3. Document Classification Based on BOW


## 1. Preparing the 20 Newsgroups Data and Feature Extraction

 http://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Create a list of topics to select from the 20 categories
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Fetch the training dataset
newsgroups_train = fetch_20newsgroups(subset='train',
# Remove hinting parts from the email content - classify purely based on content
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

# Fetch the test dataset
newsgroups_test = fetch_20newsgroups(subset='test',
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)

print('#Train set size:', len(newsgroups_train.data))
print('#Test set size:', len(newsgroups_test.data))
print('#Selected categories:', newsgroups_train.target_names)
print('#Train labels:', set(newsgroups_train.target))

#Train set size: 2034
#Test set size: 1353
#Selected categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
#Train labels: {0, 1, 2, 3}


In [2]:
print('#Train set text samples:', newsgroups_train.data[0])
print('#Train set label smaples:', newsgroups_train.target[0])
print('#Test set text samples:', newsgroups_test.data[0])
print('#Test set label smaples:', newsgroups_test.target[0])

#Train set text samples: Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
#Train set label smaples: 1
#Test set text samples: TRry the SKywatch project in  Arizona.
#Test set label smaples: 2


In [3]:
X_train = newsgroups_train.data   # Training dataset documents
y_train = newsgroups_train.target # Training dataset labels

X_test = newsgroups_test.data     # Test dataset documents
y_test = newsgroups_test.target   # Test dataset labels

## 2. Document Representation Based on Distributed Representation

### 1) Word2Vec

In [4]:
# Import necessary libraries for Word2Vec and machine learning models
import gensim
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK resources for tokenization and stopwords
nltk.download('punkt')  # For word tokenization
nltk.download('stopwords')  # For filtering out common stopwords

# Initialize a set of stopwords for English and a stemmer to reduce words to their root form
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()  # PorterStemmer is commonly used to reduce words to their stem form

# Data preprocessing function
def preprocess_data(data):
    processed_data = []
    for sentence in data:
        # Tokenize the sentence into words
        tokens = word_tokenize(sentence)
        # Convert to lowercase, remove stopwords and special characters, and apply stemming
        tokens = [stemmer.stem(re.sub(r'\W+', '', word.lower()))
                  for word in tokens
                  if word.lower() not in stop_words and re.sub(r'\W+', '', word)]
        processed_data.append(tokens)  # Add the cleaned tokens to the processed data
    return processed_data

# Step 1: Train the Word2Vec model
# Preprocess the training and testing data using the preprocess_data function
X_train_tokenized = preprocess_data(X_train)
X_test_tokenized = preprocess_data(X_test)

# Flatten the tokenized training data into a single list and calculate word frequency
all_words = [word for sentence in X_train_tokenized for word in sentence]
word_counts = Counter(all_words)  # Count frequency of each word in the training data

# Define a threshold to remove low-frequency words
min_count_threshold = 2  # Words with a frequency of 2 or lower will be removed
# Filter the tokenized training and testing data to keep only frequent words
X_train_tokenized = [[word for word in sentence if word_counts[word] > min_count_threshold] for sentence in X_train_tokenized]
X_test_tokenized = [[word for word in sentence if word_counts[word] > min_count_threshold] for sentence in X_test_tokenized]

# Train the Word2Vec model with the tokenized and filtered training data
# vector_size: Dimensionality of the word vectors
# window: Maximum distance between the current and predicted word
# sg: Use skip-gram (1) instead of CBOW (0)
w2v_model = Word2Vec(sentences=X_train_tokenized, vector_size=100, window=2, min_count=2, sg=1)

# Step 2: Function to generate Word2Vec vectors for each sentence
# Each sentence vector is computed as the average of its word vectors
def get_w2v_vectors(data, model, vector_size=100):
    vectors = []
    for sentence in data:
        # Initialize a zero vector for the sentence
        sentence_vec = np.zeros(vector_size)
        count = 0  # Track the number of words found in the Word2Vec model
        for word in sentence:
            # Check if the word exists in the Word2Vec model
            if word in model.wv.key_to_index:
                # Add the word vector to the sentence vector
                sentence_vec += model.wv[word]
                count += 1
        # If the sentence contains valid words, compute the average of the word vectors
        if count != 0:
            # The sentence vector is the average of the word vectors in the sentence
            sentence_vec /= count
        # Append the resulting sentence vector (average of word vectors) to the list
        vectors.append(sentence_vec)
    return np.array(vectors)

# Generate Word2Vec vectors for both the training and testing data
X_train_w2v = get_w2v_vectors(X_train_tokenized, w2v_model)
X_test_w2v = get_w2v_vectors(X_test_tokenized, w2v_model)

# Step 3: Train machine learning models (Logistic Regression and Random Forest)
# Initialize two different classifiers for comparison
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),  # Logistic Regression with maximum iterations set to 1000
    'Random Forest': RandomForestClassifier()  # Random Forest Classifier
}

# Dictionary to store model names and their performance metrics
results = {'Model': [], 'Train Accuracy': [], 'Test Accuracy': []}

# Train each model and evaluate accuracy on the training and testing data
for model_name, model in models.items():
    # Fit the model using the Word2Vec vectorized training data and corresponding labels
    model.fit(X_train_w2v, y_train)
    # Calculate accuracy on both the training and testing datasets
    train_acc = model.score(X_train_w2v, y_train)
    test_acc = model.score(X_test_w2v, y_test)

    # Store the results for each model
    results['Model'].append(model_name + " (Word2Vec)")  # Add the model name and the method used (Word2Vec)
    results['Train Accuracy'].append(train_acc)  # Training accuracy
    results['Test Accuracy'].append(test_acc)  # Testing accuracy

[nltk_data] Downloading package punkt to /home/minjoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/minjoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
print(results)

{'Model': ['Logistic Regression (Word2Vec)', 'Random Forest (Word2Vec)'], 'Train Accuracy': [0.683874139626352, 0.9783677482792527], 'Test Accuracy': [0.6400591278640059, 0.6577974870657798]}


### 2) FastText

In [6]:
# Import FastText from Gensim library
from gensim.models import FastText

# Step 1: Train FastText model
# FastText model is trained on tokenized training sentences
# vector_size: Dimensionality of the word vectors
# window: The maximum distance between the current and predicted word within a sentence
# min_count: Ignores all words with total frequency lower than this value
# sg: Training algorithm. 1 means skip-gram, 0 means CBOW
fasttext_model = FastText(sentences=X_train_tokenized, vector_size=100, window=5, min_count=2, sg=1)

# Step 2: Function to generate sentence vectors from Word2Vec or FastText models
# This function takes tokenized sentences and converts them into sentence vectors
# by averaging the word vectors for words that exist in the model's vocabulary.
def get_w2v_vectors(data, model, vector_size=100):
    vectors = []
    for sentence in data:
        # Initialize a zero vector for each sentence
        sentence_vec = np.zeros(vector_size)
        count = 0  # To track how many words in the sentence exist in the model
        for word in sentence:
            # Check if the word exists in the model's vocabulary
            if word in model.wv.key_to_index:
                # Add the word vector to the sentence vector
                sentence_vec += model.wv[word]
                count += 1
        # If there are valid words in the sentence, compute the average word vector
        if count != 0:
            # The sentence vector is the average of the word vectors
            sentence_vec /= count
        # Append the sentence vector to the list
        vectors.append(sentence_vec)
    # Return the list of sentence vectors as a numpy array
    return np.array(vectors)

# Step 3: Generate FastText vectors for training and testing data
# Use the previously defined function to convert tokenized sentences to vectors
# by averaging the word vectors learned by the FastText model.
X_train_fasttext = get_w2v_vectors(X_train_tokenized, fasttext_model)
X_test_fasttext = get_w2v_vectors(X_test_tokenized, fasttext_model)

# Step 4: Train machine learning models (Logistic Regression and Random Forest)
# on FastText sentence vectors and evaluate their performance
for model_name, model in models.items():
    # Train the model on FastText vectors and the corresponding labels
    model.fit(X_train_fasttext, y_train)
    # Calculate accuracy on the training and testing data
    train_acc = model.score(X_train_fasttext, y_train)
    test_acc = model.score(X_test_fasttext, y_test)

    # Store the model name and its accuracy results
    results['Model'].append(model_name + " (FastText)")  # Append model name with FastText notation
    results['Train Accuracy'].append(train_acc)  # Append training accuracy
    results['Test Accuracy'].append(test_acc)  # Append testing accuracy

In [7]:
print(results)

{'Model': ['Logistic Regression (Word2Vec)', 'Random Forest (Word2Vec)', 'Logistic Regression (FastText)', 'Random Forest (FastText)'], 'Train Accuracy': [0.683874139626352, 0.9783677482792527, 0.7340216322517208, 0.9783677482792527], 'Test Accuracy': [0.6400591278640059, 0.6577974870657798, 0.6858832224685883, 0.6962305986696231]}


### 3) GloVe

In [8]:
!pip install glove-python3



In [9]:
# Import GloVe libraries to create word embeddings using the GloVe algorithm
from glove import Corpus, Glove

# Step 1: Train GloVe model
# Create a Corpus object that will hold the co-occurrence matrix for the GloVe model
corpus = Corpus()

# Fit the corpus with the tokenized training data
# The 'window' parameter specifies the context window size around each word
corpus.fit(X_train_tokenized, window=5)

# Initialize the GloVe model
# no_components: Dimensionality of the word vectors
# learning_rate: Learning rate for model training
glove_model = Glove(no_components=100, learning_rate=0.05)

# Train the GloVe model using the co-occurrence matrix from the corpus
# epochs: Number of iterations to train the model
# no_threads: Number of parallel threads for training
# verbose: Whether to print progress during training
glove_model.fit(corpus.matrix, epochs=10, no_threads=4, verbose=True)

# Add the word dictionary from the corpus to the GloVe model
# This allows the model to map words to indices in the vector space
glove_model.add_dictionary(corpus.dictionary)

# Step 2: Function to generate sentence vectors from GloVe model
# The function converts tokenized sentences into sentence vectors
# by averaging the GloVe word vectors for words found in the dictionary
def get_glove_vectors(data, model, dictionary, vector_size=100):
    vectors = []
    for sentence in data:
        # Initialize a zero vector for each sentence
        sentence_vec = np.zeros(vector_size)
        count = 0  # Count the number of valid words in the model
        for word in sentence:
            # Check if the word exists in the dictionary
            if word in dictionary:
                # Add the word's GloVe vector to the sentence vector
                sentence_vec += model.word_vectors[dictionary[word]]
                count += 1
        # If the sentence has valid words, average their word vectors
        if count != 0:
            # Compute the average of the word vectors to form the sentence vector
            sentence_vec /= count
        # Append the resulting sentence vector to the list
        vectors.append(sentence_vec)
    # Return the sentence vectors as a numpy array
    return np.array(vectors)

# Generate GloVe vectors for both training and testing data
# This uses the get_glove_vectors function to convert tokenized data into sentence vectors
X_train_glove = get_glove_vectors(X_train_tokenized, glove_model, corpus.dictionary)
X_test_glove = get_glove_vectors(X_test_tokenized, glove_model, corpus.dictionary)

# Step 3: Model training and performance evaluation
# Train each machine learning model (Logistic Regression, Random Forest)
# on the GloVe sentence vectors and evaluate their performance
for model_name, model in models.items():
    # Fit the model using the GloVe vectors and corresponding labels
    model.fit(X_train_glove, y_train)
    # Calculate accuracy on both training and testing datasets
    train_acc = model.score(X_train_glove, y_train)
    test_acc = model.score(X_test_glove, y_test)

    # Store the model name and accuracy results
    results['Model'].append(model_name + " (GloVe)")  # Append model name with GloVe notation
    results['Train Accuracy'].append(train_acc)  # Append training accuracy
    results['Test Accuracy'].append(test_acc)  # Append testing accuracy

Performing 10 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


In [10]:
print(results)

{'Model': ['Logistic Regression (Word2Vec)', 'Random Forest (Word2Vec)', 'Logistic Regression (FastText)', 'Random Forest (FastText)', 'Logistic Regression (GloVe)', 'Random Forest (GloVe)'], 'Train Accuracy': [0.683874139626352, 0.9783677482792527, 0.7340216322517208, 0.9783677482792527, 0.47640117994100295, 0.9783677482792527], 'Test Accuracy': [0.6400591278640059, 0.6577974870657798, 0.6858832224685883, 0.6962305986696231, 0.4656319290465632, 0.5898004434589801]}


In [11]:
import pandas as pd

# Organize the results into a dataframe for better visibility
results_df = pd.DataFrame(results)

# Print the results in a table format
results_df

Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Logistic Regression (Word2Vec),0.683874,0.640059
1,Random Forest (Word2Vec),0.978368,0.657797
2,Logistic Regression (FastText),0.734022,0.685883
3,Random Forest (FastText),0.978368,0.696231
4,Logistic Regression (GloVe),0.476401,0.465632
5,Random Forest (GloVe),0.978368,0.5898
