# Document classification with Word2Vec model

## Word Embeddings

The objective is to map words to real vectors. The vectors can be generated with neural networks,
co-occurrence matrix, probabilistic models, etc.

The idea is that the BoW approach leads to highly dimensional and sparse vectors,
plus completely independent tokens. Namely, each n-gram representation does not
change if used in different sentences.

[Word2Vec](https://code.google.com/archive/p/word2vec/) is a method of building
word embeddings with the following two strategies. The resulting vectors are
continuous and context dependent.

- Skip Gram: NN that predicts the context, given the current word.
- Common Bag of Words (CBOW): NN that predicts the current word, given its context.

In this notebook we train a Word2Vec model taken from the
[gensim](https://radimrehurek.com/gensim/index.html) package and use it to
perform token embedding.  
The high dimensional embeddings are then classfied by a support vector machine.

In [None]:
# load dataset
from sklearn.datasets import fetch_20newsgroups

categories = ["alt.atheism", "soc.religion.christian", "comp.graphics", "sci.med"]
twenty_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=("headers", "footers", "quotes"),
)

twenty_test = fetch_20newsgroups(
    subset="test",
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=("headers", "footers", "quotes"),
)

In [29]:
# tokenize dataset
import tqdm
import gensim

tokenized_train = [gensim.utils.simple_preprocess(doc) for doc in twenty_train.data]

In [74]:
# train word2vec model
# sg = 0 -> CBOW, sg = 1 skip-gram
model = gensim.models.Word2Vec(
    sentences=tokenized_train, vector_size=200, window=5, min_count=5, sg=1
)

In [75]:
import numpy as np

# extract a single embedding for each doc
# use the average of its tokens vectors
def document_vector(doc, model):
    vecs = [
        model.wv[token]
        for token in gensim.utils.simple_preprocess(doc)
        if token in model.wv
    ]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)


def extract_corpus_embed_matrix(corpus, model):
    train_vectors = [document_vector(doc, model) for doc in corpus]
    return np.stack(train_vectors, axis=0)


x_train = extract_corpus_embed_matrix(twenty_train.data, model)
x_train.shape

(2257, 200)

In [76]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss="hinge", penalty="l2", alpha=1e-3, random_state=42, tol=None)
clf.fit(x_train, twenty_train.target)

## Create a new sklearn Pipeline block

Wrap `gensim.models.Word2Vec` around an sklearn BaseEstimantor to put the node
in a pipeline.

In [81]:
import gensim
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class Word2VecTransformer(BaseEstimator, TransformerMixin):
    """Sklearn Pipeline transformer wrapper around gensim Word2Vec model"""

    def __init__(
        self,
        vector_size: int = 100,
        window: int = 5,
        min_count: int = 5,
        sg: int = 0
    ):
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.sg = sg

        self.model = None

    def fit(self, X, y=None):
        """
        :param X: the corpus to train the Word2Vec on
        :type X: List[str]
        """
        # tokenize corpus
        tokenized_train = [gensim.utils.simple_preprocess(doc) for doc in X]

        # train the model
        self.model = gensim.models.Word2Vec(
            sentences=tokenized_train,
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            sg=self.sg,
        )
        return self

    def document_vector(self, doc: str):
        # tokenize the document and compute embeddings
        vecs = [
            model.wv[token]
            for token in gensim.utils.simple_preprocess(doc)
            if token in model.wv
        ]
        return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

    def transform(self, X):
        if self.model is None:
            raise ValueError(
                "The model should call the `fit()` method before `transform()`"
            )
        train_vectors = [self.document_vector(doc) for doc in X]
        return np.stack(train_vectors, axis=0)

In [82]:
# create pipeline
from sklearn.pipeline import Pipeline
text_clf = Pipeline(
    [
        ("word2vec", Word2VecTransformer()),
        (
            "clf",
            SGDClassifier(
                loss="hinge", penalty="l2", alpha=1e-3, random_state=42, tol=None
            ),
        ),

    ]
)
text_clf.fit(twenty_train.data, twenty_train.target)

In [84]:
# test the pipeline
preds = text_clf.predict(twenty_test.data)
np.mean(preds == twenty_test.target)

0.6830892143808256

# Grid Search

In [92]:
parameters = {
    "word2vec__vector_size": [100,200,300,400,500],
    "word2vec__sg": [0,1],
}

In [93]:
from sklearn.model_selection import GridSearchCV

gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=6)

gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [94]:
gs_clf.best_score_

0.69

In [95]:
for p in sorted(parameters.keys()):
    print(f"{p}: {gs_clf.best_params_[p]}")

word2vec__sg: 0
word2vec__vector_size: 100


In [96]:
import pandas as pd
pd.DataFrame(gs_clf.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_word2vec__sg,param_word2vec__vector_size,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2.438784,0.089807,0.151731,0.037054,0,100,"{'word2vec__sg': 0, 'word2vec__vector_size': 100}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
1,2.454347,0.166459,0.146495,0.022484,0,200,"{'word2vec__sg': 0, 'word2vec__vector_size': 200}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
2,2.646499,0.099795,0.143623,0.01718,0,300,"{'word2vec__sg': 0, 'word2vec__vector_size': 300}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
3,2.711491,0.10379,0.212415,0.113689,0,400,"{'word2vec__sg': 0, 'word2vec__vector_size': 400}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
4,2.94189,0.30926,0.164905,0.032156,0,500,"{'word2vec__sg': 0, 'word2vec__vector_size': 500}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
5,2.843275,0.105423,0.183753,0.062593,1,100,"{'word2vec__sg': 1, 'word2vec__vector_size': 100}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
6,3.67147,0.243405,0.208096,0.107494,1,200,"{'word2vec__sg': 1, 'word2vec__vector_size': 200}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
7,4.417891,0.126784,0.192075,0.040728,1,300,"{'word2vec__sg': 1, 'word2vec__vector_size': 300}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
8,4.90223,0.155742,0.249699,0.192018,1,400,"{'word2vec__sg': 1, 'word2vec__vector_size': 400}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
9,3.604108,1.025849,0.091894,0.02856,1,500,"{'word2vec__sg': 1, 'word2vec__vector_size': 500}",0.675,0.6625,0.65,0.8,0.6625,0.69,0.055565,1
