# Word Embedding

## Simple example

In [1]:
from gensim.models import Word2Vec

# Toy corpus: every entry is one sentence, already split into tokens.
sentences = [
    'this is the first sentence for word2vec'.split(),
    'this is the second sentence'.split(),
    'yet another sentence'.split(),
    'one more sentence'.split(),
    'and the final sentence'.split(),
]

# Fit 50-dimensional embeddings on the toy corpus.
model = Word2Vec(sentences, vector_size=50, min_count=1)

# Show the vector learned for one word and the number of entries in model.wv.
word = "another"
print("another:", model.wv[word])
print("Model lenghth:", len(model.wv))

# Three nearest neighbours of the same word.
print("Most similar words:", model.wv.most_similar(positive=word, topn=3))

# Round-trip the model through disk and show the reloaded object.
model.save('model.bin')
new_model = Word2Vec.load('model.bin')
print(new_model)


another: [ 0.00855287  0.00015212 -0.01916856 -0.01933109 -0.01229639 -0.00025714
  0.00399483  0.01886394  0.0111687  -0.00858139  0.00055663  0.00992872
  0.01539662 -0.00228845  0.00864684 -0.01162876 -0.00160838  0.0162001
 -0.00472013 -0.01932691  0.01155852 -0.00785964 -0.00244575  0.01996103
 -0.0045127  -0.00951413 -0.01065877  0.01396178 -0.01141774  0.00422733
 -0.01051132  0.01224143  0.00871461  0.00521271 -0.00298217 -0.00549213
  0.01798587  0.01043155 -0.00432504 -0.01894062 -0.0148521  -0.00212748
 -0.00158989 -0.00512582  0.01936544 -0.00091704  0.01174752 -0.01489517
 -0.00501215 -0.01109973]
Model lenghth: 14
Most similar words: [('one', 0.1845843493938446), ('final', 0.13661059737205505), ('sentence', 0.13204392790794373)]
Word2Vec<vocab=14, vector_size=50, alpha=0.025>


## Pretrained Models

Check which pretrained models are available for download

In [2]:
import gensim.downloader

# Iterating the 'models' mapping yields the model names.
print(list(gensim.downloader.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


Downloading a model...

In [3]:
import gensim.downloader
# Load the pretrained 'glove-wiki-gigaword-50' vectors by name; the cells
# below index and query the returned object directly.
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

Printing the vector length (the embedding dimensionality)

In [4]:
# The embedding width is stored on the vectors object itself, so there is no
# need to look up an arbitrary vector by integer index just to measure it.
print(glove_vectors.vector_size)

50


Printing the vector of a given word

In [5]:
# Indexing with a word returns that word's embedding vector.
print(glove_vectors['twitter'])

[ 0.55473    0.14251    1.577      0.44222   -0.40965   -0.24373
 -1.2366    -0.64589    0.31804    0.48623   -0.20947    0.019861
 -0.28046   -0.64705    0.87607   -0.28965   -1.1877    -0.22703
  0.73132    0.064986   0.34437   -0.044798   0.85787    1.0463
  1.3781    -0.21831    0.45545   -0.36639   -0.32279   -0.34018
  1.5663    -0.028824   0.0062708 -0.62084   -1.3351     0.082663
 -0.085856  -0.67657   -1.1872    -0.40016    1.1583    -0.50842
 -1.8528     0.49679    0.94368   -0.97676    0.30505    0.15514
  0.26331   -0.10485  ]


Getting the most similar words

In [6]:
# Prints (word, similarity score) pairs for the nearest neighbours of 'twitter'.
print(glove_vectors.most_similar('twitter'))

[('facebook', 0.9333045482635498), ('myspace', 0.8801369667053223), ('youtube', 0.8430657982826233), ('blog', 0.8262057304382324), ('blogs', 0.8064824342727661), ('blogging', 0.7970671057701111), ('tumblr', 0.7901089787483215), ('email', 0.778261125087738), ('tweets', 0.7604537010192871), ('e-mail', 0.7538726925849915)]


Parallelogram (analogy) example: doctor − man + woman

In [7]:
# Analogy query: doctor - man + woman.
# Both arguments are passed as lists, the documented form. A bare string for
# `negative` is not reliably treated as a single key on every gensim 4.x
# release (NOTE(review): older releases appear to iterate it character by
# character - confirm against the installed version).
print(glove_vectors.most_similar(positive=['doctor', 'woman'], negative=['man']))

[('nurse', 0.8404642939567566), ('child', 0.7663259506225586), ('pregnant', 0.7570130228996277), ('mother', 0.7517457604408264), ('patient', 0.7516663074493408), ('physician', 0.7507280707359314), ('dentist', 0.7360343933105469), ('therapist', 0.7342537045478821), ('parents', 0.7286345958709717), ('surgeon', 0.7165213227272034)]


## Spam classification with document embedding

Text preprocessing (as usual)

In [8]:
import gensim
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
from sklearn.preprocessing import Normalizer


def transformText(text, remove_stopwords=False):
    """Normalise a raw message and split it into tokens.

    Processing order: lowercase, collapse repeated whitespace, optionally
    drop English stop words, strip punctuation, strip digits, collapse
    whitespace again, then tokenise with ``nltk.word_tokenize``.

    Parameters
    ----------
    text : str
        The raw message.
    remove_stopwords : bool, default False
        When True, words found in NLTK's English stop-word list are removed
        before punctuation is stripped. It is off by default so existing
        callers keep receiving exactly the same tokens, and so the stop-word
        list is not loaded on every call when it is not going to be used.

    Returns
    -------
    The value returned by ``nltk.word_tokenize`` for the cleaned text.
    """
    preprocessing = gensim.parsing.preprocessing

    # Lowercase and collapse repeated whitespace
    text = preprocessing.strip_multiple_whitespaces(text.lower())

    if remove_stopwords:
        # Only load the stop-word list when it is actually applied
        stops = set(stopwords.words("english"))
        kept_words = [word for word in text.split() if word not in stops]
    else:
        kept_words = text.split()
    # Re-join on single spaces (this also trims leading/trailing whitespace)
    text = " ".join(kept_words)

    # Remove the punctuation
    text = preprocessing.strip_punctuation(text)
    # Strip all the numerics
    text = preprocessing.strip_numeric(text)
    # The two steps above can leave gaps behind, so collapse whitespace again
    text = preprocessing.strip_multiple_whitespaces(text)
    return nltk.word_tokenize(text)

Embedding transformer

In [9]:
from sklearn.base import BaseEstimator,TransformerMixin

class MeanEmbeddingVectorizer(BaseEstimator,TransformerMixin):
    """Represent each tokenised document by the mean of its word vectors.

    ``model`` is accessed in three ways: ``model[0]`` (to measure the vector
    length), ``token in model`` and ``model[token]``.
    """

    def __init__(self, model):
        self.model = model
        # Vector length; used to build the zero vectors that stand in for
        # unknown tokens and for empty documents.
        self.dim = len(self.model[0])

    def fit(self, X, y=None):
        """Nothing is learned here; return the transformer unchanged."""
        return self

    def transform(self, X, y=None):
        """Return an array holding one averaged vector per document in ``X``.

        Each document is iterated token by token. A token missing from the
        model contributes a zero vector to the average; a document without
        any token becomes a zero vector of length ``self.dim``.
        """
        doc_vectors = []
        for tokens in X:
            word_vectors = [
                self.model[token] if token in self.model else np.zeros(self.dim)
                for token in tokens
            ]
            if word_vectors:
                doc_vectors.append(np.mean(word_vectors, axis=0))
            else:
                doc_vectors.append(np.zeros(self.dim))
        return np.array(doc_vectors)

Pretrained model loading and pipeline instantiation

In [10]:
import gensim.downloader
from sklearn import svm
from sklearn.pipeline import Pipeline

# Show the available model names, then load the 100-dimensional GloVe vectors.
print(list(gensim.downloader.info()['models'].keys()))
glove_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

# Two-step pipeline: averaged word embeddings feeding an RBF-kernel SVM.
clf = Pipeline(steps=[
    ("emb", MeanEmbeddingVectorizer(glove_vectors)),
    ("clf", svm.SVC(C=1000, gamma=0.001, kernel="rbf")),
])



['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


Data loading, and training/test set splitting

In [11]:
from sklearn.model_selection import train_test_split

# Read the SMS corpus and replace every raw message with its token list.
dataset = pd.read_csv("sms_spam.csv")
dataset['text'] = dataset['text'].map(transformText)

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['text'],
    dataset['type'],
    test_size=0.33,
    random_state=10,
)

Training

In [12]:
# Fit the whole pipeline (embedding step, then the SVM) on the training split.
clf.fit(X_train, y_train)

Prediction

In [13]:
from sklearn import metrics

# Label every held-out message with the fitted pipeline.
predicted = clf.predict(X_test)

# Confusion table (rows: true label, columns: predicted label), followed by
# the per-class precision / recall / F1 report.
print(pd.crosstab(y_test, predicted))
print(metrics.classification_report(y_test, predicted))

col_0   ham  spam
type             
ham    1564    19
spam     44   208
              precision    recall  f1-score   support

         ham       0.97      0.99      0.98      1583
        spam       0.92      0.83      0.87       252

    accuracy                           0.97      1835
   macro avg       0.94      0.91      0.92      1835
weighted avg       0.96      0.97      0.96      1835

