In [83]:
import pandas as pd
pd.set_option("display.max_colwidth", 100)
import numpy as np

from sklearn.model_selection import train_test_split

## word2vec

In [84]:
import gensim
import gensim.downloader as api 

In [85]:
# Load pretrained word vectors through the gensim downloader.
# NOTE(review): the model name suggests 100-dimensional GloVe vectors trained on
# Wikipedia + Gigaword; the downloader presumably fetches and caches the file
# on first use -- confirm before running offline.
wiki_embeddings = api.load("glove-wiki-gigaword-100")

In [86]:
# Look up the pretrained embedding for the word "king"
wiki_embeddings.get_vector("king")

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

In [87]:
# Ten nearest neighbours of "king" according to the pretrained vectors
wiki_embeddings.most_similar(positive=["king"], topn=10)

[('prince', 0.7682329416275024),
 ('queen', 0.7507689595222473),
 ('son', 0.7020888328552246),
 ('brother', 0.6985775828361511),
 ('monarch', 0.6977890729904175),
 ('throne', 0.691999077796936),
 ('kingdom', 0.6811410188674927),
 ('father', 0.6802029013633728),
 ('emperor', 0.6712858080863953),
 ('ii', 0.6676074266433716)]

In [88]:
# Load the spam dataset, drop the three "Unnamed" columns, and give the two
# remaining columns readable names -- all in one chain.
data = (
    pd.read_csv("./../DATA/spam.csv", encoding="latin-1")
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .set_axis(["label", "text"], axis=1)
)

In [89]:
# Preview the first five rows of the cleaned-up frame
data.head(n=5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [90]:
# Tokenize every message with gensim's built-in preprocessor; the function is
# passed directly because it already takes a single document argument.
data["cleaned_text"] = data["text"].apply(gensim.utils.simple_preprocess)

In [91]:
# Preview again to confirm the new cleaned_text column is present
data.head(n=5)

Unnamed: 0,label,text,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...","[go, until, jurong, point, crazy, available, only, in, bugis, great, world, la, buffet, cine, th..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,"[free, entry, in, wkly, comp, to, win, fa, cup, final, tkts, st, may, text, fa, to, to, receive,..."
3,ham,U dun say so early hor... U c already then say...,"[dun, say, so, early, hor, already, then, say]"
4,ham,"Nah I don't think he goes to usf, he lives around here though","[nah, don, think, he, goes, to, usf, he, lives, around, here, though]"


In [92]:
# Split data into train and test sets.
# random_state pins the shuffle so the split -- and therefore the vocabulary the
# Word2Vec model learns from X_train -- is identical on every
# Restart Kernel -> Run All. Without it, a word such as "king" may or may not
# survive min_count filtering from one run to the next.
X = data["cleaned_text"]
y = data["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [93]:
# Train the word2vec model on the training messages only.
# seed fixes the weight initialisation and sampling; workers=1 removes the
# thread-scheduling jitter that otherwise makes every run produce different
# vectors. (gensim notes that fully deterministic runs on Python 3 also need
# PYTHONHASHSEED set before the kernel starts.)
w2v_model = gensim.models.Word2Vec(
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context words considered on each side of the target
    min_count=2,      # ignore words that appear fewer than 2 times
    seed=42,
    workers=1,
)
w2v_model.build_vocab(X_train)
# Last expression: train() returns a tuple of word counts, shown as cell output.
w2v_model.train(X_train, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)

(241916, 317290)

In [94]:
# Explore the word vector for "king" based on our trained model
w2v_model.wv.get_vector("king")

array([-0.05201751,  0.05199692,  0.02610636,  0.00249891,  0.00444603,
       -0.12647581,  0.06698973,  0.15534317, -0.05369888, -0.0716176 ,
       -0.04915069, -0.07898469, -0.01976242,  0.02804947,  0.02965663,
       -0.05393755,  0.00526644, -0.04550261, -0.00899211, -0.14256194,
        0.05493692,  0.02498526,  0.05035293, -0.06732663, -0.02427477,
        0.00092512, -0.06136346, -0.04174254, -0.03787519,  0.01552656,
        0.05572938,  0.00255378,  0.02925463, -0.07840391, -0.02743652,
        0.09920113,  0.0228102 , -0.03637904, -0.04697398, -0.1095472 ,
        0.00983425, -0.07732075, -0.03736102,  0.01495052,  0.07554429,
       -0.0080024 , -0.05246867, -0.00321334,  0.03820533,  0.01366043,
        0.03317934, -0.0719398 ,  0.00064927,  0.00182583, -0.03952695,
        0.00532693,  0.05121961, -0.00247967, -0.08785013,  0.02707108,
        0.00113539,  0.04829653, -0.0002375 ,  0.0156558 , -0.08421139,
        0.07579385, -0.00157904,  0.05663024, -0.10685826,  0.10

In [95]:
# Ten nearest neighbours of "king" according to the model trained on the SMS corpus
w2v_model.wv.most_similar(positive=["king"], topn=10)

[('thanks', 0.9949692487716675),
 ('didnt', 0.9949305653572083),
 ('our', 0.9948911070823669),
 ('box', 0.9948787093162537),
 ('xmas', 0.9948737621307373),
 ('camera', 0.9948344230651855),
 ('to', 0.9948233962059021),
 ('more', 0.9948000907897949),
 ('those', 0.9947593808174133),
 ('or', 0.9947556853294373)]

In [96]:
# Confirm the vector length matches the vector_size the model was built with
w2v_model.wv.get_vector("king").shape

(100,)

In [98]:
# Cosine similarity between the trained vectors of "king" and "queen"
w2v_model.wv.similarity(w1="king", w2="queen")

0.7177754

In [99]:
# Get the normalised vectors for the words in the trained model's vocabulary.
# The result is a 2-D float32 array (see the output below); presumably one row
# per vocabulary word, in the same order as w2v_model.wv.index_to_key -- confirm.
w2v_model.wv.get_normed_vectors()

array([[-0.07795167,  0.08224536,  0.03171175, ..., -0.1042738 ,
         0.02183773, -0.00191008],
       [-0.07895141,  0.08146629,  0.03221877, ..., -0.10575425,
         0.02096343,  0.00175117],
       [-0.07287569,  0.08219016,  0.02569381, ..., -0.09859344,
         0.02777221, -0.0061619 ],
       ...,
       [-0.06356505,  0.12715638, -0.02951374, ..., -0.15374829,
         0.0733623 ,  0.03816374],
       [-0.07138442,  0.03804385,  0.06966289, ..., -0.04707469,
         0.02738026,  0.03870334],
       [-0.07949615,  0.09303778,  0.00032555, ..., -0.09124443,
         0.0238862 ,  0.01884885]], dtype=float32)

In [100]:
# Show every word the word2vec model learned a vector for (from X_train),
# followed by the size of that vocabulary.
vocab_words = w2v_model.wv.index_to_key
print(vocab_words)
print(len(vocab_words))

['you', 'to', 'the', 'and', 'in', 'is', 'me', 'my', 'it', 'for', 'your', 'call', 'of', 'have', 'that', 'on', 'now', 'are', 'can', 'so', 'but', 'not', 'or', 'ur', 'do', 'we', 'at', 'get', 'be', 'if', 'just', 'will', 'with', 'no', 'this', 'gt', 'lt', 'up', 'how', 'when', 'from', 'what', 'ok', 'go', 'll', 'free', 'all', 'know', 'good', 'out', 'like', 'am', 'day', 'there', 'he', 'was', 'got', 'then', 'come', 'its', 'time', 'love', 'only', 'want', 'text', 'send', 'as', 'txt', 'one', 'about', 'stop', 'see', 'sorry', 'going', 'today', 'don', 'home', 'by', 'need', 'mobile', 'she', 'da', 'lor', 'tell', 'still', 'reply', 'back', 'hi', 'dont', 'new', 'our', 'later', 'think', 'they', 'take', 'pls', 'please', 'did', 'phone', 'any', 'some', 'been', 'here', 'week', 'an', 'dear', 'her', 'night', 'claim', 'much', 'where', 'has', 'ì_', 'great', 'well', 'hope', 'who', 're', 'msg', 'oh', 'hey', 'him', 'wat', 'too', 'more', 'happy', 'had', 'give', 'work', 'message', 'yes', 've', 'make', 'won', 'said', 'tom

In [101]:
# Print the first five tokenized test messages (no sentence vectors are
# computed here -- this only displays the token lists).
for tokens in X_test.head(5):
    print(tokens)

['ok', 'which', 'your', 'another', 'number']
['free', 'day', 'sexy', 'st', 'george', 'day', 'pic', 'of', 'jordan', 'txt', 'pic', 'to', 'dont', 'miss', 'out', 'then', 'every', 'wk', 'saucy', 'celeb', 'more', 'pics', 'pocketbabe', 'co', 'uk', 'wk']
['hey', 'mate', 'hows', 'honey', 'did', 'ave', 'good', 'holiday', 'gimmi', 'de', 'goss']
['its', 'good', 'we', 'll', 'find', 'way']
['am', 'not', 'interested', 'to', 'do', 'like', 'that']


In [102]:
# Reduce dimension: configure a PCA that keeps 3 principal components, so the
# projected word vectors can be drawn in a 3-D scatter plot.
from sklearn.decomposition import PCA

pca = PCA(n_components=3)

In [103]:
# Fit the PCA on the normalised word vectors and project them onto its components
word_vectors = w2v_model.wv.get_normed_vectors()
X_pca = pca.fit_transform(word_vectors)

In [104]:
# Sanity check: one row per projected word vector, 3 columns (n_components=3)
X_pca.shape

(3460, 3)

In [105]:
import plotly.express as px
import plotly.io as pio

# Set the renderer to open in a browser
pio.renderers.default = 'browser'

# Each row of X_pca is a vocabulary WORD (row i <-> w2v_model.wv.index_to_key[i]),
# not an SMS message. The previous version coloured these points with
# y[50:150] -- the ham/spam labels of messages 50..149 -- which has no
# relationship to the words being plotted. Label each point with its word
# instead so the plot is interpretable.
word_slice = slice(50, 150)  # skip the 50 most frequent (mostly stop) words
pca_words = pd.DataFrame(X_pca[word_slice], columns=["PC1", "PC2", "PC3"])
pca_words["word"] = w2v_model.wv.index_to_key[word_slice]

fig = px.scatter_3d(
    pca_words,
    x="PC1",
    y="PC2",
    z="PC3",
    hover_name="word",
    title="Word2Vec vocabulary (frequency ranks 50-149) projected onto 3 principal components",
)
fig.show()