# word2vec: How To Prep Word Vectors For Modeling

### Train Our Own Model

In [2]:
# Read in the data, clean it, split it into train and test sets, and then train a word2vec model
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

messages = pd.read_csv('data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]


messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'], test_size=0.2)

w2v_model = gensim.models.Word2Vec(X_train,
                                   #size=100,
                                   window=5,
                                   min_count=2)

### Prep Word Vectors

In [4]:
# Generate a list of words the word2vec model learned word vectors for
#w2v_model.wv.index2word
w2v_model.wv.index_to_key

['to',
 'you',
 'the',
 'and',
 'is',
 'in',
 'me',
 'my',
 'it',
 'your',
 'for',
 'call',
 'of',
 'have',
 'on',
 'that',
 'now',
 'so',
 'can',
 'are',
 'but',
 'not',
 'or',
 'do',
 'we',
 'if',
 'will',
 'get',
 'be',
 'ur',
 'no',
 'at',
 'with',
 'just',
 'this',
 'gt',
 'lt',
 'how',
 'up',
 'go',
 'free',
 'ok',
 'when',
 'll',
 'what',
 'from',
 'all',
 'out',
 'then',
 'know',
 'good',
 'got',
 'like',
 'he',
 'its',
 'day',
 'am',
 'come',
 'was',
 'love',
 'there',
 'time',
 'only',
 'want',
 'send',
 'going',
 'txt',
 'text',
 'as',
 'one',
 'home',
 'need',
 'still',
 'about',
 'don',
 'today',
 'lor',
 'by',
 'stop',
 'sorry',
 'reply',
 'back',
 'new',
 'dont',
 'see',
 'da',
 'later',
 'she',
 'tell',
 'think',
 'they',
 'please',
 'take',
 'hi',
 'did',
 'mobile',
 'pls',
 'our',
 'dear',
 'any',
 'who',
 'some',
 'phone',
 'week',
 'her',
 'here',
 'ì_',
 'been',
 'oh',
 'too',
 'wat',
 'where',
 'an',
 'happy',
 'claim',
 'much',
 'him',
 're',
 'well',
 'night',
 

In [7]:
# Generate aggregated sentence vectors based on the word vectors for each word in the sentence
# w2v_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key])
#                      for ls in X_test])
w2v_vect = np.array([np.mean([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key], axis=0) 
                     for ls in X_test if np.array([w2v_model.wv[i] for i in ls if i in w2v_model.wv.index_to_key]).size > 0])


In [8]:
# Why is the length of the sentence different than the length of the sentence vector?
for i, v in enumerate(w2v_vect):
    print(len(X_test.iloc[i]), len(v))

7 100
13 100
16 100
4 100
24 100
30 100
12 100
25 100
28 100
8 100
6 100
8 100
5 100
7 100
7 100
4 100
11 100
24 100
12 100
5 100
16 100
13 100
6 100
8 100
50 100
10 100
6 100
21 100
24 100
6 100
10 100
24 100
26 100
68 100
20 100
17 100
28 100
4 100
30 100
23 100
12 100
23 100
6 100
21 100
6 100
2 100
5 100
5 100
6 100
23 100
33 100
11 100
3 100
5 100
4 100
17 100
4 100
26 100
6 100
6 100
27 100
9 100
8 100
19 100
5 100
8 100
34 100
15 100
10 100
8 100
33 100
27 100
29 100
4 100
14 100
25 100
19 100
8 100
11 100
5 100
5 100
5 100
11 100
4 100
23 100
18 100
6 100
9 100
44 100
5 100
8 100
7 100
4 100
9 100
5 100
8 100
12 100
5 100
6 100
26 100
23 100
4 100
14 100
3 100
22 100
25 100
4 100
12 100
8 100
27 100
6 100
5 100
9 100
10 100
7 100
12 100
4 100
6 100
17 100
6 100
23 100
17 100
4 100
12 100
6 100
7 100
10 100
25 100
25 100
15 100
16 100
9 100
4 100
4 100
12 100
23 100
6 100
11 100
8 100
2 100
22 100
25 100
25 100
1 100
5 100
13 100
21 100
25 100
27 100
9 100
12 100
5 100
8 100
17 

In [9]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
w2v_vect_avg = []

for vect in w2v_vect:
    if len(vect)!=0:
        w2v_vect_avg.append(vect.mean(axis=0))
    else:
        w2v_vect_avg.append(np.zeros(100))

In [11]:
# Are our sentence vector lengths consistent?
# for i, v in enumerate(w2v_vect_avg):
#     print(len(X_test.iloc[i]), len(v))

# Are our sentence vector lengths consistent?
for i, v in enumerate(w2v_vect_avg):
    if isinstance(v, np.ndarray):
        print(len(X_test.iloc[i]), len(v))
    elif isinstance(v, np.float32):
        print(len(X_test.iloc[i]), v)


7 0.013711793
13 0.023129921
16 0.015465846
4 0.02116424
24 0.019511199
30 0.019448273
12 0.020509357
25 0.020377202
28 0.026550388
8 0.023002325
6 0.02602373
8 0.018566454
5 0.016458347
7 0.007882167
7 0.013440284
4 0.009965184
11 0.020393629
24 0.021232242
12 0.025678258
5 0.01412863
16 0.020050501
13 0.02063242
6 0.01839269
8 0.018825583
50 0.021551087
10 0.02165899
6 0.015595197
21 0.01937117
24 0.016402902
6 0.01588728
10 0.021329064
24 0.020169111
26 0.019027881
68 0.020629184
20 0.017572885
17 0.01923362
28 0.021712413
4 0.021261502
30 0.018331427
23 0.016464448
12 0.020590112
23 0.016299665
6 0.009241277
21 0.017958408
6 0.013583243
2 0.012463221
5 0.013752707
5 0.017357096
6 0.020432431
23 0.022608275
33 0.020548664
11 0.022286946
3 0.018276332
5 0.025985442
4 0.01941488
17 0.024656959
4 0.02414125
26 0.023062013
6 0.022279264
6 0.020387294
27 0.017721383
9 0.01446907
8 0.025325198
19 0.012652801
5 0.015609073
8 0.024195585
34 0.019738616
15 0.016443092
10 0.013192747
8 0.0215