# doc2vec: How To Prep Document Vectors For Modeling

### Train Our Own Model

In [3]:
# Read in data, clean it, split it into train/test, and then train a doc2vec model
import gensim
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)

# One fixed seed shared by the train/test split and the doc2vec model so that
# re-running the notebook from a fresh kernel uses the same split every time.
SEED = 42

# Load the raw messages, discard the three unnamed extra columns and give the
# two remaining columns readable names.
messages = pd.read_csv('./data/spam.csv', encoding='latin-1')
messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["label", "text"]

# Tokenize every message; the function is passed directly (no wrapper lambda needed).
messages['text_clean'] = messages['text'].apply(gensim.utils.simple_preprocess)

# Hold out 20% of the messages for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(messages['text_clean'],
                                                    messages['label'],
                                                    test_size=0.2,
                                                    random_state=SEED)

# doc2vec trains on TaggedDocument objects; each training message is tagged
# with its position in X_train.
tagged_docs_tr = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(X_train)]

# NOTE: `seed` fixes the model's random initialisation. gensim documents that
# fully deterministic training additionally requires workers=1 and a fixed
# PYTHONHASHSEED; those are left at their defaults here to keep training speed unchanged.
d2v_model = gensim.models.Doc2Vec(tagged_docs_tr,
                                  vector_size=50,
                                  window=2,
                                  min_count=2,
                                  seed=SEED)

In [4]:
# Reminder of what a document vector looks like: infer one for a short,
# already-tokenized example sentence and display it.
sample_tokens = ['convert', 'words', 'to', 'vectors']
d2v_model.infer_vector(sample_tokens)


array([ 0.0160803 ,  0.01025523,  0.03182423,  0.02497851,  0.00465351,
        0.00447948,  0.01410904, -0.03273206, -0.00268471, -0.006954  ,
        0.00785535,  0.00921986,  0.01115193,  0.0028922 , -0.01799711,
        0.01185948, -0.00047327, -0.03291325, -0.00081088, -0.00520088,
        0.03184735, -0.01408158,  0.00842579,  0.00569051,  0.00489169,
       -0.00069536,  0.01676262,  0.01821817, -0.02275796, -0.00573135,
       -0.01581258,  0.01050034,  0.02330843,  0.01478808, -0.01017254,
       -0.00776446,  0.00073255,  0.00584688,  0.01352263, -0.01278319,
        0.03053902,  0.02606562,  0.01142512, -0.00947258, -0.00339353,
       -0.00115252, -0.00766568,  0.00707489,  0.00546808,  0.00156479],
      dtype=float32)

In [5]:
# Prepare the vectors for a machine learning model: infer one document vector
# per tokenized test message.
# NOTE(review): every inferred vector is wrapped in its own single-item list,
# so each entry of `vectors` displays as `[array(...)]`. An estimator expecting
# a flat 2-D feature matrix would presumably need these unwrapped first -
# confirm whether the extra nesting is intended.
vectors = []
for words in X_test:
    vectors.append([d2v_model.infer_vector(words)])

In [6]:
# Display the entry for the first test message: a one-item list holding its inferred vector.
vectors[0]

[array([ 0.03175561,  0.01439424,  0.04863995,  0.04242408,  0.00370294,
         0.01031234,  0.01245122, -0.05891113, -0.01267266, -0.01531839,
         0.00199126,  0.02146759,  0.03133297,  0.01253926, -0.03108902,
         0.02072878,  0.01061508, -0.05465336,  0.00413387, -0.00840401,
         0.0490956 , -0.03260579,  0.01115101,  0.02657761, -0.00176361,
        -0.00407269,  0.0336531 ,  0.05472916, -0.04284498,  0.00895467,
        -0.03518188,  0.01422446,  0.03778114,  0.02076449, -0.02059564,
         0.00427171, -0.01031834,  0.01640993,  0.01220672, -0.02292535,
         0.05847409,  0.07545223,  0.01042801, -0.00586517,  0.01101934,
        -0.00112158, -0.00489529, -0.00517681,  0.00772747,  0.01563663],
       dtype=float32)]

In [7]:
# Display the entry for the second test message, for comparison with the first.
vectors[1]

[array([ 0.05390054,  0.04178806,  0.0731267 ,  0.0703095 , -0.00513255,
         0.01171278,  0.0126497 , -0.08593671, -0.00895066, -0.02875345,
         0.01776839,  0.01333664,  0.04228383,  0.01407511, -0.03641236,
         0.02997456,  0.00567681, -0.0907494 , -0.00569733,  0.00560542,
         0.07772337, -0.04426915,  0.01667151,  0.0448153 ,  0.00234522,
        -0.02207201,  0.04176671,  0.06389876, -0.06267015,  0.01604028,
        -0.03875209,  0.0245656 ,  0.04605745,  0.02375552, -0.02915484,
         0.00389715, -0.0092312 ,  0.01014214,  0.01849409, -0.04075533,
         0.07164147,  0.10245278,  0.02752741, -0.02072025,  0.00768327,
        -0.00224492, -0.00579937,  0.00574254,  0.02090523,  0.02146467],
       dtype=float32)]