### Training Own embeddings

In [35]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')

In [36]:
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "If he cares about caring, then he should care about caring about caring.",
    "If he began to care, then he should begin to care about caring about caring.",
    "123 the world is large 32.34",
    'He stripped the striped paint by stripping the first coat of paint.'
]

In [37]:
processed_docs = [doc.lower().replace(".","") for doc in corpus]
processed_docs

['this is the first document',
 'this document is the second document',
 'if he cares about caring, then he should care about caring about caring',
 'if he began to care, then he should begin to care about caring about caring',
 '123 the world is large 3234',
 'he stripped the striped paint by stripping the first coat of paint']

In [38]:
tokenized_corpus = [sentence.split() for sentence in processed_docs]

- In Gensim's Word2Vec model, the input typically consists of a list of sentences, where each sentence is represented as a list of words.

In [39]:
tokenized_corpus

[['this', 'is', 'the', 'first', 'document'],
 ['this', 'document', 'is', 'the', 'second', 'document'],
 ['if',
  'he',
  'cares',
  'about',
  'caring,',
  'then',
  'he',
  'should',
  'care',
  'about',
  'caring',
  'about',
  'caring'],
 ['if',
  'he',
  'began',
  'to',
  'care,',
  'then',
  'he',
  'should',
  'begin',
  'to',
  'care',
  'about',
  'caring',
  'about',
  'caring'],
 ['123', 'the', 'world', 'is', 'large', '3234'],
 ['he',
  'stripped',
  'the',
  'striped',
  'paint',
  'by',
  'stripping',
  'the',
  'first',
  'coat',
  'of',
  'paint']]

#### Training the model
- using CBOW Architecture for trainnig
- using skipgrams

In [40]:
model_cbow = Word2Vec(tokenized_corpus, min_count=1,sg=0)

In [41]:
model_sg = Word2Vec(tokenized_corpus, min_count=1,sg=1)

In [42]:
words = list(model_cbow.wv.index_to_key)

In [43]:
print(words)

['the', 'he', 'about', 'caring', 'is', 'document', 'to', 'first', 'if', 'then', 'should', 'care', 'this', 'paint', 'care,', '3234', 'stripping', 'by', 'striped', 'second', 'stripped', 'cares', 'began', 'large', 'caring,', 'world', '123', 'begin', 'coat', 'of']


In [45]:
print(model_cbow.wv['caring'])

[-8.2417224e-03  9.3069859e-03 -1.9400308e-04 -1.9619826e-03
  4.6059205e-03 -4.0963856e-03  2.7420030e-03  6.9434377e-03
  6.0643940e-03 -7.5113317e-03  9.3865292e-03  4.6708258e-03
  3.9683753e-03 -6.2431861e-03  8.4615955e-03 -2.1535533e-03
  8.8303620e-03 -5.3642010e-03 -8.1347115e-03  6.8240981e-03
  1.6685956e-03 -2.2035337e-03  9.5159374e-03  9.4898753e-03
 -9.7755836e-03  2.5046265e-03  6.1526150e-03  3.8772465e-03
  2.0249460e-03  4.3281142e-04  6.7795103e-04 -3.8275255e-03
 -7.1406630e-03 -2.0937026e-03  3.9244429e-03  8.8162124e-03
  9.2613585e-03 -5.9763086e-03 -9.4033480e-03  9.7637698e-03
  3.4270217e-03  5.1649651e-03  6.2814471e-03 -2.8025953e-03
  7.3196734e-03  2.8309938e-03  2.8748913e-03 -2.3824151e-03
 -3.1320578e-03 -2.3670357e-03  4.2815907e-03  7.2927229e-05
 -9.5865065e-03 -9.6653663e-03 -6.1536096e-03 -1.3028183e-04
  2.0011414e-03  9.4359834e-03  5.5812397e-03 -4.2917039e-03
  2.7557116e-04  4.9664853e-03  7.6976130e-03 -1.1438169e-03
  4.3216646e-03 -5.81023

In [47]:
print(model_cbow.wv.most_similar('caring'))

[('care,', 0.1785808503627777), ('document', 0.13154162466526031), ('if', 0.07507099211215973), ('he', 0.06841271370649338), ('caring,', 0.048295751214027405), ('of', 0.04736408591270447), ('care', 0.041895244270563126), ('to', 0.041559699922800064), ('by', 0.04119827598333359), ('began', 0.04086209461092949)]


In [49]:
print("Similarity between first and second:",model_cbow.wv.similarity('second', 'first'))

Similarity between first and second: 0.31903815


In [51]:
print(model_sg.wv['caring'])

[-8.2111917e-03  9.3490556e-03 -1.9070403e-04 -1.9457607e-03
  4.6252515e-03 -4.0966938e-03  2.7368681e-03  6.9593340e-03
  6.0549006e-03 -7.5082253e-03  9.4071804e-03  4.6621948e-03
  3.9819605e-03 -6.2519521e-03  8.4713567e-03 -2.1818264e-03
  8.8498266e-03 -5.3782323e-03 -8.1616128e-03  6.8055741e-03
  1.6460215e-03 -2.2255767e-03  9.5126741e-03  9.4598019e-03
 -9.7923148e-03  2.4846932e-03  6.1371913e-03  3.9014141e-03
  2.0335470e-03  4.3660274e-04  6.8532838e-04 -3.8686984e-03
 -7.1487995e-03 -2.1138131e-03  3.9072637e-03  8.7929899e-03
  9.2664184e-03 -5.9681204e-03 -9.4147585e-03  9.7453073e-03
  3.3953465e-03  5.1629795e-03  6.2842560e-03 -2.7789690e-03
  7.3110340e-03  2.8341843e-03  2.8904441e-03 -2.3910732e-03
 -3.1514168e-03 -2.3421017e-03  4.2983266e-03  5.8409623e-05
 -9.5848190e-03 -9.6739521e-03 -6.1878818e-03 -1.5248719e-04
  2.0155266e-03  9.4482694e-03  5.5593834e-03 -4.3015340e-03
  2.7122017e-04  4.9595912e-03  7.7095903e-03 -1.1371282e-03
  4.2969016e-03 -5.77577

In [52]:
print(model_sg.wv.most_similar('caring'))
print("Similarity between first and second:",model_sg.wv.similarity('second', 'first'))

[('care,', 0.17995303869247437), ('document', 0.13225498795509338), ('if', 0.07564379274845123), ('he', 0.06894130259752274), ('caring,', 0.048580072820186615), ('of', 0.04763174057006836), ('care', 0.04672780632972717), ('to', 0.041590772569179535), ('by', 0.04135335609316826), ('began', 0.04047591984272003)]
Similarity between first and second: 0.3191207


### Spacy

In [53]:
import spacy

In [54]:
nlp = spacy.load("en_core_web_sm")

In [55]:
for doc in processed_docs:
    doc_nlp = nlp(doc) #creating a spacy "Doc" object which is a container for accessing linguistic annotations. 
    
    print("-"*30)
    print("Average Vector of '{}'\n".format(doc),doc_nlp.vector)#this gives the average vector of each document
    for token in doc_nlp:
        print()
        print(token.text,token.vector)

------------------------------
Average Vector of 'this is the first document'
 [ 1.06188253e-01 -8.40984359e-02 -7.30575174e-02  4.40065563e-01
  2.81971395e-01 -2.71542788e-01  4.37333405e-01  1.30509143e-03
 -2.68775612e-01  4.38305050e-01  1.17133543e-01  2.37281114e-01
 -4.46023285e-01  2.31346875e-01 -6.18541121e-01 -2.73813494e-02
  4.79420945e-02 -6.88767582e-02 -3.92893851e-01  7.09979117e-01
 -2.79510558e-01  1.27013579e-01 -6.40684903e-01 -2.59066582e-01
 -1.58308029e-01 -3.50103050e-01  1.66394398e-01  4.03560251e-01
  8.53817821e-01  4.82782513e-01  7.55635262e-01  7.35163689e-04
  2.89864987e-01 -7.48218238e-01  2.88738668e-01  1.85084030e-01
  4.21226323e-01  2.69632667e-01 -3.24331527e-03  5.37215531e-01
 -2.77827501e-01  6.50483966e-01 -2.97513187e-01  3.12971890e-01
  4.76815775e-02  7.78637350e-01 -3.71743619e-01  9.12731662e-02
  2.13861078e-01 -1.90299839e-01 -8.29162896e-01  1.03423643e+00
 -2.83895254e-01 -7.31662959e-02 -2.16857679e-02 -6.12796023e-02
 -4.0233992