#### IMPORTING MODULES

In [1]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
% matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#nltk
import nltk

#stop-words
from nltk.corpus import stopwords
stop_words=set(nltk.corpus.stopwords.words('english'))

# tokenizing
from nltk import word_tokenize,sent_tokenize

#keras
import keras
from keras.preprocessing.text import one_hot,Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input
from keras.models import Model

Using TensorFlow backend.


#### CREATING SAMPLE CORPUS OF DOCUMENTS ie TEXTS

In [2]:
# Three short example documents that make up the toy corpus.
corp = [
    "one in hand is better than two in bush",
    "bush was the president of US",
    "India has only one president",
]

# Keep the individual documents available under their own names as well.
sample_text_1, sample_text_2, sample_text_3 = corp


#### INTEGER ENCODING ALL THE DOCUMENTS

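After this step every word is represented by an integer. For this we use the `one_hot` function from Keras. Note that `one_hot` hashes each word into a fixed range of integers below `vocab_size`, so choosing a larger `vocab_size` makes it less likely that two different words receive the same integer, but it does not guarantee a unique integer for every word (in the output below, for example, 'bush' and 'of' both map to 26).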

**Note one important thing: the integer assigned to a word is the same in every document, e.g. 'one' is 10 in both the first and third documents in the output below (the exact integers may differ from run to run).**

In [3]:
# Size of the hashing space used by one_hot. one_hot hashes every word to an
# integer in [1, vocab_size); a larger value lowers the chance that two different
# words share an integer, but collisions remain possible. Use a Tokenizer (or
# your own word -> index mapping) when a collision-free encoding is required.
vocab_size=50
encod_corp=[]
for i,doc in enumerate(corp):
    # Encode each document once and reuse the result for both storing and printing.
    encoded_doc=one_hot(doc,vocab_size)
    encod_corp.append(encoded_doc)
    print("The encoding for document",i+1," is : ",encoded_doc)

The encoding for document 1  is :  [10, 35, 45, 30, 8, 43, 44, 35, 26]
The encoding for document 2  is :  [26, 17, 44, 3, 26, 19]
The encoding for document 3  is :  [36, 33, 22, 10, 3]


#### PADDING THE DOCS (to make every doc of same length)

The Keras Embedding layer requires all individual documents to be of the same length, hence we will pad the shorter documents with 0 for now. Therefore the 'input_length' of the Keras Embedding layer will be equal to the length, i.e. the number of words, of the longest document.

In [4]:
# Length of the longest document; needed as the padding length and as the
# Embedding layer's input_length.
# It is measured on the encoded sequences so that it always agrees with what
# pad_sequences receives: nltk.word_tokenize and Keras one_hot can split a text
# differently (e.g. around punctuation), which would make the two lengths disagree.
# default=0 keeps the cell from failing on an empty corpus.
maxi=max((len(encoded) for encoded in encod_corp),default=0)
print("The maximum number of words in any document is : ",maxi)

The maximum number of words in any document is :  9


In [5]:
# The embedding model needs equally long inputs, so every encoded document is
# right-padded ('post') with zeros up to maxi entries.
pad_corp = pad_sequences(
    encod_corp,
    maxlen=maxi,
    padding='post',
    value=0.0,
)
print("No of padded documents: ", len(pad_corp))

No of padded documents:  3


In [6]:
# Show each padded document with a 1-based document number.
for doc_no, padded_doc in enumerate(pad_corp, start=1):
    print("The padded encoding for document", doc_no, " is : ", padded_doc)

The padded encoding for document 1  is :  [10 35 45 30  8 43 44 35 26]
The padded encoding for document 2  is :  [26 17 44  3 26 19  0  0  0]
The padded encoding for document 3  is :  [36 33 22 10  3  0  0  0  0]


#### ACTUALLY CREATING THE EMBEDDINGS using KERAS EMBEDDING LAYER

Now all the documents are of the same length, so we are ready to create and use the embeddings.

**I will embed the words into vectors of 8 dimensions.**

In [7]:
# NOTE(review): this tensor is not referenced by any later cell visible here, and the
# name `input` shadows Python's built-in input() — consider removing this cell.
input=Input(shape=(3,9),dtype='float64')

In [8]:
# One input row per document: maxi integer word indices (0 is the padding value).
# The shape uses maxi rather than a literal so it always matches input_length below,
# and the dtype is an integer type because the Embedding layer looks words up by index.
word_input=Input(shape=(maxi,),dtype='int32')
# Map each of the maxi word indices to an 8-dimensional vector.
word_embedding=Embedding(input_dim=vocab_size,output_dim=8,input_length=maxi)(word_input)
# Flatten the (maxi, 8) block of each document into a single vector.
word_vec=Flatten()(word_embedding)
# Combine input -> embedding -> flatten into a Keras model.
embed_model =Model([word_input],word_vec)

In [9]:
# Compile the model; optimizer, loss and metrics can be tuned as always.
# NOTE(review): no training call is visible in this notebook, so the later predict
# call appears to use the freshly initialised embedding weights — confirm.
embed_model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [10]:
# Show the Python type of the embedding layer's output, then the symbolic tensor itself.
for shown in (type(word_embedding), word_embedding):
    print(shown)

<class 'tensorflow.python.framework.ops.Tensor'>
Tensor("embedding_1/embedding_lookup:0", shape=(?, 9, 8), dtype=float32)


In [11]:
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only adds a stray "None" line after the table.
embed_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 9)                 0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 9, 8)              400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 72)                0         
Total params: 400
Trainable params: 400
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# Run the padded corpus through the model: one flattened embedding row per document.
embeddings = embed_model.predict(x=pad_corp)

In [13]:
# Report the array's dimensions first, then its values.
print("Shape of embeddings : ", np.shape(embeddings))
print(embeddings)

Shape of embeddings :  (3, 72)
[[ 0.00130282 -0.00596644  0.02325323 -0.04432362 -0.04526334 -0.02512041
  -0.02512375  0.01176981 -0.04361005  0.0228055  -0.01651942  0.04873116
   0.03909346  0.02524883  0.01447764  0.0360704  -0.02517453  0.01186245
   0.03737007 -0.02054998 -0.02924714 -0.01061837 -0.04460711  0.0448385
   0.04469622  0.02031794  0.01015266  0.04472108 -0.04150879 -0.02770575
  -0.0303477  -0.02566599 -0.00362151 -0.01438711 -0.0116431  -0.04811322
  -0.01641407  0.0200397   0.04407834 -0.02589554 -0.01004182  0.00317496
  -0.04504983  0.02963866 -0.02431839 -0.03926616 -0.03195803  0.0107885
  -0.02775555 -0.00507268  0.04591706  0.01409023  0.04677561  0.04704653
   0.02259154 -0.04589155 -0.04361005  0.0228055  -0.01651942  0.04873116
   0.03909346  0.02524883  0.01447764  0.0360704  -0.04570699 -0.03431757
   0.04339368 -0.02433269  0.0262607   0.03213033  0.03852773 -0.04842984]
 [-0.04570699 -0.03431757  0.04339368 -0.02433269  0.0262607   0.03213033
   0.038

In [14]:
# Undo the Flatten step: one (maxi, embedding_dim) block per document.
# The last axis is inferred with -1 instead of repeating the literal 8, so this
# cell stays correct if the Embedding layer's output_dim is changed.
embeddings=embeddings.reshape(len(pad_corp),maxi,-1)
print("Shape of embeddings : ",embeddings.shape)
print(embeddings)

Shape of embeddings :  (3, 9, 8)
[[[ 0.00130282 -0.00596644  0.02325323 -0.04432362 -0.04526334
   -0.02512041 -0.02512375  0.01176981]
  [-0.04361005  0.0228055  -0.01651942  0.04873116  0.03909346
    0.02524883  0.01447764  0.0360704 ]
  [-0.02517453  0.01186245  0.03737007 -0.02054998 -0.02924714
   -0.01061837 -0.04460711  0.0448385 ]
  [ 0.04469622  0.02031794  0.01015266  0.04472108 -0.04150879
   -0.02770575 -0.0303477  -0.02566599]
  [-0.00362151 -0.01438711 -0.0116431  -0.04811322 -0.01641407
    0.0200397   0.04407834 -0.02589554]
  [-0.01004182  0.00317496 -0.04504983  0.02963866 -0.02431839
   -0.03926616 -0.03195803  0.0107885 ]
  [-0.02775555 -0.00507268  0.04591706  0.01409023  0.04677561
    0.04704653  0.02259154 -0.04589155]
  [-0.04361005  0.0228055  -0.01651942  0.04873116  0.03909346
    0.02524883  0.01447764  0.0360704 ]
  [-0.04570699 -0.03431757  0.04339368 -0.02433269  0.0262607
    0.03213033  0.03852773 -0.04842984]]

 [[-0.04570699 -0.03431757  0.04339368 

The resulting shape is (3,9,8).

**3---> no of documents**

**9---> each document is made of 9 words which was our 'maxi' variable**

**& 8---> each word is 8 dimensional.**

To check this, note that the embedding vectors for the padding value '0' are identical to each other and appear at the very end of the output for the padded documents.

#### GETTING ENCODING FOR A PARTICULAR WORD IN A SPECIFIC DOCUMENT

In [15]:
# Walk over every document and every word position (both numbered from 1)
# and print the embedding vector found there.
for doc_no, doc_vectors in enumerate(embeddings, start=1):
    for word_no, word_vector in enumerate(doc_vectors, start=1):
        print("The encoding for ", word_no, "th word", "in", doc_no, "th document is : \n\n", word_vector)

The encoding for  1 th word in 1 th document is : 

 [ 0.00130282 -0.00596644  0.02325323 -0.04432362 -0.04526334 -0.02512041
 -0.02512375  0.01176981]
The encoding for  2 th word in 1 th document is : 

 [-0.04361005  0.0228055  -0.01651942  0.04873116  0.03909346  0.02524883
  0.01447764  0.0360704 ]
The encoding for  3 th word in 1 th document is : 

 [-0.02517453  0.01186245  0.03737007 -0.02054998 -0.02924714 -0.01061837
 -0.04460711  0.0448385 ]
The encoding for  4 th word in 1 th document is : 

 [ 0.04469622  0.02031794  0.01015266  0.04472108 -0.04150879 -0.02770575
 -0.0303477  -0.02566599]
The encoding for  5 th word in 1 th document is : 

 [-0.00362151 -0.01438711 -0.0116431  -0.04811322 -0.01641407  0.0200397
  0.04407834 -0.02589554]
The encoding for  6 th word in 1 th document is : 

 [-0.01004182  0.00317496 -0.04504983  0.02963866 -0.02431839 -0.03926616
 -0.03195803  0.0107885 ]
The encoding for  7 th word in 1 th document is : 

 [-0.02775555 -0.00507268  0.04591706

#### Now this makes it easier to visualize that we have 3 (size of corp) documents, each consisting of 9 (maxi) words, with each word mapped to an 8-dimensional vector.