### Word Embedding model creation using Keras

In [1]:
from tensorflow import keras
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Flatten
import numpy as np

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
# Toy training corpus: three greeting questions followed by three replies.
Sent = [
    'Hello, how are you',
    'how are you',
    'how are you doing',
    'I am doing great',
    'I am doing good',
    'I am good',
]

In [3]:
# Binary targets aligned with `Sent`: the first three sentences are class 0,
# the last three are class 1.
sentence_labels = np.array([0] * 3 + [1] * 3)

In [4]:
# Size of the hashing space given to one_hot; the same value is later used as
# the Embedding layer's input dimension.
my_vocab = 30

# Turn each sentence into a list of integer word indices.
# NOTE(review): one_hot presumably hashes with Python's built-in `hash`, so the
# indices may differ between kernel sessions — confirm against the Keras docs.
encoding_sentence = [one_hot(text, my_vocab) for text in Sent]

print(encoding_sentence)

[[3, 4, 12, 5], [4, 12, 5], [4, 12, 5, 16], [6, 26, 16, 17], [6, 26, 16, 25], [6, 26, 25]]


In [5]:
# Fixed sequence length fed to the model; shorter sequences get zeros
# prepended (padding='pre').
length = 5
padded_sentence = pad_sequences(
    encoding_sentence,
    maxlen=length,
    padding='pre',
)
print(padded_sentence)

[[ 0  3  4 12  5]
 [ 0  0  4 12  5]
 [ 0  4 12  5 16]
 [ 0  6 26 16 17]
 [ 0  6 26 16 25]
 [ 0  0  6 26 25]]


In [6]:
# Small binary classifier assembled layer by layer:
#   1. Embedding: one 8-dimensional vector per index in the `my_vocab` range,
#      for inputs of `length` tokens.
#   2. Flatten: collapse the per-token vectors into a single feature vector.
#   3. Dense: one sigmoid unit producing the class probability.
nltk_model = keras.Sequential()
nltk_model.add(Embedding(my_vocab, 8, input_length=length))
nltk_model.add(Flatten())
nltk_model.add(Dense(1, activation='sigmoid'))

In [7]:
# Binary-classification training setup: binary cross-entropy loss, Adam
# optimizer, accuracy tracked as the metric.
nltk_model.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

In [8]:
# Train for 20 epochs on the padded sentences and their labels.
nltk_model.fit(x=padded_sentence, y=sentence_labels, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x22149c00a88>

In [9]:
# Score the model on the same data it was trained on (this is not a held-out
# set, so the numbers only show how well the training data was fitted).
modelloss, modelaccuracy = nltk_model.evaluate(
    x=padded_sentence,
    y=sentence_labels,
    verbose=0,
)

# Accuracy is scaled by 100 so it prints as a percentage.
print('Model loss: %f' % modelloss)
print('Accuracy: %f' % (modelaccuracy * 100))

Model loss: 0.656304
Accuracy: 100.000000


#### Predicting with the model

In [10]:
# Unseen sentences used to try out the trained classifier.
sentence_prediction = [
    'how are you Buddy',
    'I am good',
    'how is life going on',
    'That is going good',
]

In [11]:
# Reuse the training vocabulary size rather than repeating the literal: the
# new sentences must be hashed into the same index space the Embedding layer
# was built for, so the two values can never be allowed to drift apart.
vocab_size = my_vocab

# Encode the prediction sentences exactly as the training sentences were.
encoded = [one_hot(text, vocab_size) for text in sentence_prediction]
print(encoded)

[[4, 12, 5, 21], [6, 26, 25], [4, 17, 18, 8, 1], [27, 17, 8, 25]]


In [12]:
# Pad to the same sequence length the model was built with (`length` is the
# Embedding layer's input_length); a separate literal here could silently
# diverge from the training setup.
max_length = length
mypadded = pad_sequences(encoded, maxlen=max_length, padding='pre')
print(mypadded)

[[ 0  4 12  5 21]
 [ 0  0  6 26 25]
 [ 4 17 18  8  1]
 [ 0 27 17  8 25]]


In [13]:
# `Sequential.predict_classes` is deprecated and no longer available in newer
# TensorFlow releases. For a single sigmoid output, thresholding the predicted
# probability at 0.5 gives the same 0/1 class labels.
(nltk_model.predict(mypadded) > 0.5).astype("int32")

array([[0],
       [1],
       [0],
       [1]])

### Creating a Word Embedding model using gensim

In [14]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize,sent_tokenize
import warnings
# Same blanket warning suppression as in the notebook's first cell, repeated here.
warnings.filterwarnings('ignore')

In [15]:
# Training corpus for Word2Vec: one short description per programming language.
# Some entries contain more than one sentence.
sentences = [
    "Java is a class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible",
    "C++ is a general-purpose programming language created by Bjarne Stroustrup as an extension of the C programming language",
    "Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant indentation.",
    "C is a general-purpose, procedural computer programming language supporting structured programming, lexical variable scope, and recursion, with a static type system. By design, C provides constructs that map efficiently to typical machine instructions.",
    "JavaScript, often abbreviated as JS, is a programming language that conforms to the ECMAScript specification. JavaScript is high-level, often just-in-time compiled, and multi-paradigm. It has curly-bracket syntax, dynamic typing, prototype-based object-orientation, and first-class functions.",
]

In [16]:
# Split every corpus entry into word-level tokens with NLTK's word_tokenize.
tokenized_sentences = [word_tokenize(text) for text in sentences]

In [17]:
# Inspect the raw token lists; punctuation tokens are still present at this point.
print(tokenized_sentences)

[['Java', 'is', 'a', 'class-based', ',', 'object-oriented', 'programming', 'language', 'that', 'is', 'designed', 'to', 'have', 'as', 'few', 'implementation', 'dependencies', 'as', 'possible'], ['C++', 'is', 'a', 'general-purpose', 'programming', 'language', 'created', 'by', 'Bjarne', 'Stroustrup', 'as', 'an', 'extension', 'of', 'the', 'C', 'programming', 'language'], ['Python', 'is', 'an', 'interpreted', ',', 'high-level', 'and', 'general-purpose', 'programming', 'language', '.', 'Python', "'s", 'design', 'philosophy', 'emphasizes', 'code', 'readability', 'with', 'its', 'notable', 'use', 'of', 'significant', 'indentation', '.'], ['C', 'is', 'a', 'general-purpose', ',', 'procedural', 'computer', 'programming', 'language', 'supporting', 'structured', 'programming', ',', 'lexical', 'variable', 'scope', ',', 'and', 'recursion', ',', 'with', 'a', 'static', 'type', 'system', '.', 'By', 'design', ',', 'C', 'provides', 'constructs', 'that', 'map', 'efficiently', 'to', 'typical', 'machine', 'in

In [18]:
# Punctuation characters whose tokens should be dropped from the corpus.
puncts = '#?!-,.;:–/—'

# Rebuild each token list instead of calling `list.remove` while looping over
# the same list: removing during iteration shifts the remaining items left, so
# the token right after each removed one is skipped and adjacent punctuation
# tokens would be left behind. Building new lists also makes this cell safe to
# re-run.
# The membership test is kept as a string-containment check against `puncts`,
# as before.
tokenized_sentences = [
    [token for token in tokens if token not in puncts]
    for tokens in tokenized_sentences
]

In [19]:
# Show the token lists again, now that the punctuation-stripping cell has run.
print(tokenized_sentences)

[['Java', 'is', 'a', 'class-based', 'object-oriented', 'programming', 'language', 'that', 'is', 'designed', 'to', 'have', 'as', 'few', 'implementation', 'dependencies', 'as', 'possible'], ['C++', 'is', 'a', 'general-purpose', 'programming', 'language', 'created', 'by', 'Bjarne', 'Stroustrup', 'as', 'an', 'extension', 'of', 'the', 'C', 'programming', 'language'], ['Python', 'is', 'an', 'interpreted', 'high-level', 'and', 'general-purpose', 'programming', 'language', 'Python', "'s", 'design', 'philosophy', 'emphasizes', 'code', 'readability', 'with', 'its', 'notable', 'use', 'of', 'significant', 'indentation'], ['C', 'is', 'a', 'general-purpose', 'procedural', 'computer', 'programming', 'language', 'supporting', 'structured', 'programming', 'lexical', 'variable', 'scope', 'and', 'recursion', 'with', 'a', 'static', 'type', 'system', 'By', 'design', 'C', 'provides', 'constructs', 'that', 'map', 'efficiently', 'to', 'typical', 'machine', 'instructions'], ['JavaScript', 'often', 'abbreviated

In [20]:
# Train Word2Vec on the cleaned token lists. min_count=1 keeps every token in
# the vocabulary, including those that occur only once in this tiny corpus.
gensim_model = Word2Vec(sentences=tokenized_sentences, min_count=1)

In [21]:
# Print the model's summary line (vocabulary size, vector size, alpha).
print(gensim_model)

Word2Vec(vocab=82, size=100, alpha=0.025)


In [22]:
# Collect the vocabulary words known to the trained model.
# NOTE(review): `wv.vocab` looks like the gensim 3.x API; gensim 4 appears to
# expose `wv.key_to_index` instead — confirm against the installed version.
words = list(gensim_model.wv.vocab.keys())

In [23]:
# Display the vocabulary collected from the model.
print(words)

['Java', 'is', 'a', 'class-based', 'object-oriented', 'programming', 'language', 'that', 'designed', 'to', 'have', 'as', 'few', 'implementation', 'dependencies', 'possible', 'C++', 'general-purpose', 'created', 'by', 'Bjarne', 'Stroustrup', 'an', 'extension', 'of', 'the', 'C', 'Python', 'interpreted', 'high-level', 'and', "'s", 'design', 'philosophy', 'emphasizes', 'code', 'readability', 'with', 'its', 'notable', 'use', 'significant', 'indentation', 'procedural', 'computer', 'supporting', 'structured', 'lexical', 'variable', 'scope', 'recursion', 'static', 'type', 'system', 'By', 'provides', 'constructs', 'map', 'efficiently', 'typical', 'machine', 'instructions', 'JavaScript', 'often', 'abbreviated', 'JS', 'conforms', 'ECMAScript', 'specification', 'just-in-time', 'compiled', 'multi-paradigm', 'It', 'has', 'curly-bracket', 'syntax', 'dynamic', 'typing', 'prototype-based', 'object-orientation', 'first-class', 'functions']


In [24]:
# Look the vector up through the model's KeyedVectors (`.wv`). Indexing the
# Word2Vec model object directly is deprecated and fails on newer gensim
# releases, while `.wv[...]` works on both old and new versions.
print(gensim_model.wv['is'])

[ 4.2726737e-03 -3.9100577e-04  4.1509629e-03 -2.6738062e-03
  2.0580599e-03  4.7878576e-03  6.8650901e-04  2.8593536e-03
 -1.0317812e-03 -1.7796864e-04  4.7055432e-03 -4.3458277e-03
 -4.0599015e-03  2.4161523e-03 -3.3257129e-03 -4.0281690e-03
  3.4138821e-03 -2.9849554e-03 -2.1566043e-03 -2.7361975e-04
 -6.8157836e-04  1.7237258e-03  8.3424780e-04 -1.2957718e-03
  4.4362992e-03  3.2044107e-03 -1.6104579e-03 -3.7377467e-03
  7.9233403e-04 -3.9141960e-03  4.9907132e-03 -3.8381451e-04
  3.8080120e-03  2.7540969e-03  2.7503672e-03  2.9155274e-03
 -3.5451099e-05 -7.7986083e-04 -5.1021628e-04 -4.1540177e-03
  2.2146155e-04 -3.8468298e-03 -3.2366705e-03  2.0303649e-03
 -7.6719525e-04  2.1604123e-03 -2.8733022e-03 -3.1325468e-03
  7.4999298e-05 -3.5736377e-03  2.3921716e-03 -1.2540602e-03
  1.8112283e-03 -1.9138756e-03 -3.5941773e-04 -3.7515636e-03
  2.1603382e-03 -1.7413433e-05 -4.3483488e-03 -3.0741286e-03
 -1.3941372e-03  2.4732249e-03  4.0281713e-03 -1.9284337e-03
 -1.6206548e-04 -3.02814