<b>In this notebook we will make a comparison between the various word embedding techniques mainly embedding Layer of Keras, GloVe and Word2Vec on twitter dataset<b>

Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer

import tensorflow as tf
from tensorflow import keras
from keras import Sequential
from keras.layers.recurrent import LSTM,GRU
from keras.layers import Dense,Dropout,BatchNormalization,Bidirectional,Embedding,Flatten
from keras.layers import Conv1D,MaxPool1D,GlobalAveragePooling1D

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import gensim
from gensim.models.word2vec import Word2Vec

nltk.download('stopwords')
stopwords=stopwords.words("english")
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
train=pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ComparisonB wEmbedding techniques/Twitter_Data.csv")

In [3]:
train.shape

(162980, 2)

In [4]:
train.head(3)

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0


In [5]:
# Changing the category labels
target=[]
for i in train['category']:
  if i==-1.0:
    target.append(0)
  elif i==0.0:
    target.append(1)
  else:
    target.append(2)

train['target']=target

In [6]:
# The dataset looks imbalanced 
train['target'].value_counts()

2    72257
1    55213
0    35510
Name: target, dtype: int64

In [7]:
def cleaning(text):
  cleaned_text=[]
  lemm_obj=WordNetLemmatizer()
  tokens_list=re.split(" ",str(text))
  for token in tokens_list:
    token_small=token.lower() #converting to lower case
    punc_removed=re.sub("[^a-z A-Z 0-9]",'',token_small)

    if punc_removed not in stopwords:
      cleaned_text.append(lemm_obj.lemmatize(punc_removed))
  clean_text=" ".join(cleaned_text)
  return clean_text


In [8]:
%%time
train["Cleaned tweet"]=train["clean_text"].apply(cleaning)

CPU times: user 25.5 s, sys: 283 ms, total: 25.8 s
Wall time: 25.8 s


In [9]:
train.head(3)

Unnamed: 0,clean_text,category,target,Cleaned tweet
0,when modi promised “minimum government maximum...,-1.0,0,modi promised minimum government maximum gover...
1,talk all the nonsense and continue all the dra...,0.0,1,talk nonsense continue drama vote modi
2,what did just say vote for modi welcome bjp t...,1.0,2,say vote modi welcome bjp told rahul main cam...


Lets now create one hot encoding of the cleaned tweet using keras tokenizer class

In [10]:
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(train['Cleaned tweet'])

In [11]:
# Looking at some of the key value pairs generated

# tokenizer_obj.word_index.keys()

In [12]:
#Defining the vocabulary size for training embedding layer
vocab_size=len(tokenizer_obj.word_index)+1
print(vocab_size)

116434


In [13]:
def one_hot_encoding_text(df_list):
  encoded_tweets=[]
  for tweets in df_list:
    encoded_tweets.append(tokenizer_obj.texts_to_sequences([tweets])[0])
  return encoded_tweets

In [14]:
%%time
encoded_tweet=np.array(one_hot_encoding_text(train['Cleaned tweet']))
# test_tweet=np.array(one_hot_encoding_text(test['Cleaned tweet'])) #test data

CPU times: user 3.03 s, sys: 48.3 ms, total: 3.08 s
Wall time: 3.09 s


  """Entry point for launching an IPython kernel.


In [15]:
# Adding padding to make dimensions of all the rows same
max_length=100
padded_encTweet=pad_sequences(encoded_tweet,maxlen=max_length,padding="post") #train data
# padded_testt=pad_sequences(test_tweet,maxlen=max_length,padding="post") #test data
padded_encTweet[0]

array([    1,   239,   633,    27,  1541,   727,  1002,  1243,  1067,
          52, 15427,   105,    41,    14,    24,   976,   105,   404,
        3642,  5050,  1090,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0], dtype=int32)

Train test split

In [16]:
X=padded_encTweet
y=train['target']

In [17]:
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=32)

<h1>Embedding Layer</h1>

In [None]:
model=Sequential()

#Embedding Layer
model.add(Embedding(input_dim=vocab_size,output_dim=300,input_length=max_length))
model.add(Dense(units=500,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.8))

model.add(Dense(units=300,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Dense(units=200,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.6))
model.add(Dense(units=200,activation='relu'))

model.add(Flatten()) # To flatten the 3d matrix to 2d ,can use globalpooling1d also
model.add(Dense(units=3,activation='softmax'))

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 300)          34929900  
_________________________________________________________________
dense (Dense)                (None, 200, 500)          150500    
_________________________________________________________________
batch_normalization (BatchNo (None, 200, 500)          2000      
_________________________________________________________________
dropout (Dropout)            (None, 200, 500)          0         
_________________________________________________________________
dense_1 (Dense)              (None, 200, 300)          150300    
_________________________________________________________________
batch_normalization_1 (Batch (None, 200, 300)          1200      
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 300)          0

In [None]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [None]:
#The loss is still dreacreasing and the accuracy is increasing, so train it for at least 30 epochs to get a decent accuracy
model.fit(x=x_train,y=y_train,epochs=5,batch_size=512,validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe960032190>

<h1> Using Conv and Pooling layer to check if it helps </h1>

In [None]:
model=Sequential()

#Embedding Layer
model.add(Embedding(input_dim=vocab_size,output_dim=300,input_length=max_length))
model.add(Dense(units=500,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Dense(units=300,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Conv1D(filters=300,kernel_size=3,activation='relu'))
model.add(MaxPool1D(pool_size=2))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Dense(units=200,activation='relu'))
model.add(BatchNormalization())
model.add(Dense(units=200,activation='relu'))
model.add(Dropout(0.6))

model.add(Conv1D(filters=100,kernel_size=3,activation='relu'))
model.add(MaxPool1D(pool_size=3))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Flatten()) # To flatten the 3d matrix to 2d ,can use globalpooling1d also
model.add(Dense(units=3,activation='softmax'))

In [None]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [None]:
#The loss is still dreacreasing and the accuracy is increasing, so train it for at least 30 epochs to get a decent accuracy
model.fit(x=x_train,y=y_train,epochs=5,batch_size=512,validation_data=(x_test,y_test))

# Epoch 1/5
# 223/223 [==============================] - 165s 606ms/step - loss: 1.1508 - accuracy: 0.4791 - val_loss: 1.6432 - val_accuracy: 0.3389
# Epoch 2/5
# 223/223 [==============================] - 134s 600ms/step - loss: 0.8148 - accuracy: 0.6325 - val_loss: 1.4757 - val_accuracy: 0.5078
# Epoch 3/5
# 223/223 [==============================] - 133s 598ms/step - loss: 0.5691 - accuracy: 0.7765 - val_loss: 0.7508 - val_accuracy: 0.6751
# Epoch 4/5
# 223/223 [==============================] - 133s 598ms/step - loss: 0.5024 - accuracy: 0.8135 - val_loss: 0.6313 - val_accuracy: 0.7545
# Epoch 5/5
# 223/223 [==============================] - 133s 598ms/step - loss: 0.3840 - accuracy: 0.8760 - val_loss: 0.4831 - val_accuracy: 0.8462

# <keras.callbacks.History at 0x7fe8f3ec6650>



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fe8f3ec6650>

Though in this case the presence of pooling layers isn't helping the model that much but sometimes pooling also helps in increasing the accuracy

<h1>Using Glove</h1>

The data in the glove text file that we have downloaded from the Standford website is in key value pair, where the word is represented as the key and the 100 dimensional vector is as the value

In [None]:
%%time
#Lets first load the file and look at first few words

data_file = open("/content/drive/MyDrive/Colab Notebooks/ComparisonB wEmbedding techniques/glove.6B.100d.txt",encoding="UTF-8")

count=0
for i in data_file:
  if count<=3:
    print(i)
  else:
    break
  count+=1


the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062

, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158

In [None]:
%%time
#Now lets create a dictionary having words as keys and vectors as value

glove_dict={}
for i in data_file:
  splitted_data=i.split()
  word=splitted_data[0]
  vect=np.array(splitted_data[1:],dtype="float32")
  glove_dict[word]=vect

print(f"The length of the dictionary created is ",{len(glove_dict)})

The length of the dictionary created is  {399995}
CPU times: user 11.7 s, sys: 670 ms, total: 12.4 s
Wall time: 12.9 s


Now lets one hot encode encode the cleaned tweets on the basis of this glove vectors 

In [None]:
train['target'][1]

1

In [None]:
%%time
def OneHotUsingGlove(df_data):
  OneHotVect=[]
  noVec=[]
  y_new=[]
  counter=0
  for j in df_data:
    temp=[]
    tokens_list=j.split()
    for i in tokens_list:
      if i in glove_dict.keys():
        temp.append(glove_dict[i])
      else:
        noVec.append(i)
    if len(temp) != 0:
      OneHotVect.append(np.array(temp))
      y_new.append(train['target'][counter])
      counter+=1
  return np.array(OneHotVect),noVec,y_new

Encoded_vect,exemptedWords,y_new=OneHotUsingGlove(train['Cleaned tweet'])

CPU times: user 4.33 s, sys: 553 ms, total: 4.89 s
Wall time: 4.88 s




In [None]:
# print(exemptedWords)

In [None]:
added_vec=[]
counter=0
for i in range(0,len(Encoded_vect)):
  added_vec.append(Encoded_vect[i].sum(axis=0).reshape(1,100))

added_vec=np.array(added_vec)

One thing we can see that most of the words that didn't had a glove vector are either hindi word or some misspelled words, however it might be that the performance of glove might be affected because we don't have word vectors for hindi words in this case whereas in the embedding layer since we trained our own vocabulary so it might be that in that case the performance could be little good because we also had word vectors for the hindi words as well in that case

In [None]:
added_vec[0].shape

(1, 100)

In [None]:
type(y_new[0])

numpy.int64

In [None]:
X=added_vec.reshape(-1,100)
y=np.array(y_new)

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=32)

In [None]:
scalar=StandardScaler()
x_scaled=scalar.fit_transform(x_train)
x_valid_scaled=scalar.transform(x_test)

In [None]:
#training simple ANN

model = Sequential()

model.add(Dense(500, input_dim=100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(400, activation='relu'))

model.add(Dense(200, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(3,activation="softmax"))

# compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 500)               50500     
_________________________________________________________________
dropout_4 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 400)               200400    
_________________________________________________________________
dense_14 (Dense)             (None, 200)               80200     
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 3)                 603       
Total params: 331,703
Trainable params: 331,703
Non-trainable params: 0
________________________________________________

In [None]:
model.fit(x_scaled,y_train,batch_size=512,epochs=30,validation_data=(x_valid_scaled,y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fec60a01790>

As we can see using Simple ANN with glove vectors we get a very bad training and test accuracy. So, lets try to create a non trainable embedding layer having the weights from the Glovevectors

<h1>Embedding layer using Glove</h1>

In [None]:
#check the upper code to understand these variables
#here since i is starting from i=0 therefore while scripting we use i+1 since in the tokenized word
#indexing starts from 1

weight_matrix = np.zeros((vocab_size,100))
novec=[]
for i,word in enumerate(tokenizer_obj.word_index.keys()):
  temp=glove_dict.get(word)
  if temp is not None:
    weight_matrix[i+1]=temp
  else:
    novec.append(word)


In [None]:
X=padded_encTweet
y=train['target']
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=32)

In [None]:
model=Sequential()

#Embedding Layer
model.add(Embedding(input_dim=vocab_size,output_dim=100,input_length=max_length,weights=[weight_matrix],trainable="False"))
model.add(Dense(units=500,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Dense(units=300,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Conv1D(filters=300,kernel_size=3,activation='relu'))
model.add(MaxPool1D(pool_size=2))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Dense(units=200,activation='relu'))
model.add(BatchNormalization())
model.add(Dense(units=200,activation='relu'))
model.add(Dropout(0.6))

model.add(Conv1D(filters=100,kernel_size=3,activation='relu'))
model.add(MaxPool1D(pool_size=3))
model.add(BatchNormalization())
model.add(Dropout(0.6))

model.add(Flatten()) # To flatten the 3d matrix to 2d ,can use globalpooling1d also
model.add(Dense(units=3,activation='softmax'))
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 100)          11643400  
_________________________________________________________________
dense_21 (Dense)             (None, 200, 500)          50500     
_________________________________________________________________
batch_normalization_5 (Batch (None, 200, 500)          2000      
_________________________________________________________________
dropout_11 (Dropout)         (None, 200, 500)          0         
_________________________________________________________________
dense_22 (Dense)             (None, 200, 300)          150300    
_________________________________________________________________
batch_normalization_6 (Batch (None, 200, 300)          1200      
_________________________________________________________________
dropout_12 (Dropout)         (None, 200, 300)         

In [None]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])

In [None]:
model.fit(x_train,y_train,epochs=10,batch_size=512 ,validation_data=(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7febe50c1e10>

So we can see the accuracy of Glove vectors with embedding layer is pretty good and its the best till now though glove vectors didn't had some of the hindi words in their vocabulary still it performs very good on the twitter data.
Now, we will try to train Word2Vec model using Gensim Library

<h1>Word2Vec Using Gensim </h1>

In [None]:
# pip install gensim



In [None]:
#lets first tokenize the words and convert it into sepratae lists

#It does do any sort of stemming or lemmatizing, we can always create a custom function for this, but lets
#leave it for simplicity

gensim.utils.simple_preprocess('Hello my name is! N@Ilay, I am a runner')

['hello', 'my', 'name', 'is', 'ilay', 'am', 'runner']

In [18]:
gensim_tweet=[]
for i in train['clean_text']:
  gensim_tweet.append(gensim.utils.simple_preprocess(str(i)))

train['gensim_tweet']=gensim_tweet

In [19]:
train.head(3)

Unnamed: 0,clean_text,category,target,Cleaned tweet,gensim_tweet
0,when modi promised “minimum government maximum...,-1.0,0,modi promised minimum government maximum gover...,"[when, modi, promised, minimum, government, ma..."
1,talk all the nonsense and continue all the dra...,0.0,1,talk nonsense continue drama vote modi,"[talk, all, the, nonsense, and, continue, all,..."
2,what did just say vote for modi welcome bjp t...,1.0,2,say vote modi welcome bjp told rahul main cam...,"[what, did, just, say, vote, for, modi, welcom..."


In [20]:
#lets create the word2vec model
%%time
w2v_model= Word2Vec(sentences=train['gensim_tweet'],window=5,min_count=3,workers=4)
w2v_model.train(train['gensim_tweet'],epochs=5,total_examples=len(train))

CPU times: user 1min 35s, sys: 971 ms, total: 1min 36s
Wall time: 53.9 s


In [21]:
#lets check similar word
w2v_model.wv.most_similar('hello',topn=5)

[('dear', 0.7226555347442627),
 ('hey', 0.6569077968597412),
 ('mam', 0.6175224184989929),
 ('maam', 0.6160340905189514),
 ('pranam', 0.588625431060791)]

In [22]:
#lets see the vector representation of the word
w2v_model.wv["good"]

array([ 3.7579234 ,  0.96683294,  0.48309773,  2.692498  ,  0.24553989,
        0.52746487,  0.32010522,  3.112537  ,  1.3808085 ,  1.3502463 ,
        0.37154937, -0.42744398, -0.23473057,  0.6549205 ,  1.4279842 ,
        1.2216411 , -1.3362641 , -1.7412187 , -1.3088381 , -0.8881871 ,
       -1.1485415 , -1.1617924 ,  0.01498502,  0.52090627,  0.37374774,
       -1.4493463 ,  0.50938857, -1.1663167 , -0.14603762, -2.1022344 ,
        1.5571824 ,  0.9696644 , -0.05373155,  2.2256715 ,  1.8663635 ,
       -0.22856787,  1.7512127 ,  1.8316784 , -0.8983375 ,  1.1543334 ,
        2.7531672 , -1.3410981 ,  0.13554361,  1.0550512 ,  0.20747016,
        0.23559046, -0.28939912, -0.89447325, -0.6129041 ,  2.347943  ,
        1.8267337 , -0.38103226,  1.2160603 ,  0.44800016,  0.91664493,
        0.20376664, -0.50375396, -0.23011392, -2.572044  , -0.4118387 ,
        0.30619743,  0.8000849 ,  0.743947  ,  0.18785729,  1.6524303 ,
        1.5168678 , -0.13562828,  0.14737622,  2.7937303 ,  0.14

In [23]:
w2v_model.wv["king"]-w2v_model.wv["man"]+w2v_model.wv["woman"]

array([ 0.19628221,  1.8531872 , -0.6784609 ,  0.7182686 , -0.5344882 ,
       -2.1931028 , -0.6790533 , -1.2975703 ,  0.04049045,  0.96968144,
       -1.9196577 , -1.3964196 ,  0.39594972, -0.41075093,  0.15451872,
        0.01967683, -0.3364096 , -1.2504339 , -2.322707  , -1.8248136 ,
        2.1848774 ,  1.5554055 ,  0.27298686,  1.3110242 ,  1.8206143 ,
        0.5060966 ,  1.9493535 ,  0.1421197 ,  1.033382  ,  1.7560831 ,
       -1.5630133 ,  0.7653849 ,  0.31148082,  0.7013148 ,  0.52121454,
       -1.163305  , -2.0484345 , -0.8873914 ,  0.2631318 , -2.8481276 ,
       -0.7232118 ,  0.6034346 , -0.63455445, -2.6366854 ,  0.47316036,
        1.302081  , -0.578097  ,  2.2107816 ,  0.34801233, -0.64011335,
       -0.7415501 ,  0.44810128,  0.6837585 ,  1.6672035 ,  1.894511  ,
       -0.998379  , -2.8618336 ,  1.1272916 , -0.7398779 ,  0.7874045 ,
        1.2355528 , -0.648952  ,  2.212763  , -0.69036245,  0.71633327,
       -0.29016796, -1.6684973 ,  0.23296297, -1.0226988 , -1.70

Now we will create the word2vec weight matrix in the similar fashion as we created for the Glove vectors and then feed it to the embedding layer of the neural network

In [24]:
w2v_weight_matrix = np.zeros((vocab_size,100))
novec=[]
for word,i in (tokenizer_obj.word_index.items()):
  try:
    temp=w2v_model.wv[word]
    w2v_weight_matrix[i]=temp
  except:
    novec.append(word)


In [25]:
# novec

In [26]:
X=padded_encTweet
y=train['target']
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=32)

In [27]:
#Just copy paste the previous Model that we used for the glove vectors
model=Sequential()

#Embedding Layer
model.add(Embedding(input_dim=vocab_size,output_dim=100,input_length=max_length,weights=[w2v_weight_matrix],trainable="False"))
model.add(Dense(units=500,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.8))

model.add(Dense(units=300,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.9))

model.add(Dense(units=200,activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.8))
model.add(Dense(units=200,activation='relu'))
model.add(Dropout(0.8))
model.add(Flatten()) # To flatten the 3d matrix to 2d ,can use globalpooling1d also
model.add(Dense(units=3,activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          11643400  
_________________________________________________________________
dense (Dense)                (None, 100, 500)          50500     
_________________________________________________________________
batch_normalization (BatchNo (None, 100, 500)          2000      
_________________________________________________________________
dropout (Dropout)            (None, 100, 500)          0         
_________________________________________________________________
dense_1 (Dense)              (None, 100, 300)          150300    
_________________________________________________________________
batch_normalization_1 (Batch (None, 100, 300)          1200      
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 300)          0

In [28]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])
model.fit(x_train,y_train,epochs=30,batch_size=512 ,validation_data=(x_test,y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f31f5d54710>

<center><h1> Thank You</h1></center>

In case of any queries or suggestions, you can reach me over LinkedIn :- https://www.linkedin.com/in/nilaykush/