<a href="https://colab.research.google.com/github/maheravi/Deep-Learning/blob/main/Text_Classification_Using_Word_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Dense, Dropout, LSTM ,GRU, SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing import sequence

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the dataset CSVs
# under MyDrive (read a few cells below) are reachable from this runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [5]:
def read_csv(filename):
  """Load sentences and their integer labels from a CSV file.

  Args:
    filename: Path to a CSV file with a 'sentence' and a 'label' column.

  Returns:
    Tuple ``(X, Y)``: ``X`` is an array of the 'sentence' column values and
    ``Y`` is an array of the 'label' column values cast to ``int``.
  """
  table = pd.read_csv(filename)
  sentences = np.asarray(table['sentence'])
  labels = np.asarray(table['label'], dtype=int)
  return sentences, labels

In [6]:
# Read the train / test splits (sentences and integer labels) from Drive.
X_train, Y_train = read_csv(
    '/content/drive/MyDrive/Emoji_Text_Classification/train.csv'
)
X_test, Y_test = read_csv(
    '/content/drive/MyDrive/Emoji_Text_Classification/test.csv'
)

In [7]:
def label_to_emoji(label):
  """Map an integer class label (0-4) to the emoji that represents it.

  Args:
    label: Integer index into the emoji table.

  Returns:
    The emoji string stored at position ``label``.
  """
  emoji_by_label = [
      "❤️",
      '⚽',
      '😂',
      '😔',
      '🍽️',
  ]
  return emoji_by_label[label]

In [8]:
# Show one training sentence next to the emoji for its label.
(X_train[1], label_to_emoji(Y_train[1]))


('I am proud of your achievements', '😂')

In [9]:
# Length, in words, of the longest training sentence. Each sentence is measured
# by its own word count: picking the longest sentence by characters first
# (max(..., key=len)) can miss a shorter string that contains more words.
# This value is the fixed sequence length used when padding sentences.
max_len = max(len(sentence.split()) for sentence in X_train)
max_len

10

In [10]:
# Class balance of the training labels, shown as {label: number of examples}.
unique, counts = np.unique(Y_train, return_counts=True)
{label: count for label, count in zip(unique, counts)}

{0: 22, 1: 19, 2: 38, 3: 36, 4: 17}

In [11]:
# One-hot encode the integer labels into 5 classes for categorical cross-entropy.
Y_train_oh = tf.keras.utils.to_categorical(Y_train, num_classes=5)
Y_test_oh = tf.keras.utils.to_categorical(Y_test, num_classes=5)

# Sanity check: a sentence next to its one-hot label.
(X_train[1], Y_train_oh[1])

('I am proud of your achievements', array([0., 0., 1., 0., 0.], dtype=float32))

In [12]:
def read_glove_vectors(glove_file):
  """Parse a GloVe-style text file into a word -> vector dictionary.

  Every non-empty line is split on whitespace: the first token is the word and
  the remaining tokens are its vector components.

  Args:
    glove_file: Path to the embeddings text file (opened as UTF-8).

  Returns:
    dict mapping each word (str) to a 1-D ``np.float64`` array.
  """
  words_to_vec = dict()

  # The context manager closes the file even if parsing fails part-way.
  with open(glove_file, encoding='utf8') as f:
    for line in f:
      parts = line.strip().split()
      if not parts:
        # Skip blank lines instead of failing on parts[0].
        continue
      words_to_vec[parts[0]] = np.array(parts[1:], dtype=np.float64)

  return words_to_vec

In [13]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip -d glov.6B

--2022-11-24 17:11:46--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-11-24 17:11:47--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-11-24 17:11:47--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [14]:
# Load the 50-dimensional GloVe table into memory as a word -> vector dict.
words_to_vec = read_glove_vectors(
    glove_file='/content/glov.6B/glove.6B.50d.txt'
)

In [15]:
# Spot-check: display the embedding vector stored for a single word.
words_to_vec['flower']

array([ 0.075439 ,  1.2659   , -1.3179   ,  0.11341  ,  1.4513   ,
        0.17337  , -0.56265  , -1.0706   ,  0.54898  ,  0.30163  ,
       -0.11471  ,  0.38498  ,  0.9205   , -0.2491   ,  0.3308   ,
        0.060113 , -0.0068846,  0.086864 , -0.20535  , -0.86098  ,
        0.10007  , -0.75486  ,  0.48225  , -0.33253  , -0.23791  ,
        0.17345  ,  0.49777  ,  0.88761  ,  0.089471 , -0.56217  ,
        1.8535   , -0.0055493,  0.45845  ,  0.53943  ,  0.3247   ,
        0.43479  , -0.027253 ,  0.44744  , -0.27514  , -0.016152 ,
       -0.51024  , -0.10113  , -0.80985  , -0.31571  ,  1.5817   ,
        0.2105   , -0.1844   , -1.7266   ,  0.092685 , -0.55696  ])

In [16]:
def sentence_to_avg(sentence, word_vectors=None):
  """Represent a sentence as the average of its word embeddings.

  The sentence is lower-cased and split on whitespace. Words missing from the
  embedding table are skipped (they no longer raise ``KeyError``) and the
  average is taken over the words that were found.

  Args:
    sentence: Input text.
    word_vectors: Optional word -> 1-D vector mapping. Defaults to the
      notebook-level ``words_to_vec`` table.

  Returns:
    1-D float array with the embedding dimension of the table. An all-zero
    vector is returned when no word of the sentence is in the table (this
    includes the empty sentence, which previously divided by zero).
  """
  vectors = words_to_vec if word_vectors is None else word_vectors

  # Infer the dimension from the table; 50 is the original hard-coded size
  # and is only used as a fallback for an empty table.
  emb_dim = len(next(iter(vectors.values()))) if vectors else 50

  tokens = sentence.lower().split()  # lookup is case-insensitive
  known = [vectors[w] for w in tokens if w in vectors]

  sum_vectors = np.zeros((emb_dim,))
  if not known:
    return sum_vectors

  # Accumulate in order so results match the original for fully-known sentences.
  for vec in known:
    sum_vectors += vec
  return sum_vectors / len(known)

In [17]:
# Example: the averaged embedding of a short sentence.
sentence_to_avg("Pasta is my favorite food")

array([ 0.242832  ,  0.370774  , -0.524396  ,  0.018644  ,  0.568756  ,
        0.0219878 , -0.48206322, -0.152204  ,  0.235412  ,  0.1979466 ,
       -0.178818  ,  0.3203976 ,  0.3379962 ,  0.1399654 ,  0.56775044,
        0.118648  , -0.04531252,  0.335416  ,  0.149832  , -0.522814  ,
        0.095746  , -0.0468764 ,  0.5508066 ,  0.39369132,  0.275182  ,
       -1.275018  , -0.76076   ,  0.449102  ,  0.7542772 , -0.2332608 ,
        2.82554   ,  0.287742  , -0.325976  ,  0.608572  , -0.020543  ,
        0.286476  , -0.24984   ,  0.899408  ,  0.38995   , -0.270266  ,
        0.3004734 ,  0.315962  , -0.2408782 ,  0.1586226 ,  0.5400462 ,
        0.412066  , -0.1657008 , -0.253566  ,  0.3091806 ,  0.371192  ])

In [18]:
# Turn every training sentence into its averaged embedding and stack the
# results into one 2-D array (one row per sentence).
X_train_avg = np.array([sentence_to_avg(sentence) for sentence in X_train])

(X_train_avg.shape, Y_train_oh.shape)

((132, 50), (132, 5))

## Network 1: EmojiNet V1

In [19]:
class EmojiNet_V1(Model):
    """Single-layer softmax classifier over an averaged sentence embedding.

    The only layer is a 5-unit dense layer with softmax activation, declared
    for 50-dimensional inputs.
    """

    def __init__(self):
        super().__init__()
        self.dense = Dense(units=5, input_shape=(50,), activation='softmax')

    def call(self, x):
        """Return the class probabilities produced by the dense layer for ``x``."""
        return self.dense(x)

In [20]:
# Build and train the averaged-embedding classifier with Adam (default
# settings) and categorical cross-entropy on the one-hot labels.
model = EmojiNet_V1()
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.fit(X_train_avg, Y_train_oh, epochs=400, shuffle=True)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/400
Epoch 77/400
Epoch 78

<keras.callbacks.History at 0x7effd03c88d0>

In [21]:
# Hand-written sentences to sanity-check model 1, with their expected labels.
X_me = np.array(["not happy", "not sad", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy and funny", "i love airplane"])
# One expected label per sentence, as a flat vector. The original list had only
# 8 labels for the 9 sentences above: "not happy" had no label, so 3 (sad) is
# added for it here.
Y_me = np.array([3, 2, 0, 0, 2, 1, 4, 3, 0])
assert X_me.shape[0] == Y_me.shape[0], "each sentence needs exactly one label"

X_me_avg = np.array([sentence_to_avg(x) for x in X_me])
pred = model.predict(X_me_avg)
# Most probable class for every sentence (not only the last one).
pred_labels = np.argmax(pred, axis=1)

for sentence, label in zip(X_me, pred_labels):
    print(sentence, label_to_emoji(label))

print("")
# Count mismatches element-wise; len(np.nonzero(...)) would return the number
# of array dimensions rather than the number of errors.
print("Number of classification error", int(np.count_nonzero(Y_me != pred_labels)))

not happy 😔
not sad 😔
i adore you ❤️
i love you ❤️
funny lol 😂
lets play with a ball ⚽
food is ready 🍽️
not feeling happy and funny 😂
i love airplane ❤️

Number of classification error 2


## Network 2: EmojiNet V2

In [22]:
def convert_sentences_to_embeddings(X, sequence_length=None, word_vectors=None):
    """Encode sentences as a zero-padded tensor of word embeddings.

    Each sentence is lower-cased and split on whitespace. Word ``j`` of
    sentence ``i`` is written to ``out[i, j, :]``; unused positions stay zero.
    Sentences longer than the sequence length are truncated (they previously
    raised ``IndexError``), and words missing from the embedding table are left
    as zero vectors (they previously raised ``KeyError``).

    Args:
        X: Sequence of sentence strings.
        sequence_length: Number of word positions per sentence. Defaults to the
            notebook-level ``max_len``.
        word_vectors: Optional word -> 1-D vector mapping. Defaults to the
            notebook-level ``words_to_vec`` table.

    Returns:
        Float array of shape ``(len(X), sequence_length, emb_dim)``.

    Raises:
        ValueError: If the embedding table is empty.
    """
    vectors = words_to_vec if word_vectors is None else word_vectors
    seq_len = max_len if sequence_length is None else sequence_length

    if not vectors:
        raise ValueError("word_vectors is empty; cannot infer the embedding dimension")
    # All vectors in the table share one dimensionality; read it from any entry.
    emb_dim = next(iter(vectors.values())).shape[0]

    emb_matrix = np.zeros((len(X), seq_len, emb_dim))

    for i, sentence in enumerate(X):
        # Truncate so long sentences cannot index past the padded length.
        for j, word in enumerate(sentence.lower().split()[:seq_len]):
            vec = vectors.get(word)
            if vec is not None:
                emb_matrix[i, j, :] = vec

    return emb_matrix

In [23]:
# Small demo: embed a few sentences and print the padded embedding tensor.
X_me = np.array([
    "funny lol",
    "lets play baseball",
    "food is ready for you",
])
print(X_me)
print(convert_sentences_to_embeddings(X_me))

['funny lol' 'lets play baseball' 'food is ready for you']
[[[-0.014547 -0.20208  -0.75278  ... -0.13429   0.21133   1.5368  ]
  [-0.54289   0.053743 -0.46978  ...  0.20745  -0.074958  0.080575]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.30423  -0.24405   1.0303   ... -0.43296  -0.096168  0.43463 ]
  [-0.73571   0.19937  -0.89408  ... -0.075279 -0.44448   0.47437 ]
  [-1.9327    1.0421   -0.78515  ...  0.55667  -0.70315   0.17157 ]
  ...
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]
  [ 0.        0.        0.       ...  0.        0.        0.      ]]

 [[ 0.47222  -0.44545  -0.51833  ...  0.34932   0.33934   0.25499 ]
  [ 0.6185    0.64254  -0.46552  ... -0.2

In [24]:
# Per-word embedding tensor for the whole training set.
X_train_embs = convert_sentences_to_embeddings(X_train)
X_train_embs.shape

(132, 10, 50)

In [25]:
class EmojiNet_V2(Model):
    """Stacked recurrent classifier over per-word embeddings.

    Layer order: LSTM(500) -> Dropout(0.1) -> GRU(200) -> Dropout(0.1)
    -> SimpleRNN(100) -> Dense(5, softmax).

    Note: despite their names, ``GRU_1`` holds an LSTM layer and ``GRU_3``
    holds a SimpleRNN layer; only ``GRU_2`` is actually a GRU.
    """

    def __init__(self):
        super().__init__()

        # The first two recurrent layers return full sequences so the next
        # recurrent layer receives one vector per time step.
        self.GRU_1 = LSTM(units=500, return_sequences=True)
        self.dropout_1 = Dropout(rate=0.1)
        self.GRU_2 = GRU(units=200, return_sequences=True)
        self.dropout_2 = Dropout(rate=0.1)
        # The last recurrent layer returns only its final output.
        self.GRU_3 = SimpleRNN(units=100)
        self.dense = Dense(units=5, activation='softmax')

    def call(self, x):
        """Run ``x`` through the layers in order and return class probabilities."""
        pipeline = (
            self.GRU_1,
            self.dropout_1,
            self.GRU_2,
            self.dropout_2,
            self.GRU_3,
            self.dense,
        )
        for layer in pipeline:
            x = layer(x)
        return x

In [26]:
# Build and train the recurrent classifier on the per-word embedding tensors.
model2 = EmojiNet_V2()
# Keras' Adam takes `learning_rate`, `beta_1` and `beta_2`. `beta1` / `beta2`
# are not accepted keyword names and `lr` is a deprecated alias.
model2.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model2.fit(X_train_embs, Y_train_oh, epochs=30, shuffle=True)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7effd0138510>

In [27]:
# Embed the test sentences the same way as the training set, then score
# model2 on them.
X_test_embs = convert_sentences_to_embeddings(X_test)
print(X_test_embs.shape)
# Displays [loss, accuracy]: the model was compiled with the accuracy metric.
model2.evaluate(X_test_embs, Y_test_oh)

(56, 10, 50)


[0.6366869211196899, 0.8214285969734192]

In [74]:
# Inference: sanity-check model2 on hand-written sentences with expected labels.
X_me = np.array(["not sad", "i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "not feeling happy", "not good"])
Y_me = np.array([2, 0, 0, 2, 1, 4, 3, 3])
X_me_embed = convert_sentences_to_embeddings(X_me)

pred = model2.predict(X_me_embed)
# Most probable class per sentence, kept as a plain list as before.
prediction = np.argmax(pred, axis=1).tolist()

for sentence, label in zip(X_me, prediction):
  print(sentence, label_to_emoji(label))

print("")
# Count mismatches element-wise; len(np.nonzero(...)) is always 1 for a 1-D
# array, regardless of how many predictions are wrong.
print("Number of classification error", int(np.count_nonzero(Y_me != np.asarray(prediction))))

not sad 😔
i adore you ❤️
i love you ❤️
funny lol 😂
lets play with a ball ⚽
food is ready 🍽️
not feeling happy 😔
not good 😔

Number of classification error 1
