In [1]:
import numpy as np
import torch
import torch.nn as nn
import pickle
from tqdm import tqdm
import pandas as pd

### Emoji Prediction Classifier

<img src="data_set.png" style="width:100px;height:100px;">
<caption><center> **Figure 1**: EMOJISET - a classification problem with 5 classes. A few examples of sentences are given here. </center></caption>



In [2]:
data=pd.read_csv("emojify_data.csv" )
data.head()

Unnamed: 0,Sentence,label,Unnamed: 2,Unnamed: 3
0,French macaroon is so tasty,4,,
1,work is horrible,3,,
2,I am upset,3,,[3]
3,throw the ball,1,,[2]
4,Good joke,2,,


In [3]:
test_data=pd.read_csv("test_emoji.csv" )
test_data.head()

Unnamed: 0,Sentence,label
0,I want to eat,4
1,he did not answer,3
2,he got a raise,2
3,she got me a present,0
4,ha ha ha it was so funny,2


In [4]:
type(data['Sentence'])


pandas.core.series.Series

In [5]:
vocab = set()
labels = []
raw_sents = []
for i in range(len(data['Sentence'])):
    line = data['Sentence'][i].strip()
    words = line.split(' ')
    raw_sents.append(words)
    labels.append(int(data['label'][i]))
    for word in words:
        vocab.add(word.lower())

In [6]:
test_raw_sents=[]
test_clabels=[]
for i in range(len(test_data['Sentence'])):
    tline = test_data['Sentence'][i].strip()
    twords = tline.split(' ')
    test_raw_sents.append(twords)
    test_clabels.append(int(test_data['label'][i]))
    for tword in twords:
        vocab.add(tword.lower())

vocab = ['<PAD>'] + sorted(list(vocab))

In [7]:
# raw_sents
len(vocab)

314

In [8]:
indexed_sents = []
for cur_raw_sent in raw_sents:
    cur_indexed_sent = []
    for word in cur_raw_sent:
        cur_indexed_sent.append(vocab.index(word.lower()))
    indexed_sents.append(cur_indexed_sent)

In [9]:
indexed_sents[0:5]

[[111, 186, 160, 246, 267],
 [304, 160, 148],
 [154, 13, 284],
 [275, 270, 34],
 [124, 165]]

In [10]:
test_indexed_sents = []
for tcur_raw_sent in test_raw_sents:
    tcur_indexed_sent = []
    for word in tcur_raw_sent:
        tcur_indexed_sent.append(vocab.index(word.lower()))
    test_indexed_sents.append(tcur_indexed_sent)

In [11]:
max_len = -1
for cur_indexed_sent in indexed_sents:
    max_len = max(max_len, len(cur_indexed_sent))

In [12]:
max_len

10

In [13]:
padded_sents = []
for cur_indexed_sent in indexed_sents:
    cur_padded_sent=[]
    for x in range(max_len-len(cur_indexed_sent)):
        cur_padded_sent.append(0)
    for words in cur_indexed_sent:
        cur_padded_sent.append(words)
    padded_sents.append(cur_padded_sent)

In [14]:
padded_sents[0:5]

[[0, 0, 0, 0, 0, 111, 186, 160, 246, 267],
 [0, 0, 0, 0, 0, 0, 0, 304, 160, 148],
 [0, 0, 0, 0, 0, 0, 0, 154, 13, 284],
 [0, 0, 0, 0, 0, 0, 0, 275, 270, 34],
 [0, 0, 0, 0, 0, 0, 0, 0, 124, 165]]

In [15]:
tmax_len = -1
for tcur_indexed_sent in test_indexed_sents:
    tmax_len = max(tmax_len, len(tcur_indexed_sent))
tmax_len=10

In [16]:
test_padded_sents = []
for tcur_indexed_sent in test_indexed_sents:
    tcur_padded_sent=[]
    for x in range(tmax_len-len(tcur_indexed_sent)):
        tcur_padded_sent.append(0)
    for words in tcur_indexed_sent:
        tcur_padded_sent.append(words)
    test_padded_sents.append(tcur_padded_sent)

In [17]:
print(len(raw_sents))
print(len(indexed_sents))
print(len(padded_sents))
print(len(test_raw_sents))
print(len(test_indexed_sents))
print(len(test_padded_sents))

183
183
183
56
56
56


In [18]:
print(len(raw_sents[0]))
print(len(indexed_sents[0]))
print(len(padded_sents[0]))
X_test=np.array(test_padded_sents)
Y_test=np.array(test_clabels)

5
5
10


In [19]:
X = np.array(padded_sents)
Y = np.array(labels)

In [20]:
X.shape

(183, 10)

In [21]:
Y.shape

(183,)

In [22]:
X = torch.from_numpy(X)
Y = torch.from_numpy(Y).long()
X_test = torch.from_numpy(X_test)
Y_test = torch.from_numpy(Y_test).long()

In [23]:
trainX = X
trainY = Y
testX = X_test
testY = Y_test


In [24]:
testX.shape

torch.Size([56, 10])

In [25]:
embeddings_index = {}
f = open('glove.6B.50d.txt')
for line in f:
    values = line.split(' ')
    word = values[0] ## The first entry is the word
    coefs = np.asarray(values[1:], dtype='float32') ## These are the vecotrs representing the embedding for the word
    embeddings_index[word] = coefs
f.close()

In [26]:
embeddings_index['happy']

array([ 0.092086,  0.2571  , -0.58693 , -0.37029 ,  1.0828  , -0.55466 ,
       -0.78142 ,  0.58696 , -0.58714 ,  0.46318 , -0.11267 ,  0.2606  ,
       -0.26928 , -0.072466,  1.247   ,  0.30571 ,  0.56731 ,  0.30509 ,
       -0.050312, -0.64443 , -0.54513 ,  0.86429 ,  0.20914 ,  0.56334 ,
        1.1228  , -1.0516  , -0.78105 ,  0.29656 ,  0.7261  , -0.61392 ,
        2.4225  ,  1.0142  , -0.17753 ,  0.4147  , -0.12966 , -0.47064 ,
        0.3807  ,  0.16309 , -0.323   , -0.77899 , -0.42473 , -0.30826 ,
       -0.42242 ,  0.055069,  0.38267 ,  0.037415, -0.4302  , -0.39442 ,
        0.10511 ,  0.87286 ], dtype=float32)

In [27]:
#unk_vector = embeddings_index.vectors.mean(axis=0)
avg=np.zeros((50,1),dtype=float)
sum1=0
for word in vocab[1:]:
    sum1=sum1+embeddings_index[word]
avg=sum1/len(vocab)
avg.shape

(50,)

In [28]:
avg

array([ 0.10477203,  0.10030416, -0.11105632, -0.21022114,  0.43222502,
       -0.01908446, -0.4056496 ,  0.12348095, -0.21135695,  0.1374001 ,
       -0.15094624,  0.14368933, -0.18328223,  0.04002608,  0.5344314 ,
        0.11114672,  0.10141597,  0.09200446, -0.16404863, -0.40154225,
       -0.07510382,  0.32814342,  0.2774701 ,  0.12376319,  0.41074434,
       -1.270989  , -0.38609952,  0.1566317 ,  0.43546626, -0.3857825 ,
        2.4428022 ,  0.40749273, -0.21845146, -0.0455374 , -0.0095862 ,
        0.01084257,  0.07582554,  0.21608587,  0.01072802, -0.31934437,
       -0.15144747,  0.05009922, -0.10021205,  0.18748248,  0.12095157,
        0.06794936, -0.09993503, -0.10877771,  0.00478268,  0.22292644],
      dtype=float32)

In [29]:
vectors = [np.zeros((50),dtype=float)]
for word in vocab[1:]:
    if word in embeddings_index.keys():
        vectors.append(embeddings_index[word])
    else:
        print('unk')
        vectors.append(unk_vector)

vectors = np.array(vectors, dtype=np.float32)
vectors = torch.from_numpy(vectors)

In [30]:
vectors.shape

torch.Size([314, 50])

### Using Simple Neural Network

In [31]:
class Network(nn.Module):
    
    def __init__(self, max_seq_len, embed_dim, hidden_dim, output_dim):
        super(Network, self).__init__()
        self.max_seq_len = max_seq_len
        self.hidden_dim = hidden_dim
        self.embed_dim = embed_dim
        self.output_dim = output_dim
        self.embedder = nn.Embedding(len(vocab), self.embed_dim)
        self.embedder.weight.data.copy_(vectors)
        self.fc1 = nn.Linear(self.max_seq_len * self.embed_dim, self.hidden_dim)
        self.soft1 = nn.Softmax()
        self.fc2 = nn.Linear(self.hidden_dim, self.output_dim)
        
    def embed(self, x):
        return self.embedder(x)
    
    def forward(self, x):
        x = self.embedder(x)
#         print(x.size())
        x = x.view(-1, self.max_seq_len * self.embed_dim)
#         print(x.size())
        x = self.fc1(x)
        x = self.soft1(x)
        x = self.fc2(x)
        return x

In [32]:
model = Network(max_len, 50, 200,5 )
model

Network(
  (embedder): Embedding(314, 50)
  (fc1): Linear(in_features=500, out_features=200, bias=True)
  (soft1): Softmax()
  (fc2): Linear(in_features=200, out_features=5, bias=True)
)

In [33]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 300
for epoch_cntr in range(num_epochs):
    optimizer.zero_grad()
    trainO = model(trainX)
    loss = criterion(trainO, trainY)
    loss.backward()
    optimizer.step()
    if epoch_cntr % 10 == 0:
        print('Epoch ', epoch_cntr, ' loss = ', loss.item())



Epoch  0  loss =  1.6189097166061401
Epoch  10  loss =  1.60488760471344
Epoch  20  loss =  1.5914041996002197
Epoch  30  loss =  1.576784372329712
Epoch  40  loss =  1.558550238609314
Epoch  50  loss =  1.5351389646530151
Epoch  60  loss =  1.5065057277679443
Epoch  70  loss =  1.4743640422821045
Epoch  80  loss =  1.4404727220535278
Epoch  90  loss =  1.4062623977661133
Epoch  100  loss =  1.373213529586792
Epoch  110  loss =  1.3415652513504028
Epoch  120  loss =  1.3111374378204346
Epoch  130  loss =  1.281215786933899
Epoch  140  loss =  1.2516783475875854
Epoch  150  loss =  1.2229323387145996
Epoch  160  loss =  1.1956560611724854
Epoch  170  loss =  1.1694023609161377
Epoch  180  loss =  1.1438428163528442
Epoch  190  loss =  1.1187708377838135
Epoch  200  loss =  1.0956748723983765
Epoch  210  loss =  1.0731490850448608
Epoch  220  loss =  1.0524749755859375
Epoch  230  loss =  1.0332132577896118
Epoch  240  loss =  1.0148608684539795
Epoch  250  loss =  0.9972827434539795
Epo

In [34]:
#train accuracy
trainO = model(trainX)
trainP = torch.argmax(trainO, dim=1)
(trainY == trainP).sum().item() / trainY.size(0)



0.994535519125683

In [35]:
testO = model(testX)
testP = torch.argmax(testO, dim=1)

(testY == testP).sum().item() / testY.size(0)



0.9821428571428571

In [36]:
def encode_sent(sent):
    sent = sent.strip()
    words = sent.split(' ')
    padded_sents = []
    indexed_sent = []
    for word in words:
        indexed_sent.append(vocab.index(word.lower()))
    indexed_len = len(indexed_sent)
    for x in range(max_len-len(indexed_sent)):
        padded_sents.append(0)
    for word in indexed_sent:
        padded_sents.append(word)
    x = np.array(padded_sents)
    x = torch.from_numpy(x)
    return x

In [37]:
test_sent = encode_sent('i am happy by your work')
test_sent

tensor([  0,   0,   0,   0, 154,  13, 135,  51, 312, 304])

In [38]:
test_sent_output = model(test_sent)
test_sent_pred = torch.argmax(test_sent_output, dim=1)
test_sent_pred



tensor([2])

### Using RNN

In [39]:
class RecurrentNetwork(nn.Module):
    
    def __init__(self, max_seq_len, embed_dim, hidden_dim, hidden_state_dim, output_dim):
        super(RecurrentNetwork, self).__init__()
        self.max_seq_len = max_seq_len
        self.hidden_dim = hidden_dim
        self.hidden_state_dim = hidden_state_dim
        self.embed_dim = embed_dim
        self.output_dim = output_dim
        self.embedder = nn.Embedding(len(vocab), self.embed_dim)
        self.embedder.weight.data.copy_(vectors)
        
        self.U = nn.Linear(self.hidden_state_dim, self.hidden_state_dim)
        self.W = nn.Linear(self.embed_dim, self.hidden_state_dim)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.ReLU()
        self.fc1 = nn.Linear(self.hidden_state_dim, self.hidden_dim)
        self.fc2 = nn.Linear(self.hidden_dim, self.output_dim)
        
    def embed(self, x):
        return self.embedder(x)
    
    def forward(self, x):
        
        x = self.embedder(x)
        h = torch.zeros(x.size(0), self.hidden_state_dim)
#         print(x.size(), self.W.weight.shape)
#         print(h.size(), self.U.weight.shape)
        for i in range(x.size(1)):
#             print("h shape:-",h.shape)
#             print("x shape",x.shape)
            a = self.U(h)
            b = self.W(x[:,i])
#             print(a.shape, b.shape)
            h = torch.nn.functional.relu(self.U(h) + self.W(x[:,i]))
        h = self.fc1(h)
        h = self.relu2(h)
        h = self.fc2(h)
    
        return h
            

In [40]:
rec_model = RecurrentNetwork(max_len, 50, 100, 200, 5)
rec_model

RecurrentNetwork(
  (embedder): Embedding(314, 50)
  (U): Linear(in_features=200, out_features=200, bias=True)
  (W): Linear(in_features=50, out_features=200, bias=True)
  (relu1): ReLU()
  (relu2): ReLU()
  (fc1): Linear(in_features=200, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=5, bias=True)
)

In [41]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(rec_model.parameters(), lr=0.001)

num_epochs = 300
for epoch_cntr in range(num_epochs):
    optimizer.zero_grad()
    trainO = rec_model(trainX)
    loss = criterion(trainO, trainY)
    loss.backward()
    optimizer.step()
    if epoch_cntr % 10 == 0:
        print('Epoch ', epoch_cntr, ' loss = ', loss.item())

Epoch  0  loss =  1.6116819381713867
Epoch  10  loss =  1.2526684999465942
Epoch  20  loss =  0.7527338862419128
Epoch  30  loss =  0.3805331587791443
Epoch  40  loss =  0.16208380460739136
Epoch  50  loss =  0.05343855172395706
Epoch  60  loss =  0.01836515963077545
Epoch  70  loss =  0.00832908134907484
Epoch  80  loss =  0.0047454689629375935
Epoch  90  loss =  0.00325506622903049
Epoch  100  loss =  0.0024645666126161814
Epoch  110  loss =  0.0019869725219905376
Epoch  120  loss =  0.0016608238220214844
Epoch  130  loss =  0.0014193409588187933
Epoch  140  loss =  0.0012315179919824004
Epoch  150  loss =  0.001081392401829362
Epoch  160  loss =  0.0009582837228663266
Epoch  170  loss =  0.000855709018651396
Epoch  180  loss =  0.0007696021348237991
Epoch  190  loss =  0.0006962291663512588
Epoch  200  loss =  0.0006329348543658853
Epoch  210  loss =  0.0005777051555924118
Epoch  220  loss =  0.0005297582829371095
Epoch  230  loss =  0.00048750737914815545
Epoch  240  loss =  0.0004

In [42]:
trainO = rec_model(trainX)
trainP = torch.argmax(trainO, dim=1)

print((trainY == trainP).sum().item() / trainY.size(0))

testO = rec_model(testX)
testP = torch.argmax(testO, dim=1)

print((testY == testP).sum().item() / testY.size(0))

1.0
0.9821428571428571


In [43]:
test_sen=encode_sent('are you ready for dinner ')
test_sen=test_sen.reshape((1,10))
test_sent_output =rec_model(test_sen)
test_sen

tensor([[  0,   0,   0,   0,   0,  22, 311, 230, 109,  85]])

In [44]:
test_sent_pred = torch.argmax(test_sent_output, dim=1)

In [45]:
print(test_sent_output)
print(test_sent_pred)

tensor([[-4.3561, -4.1964, -4.5877, -5.0749,  8.0769]],
       grad_fn=<AddmmBackward>)
tensor([4])
