In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import emoji
from sklearn import preprocessing

In [154]:
# Dataset: https://www.kaggle.com/rexhaif/emojifydata-en
# Only the first N_TWEETS lines of the file are used. The file handle is
# iterated lazily so reading stops after N_TWEETS lines, instead of loading
# the whole file with readlines() and slicing afterwards.
N_TWEETS = 10000
with open('./tweet_data/emojitweets-01-04-2018.txt', 'r', encoding='UTF-8') as f:
    # zip() stops as soon as range(N_TWEETS) is exhausted (or the file ends).
    reader = [line for _, line in zip(range(N_TWEETS), f)]

# One row per tweet; lines keep their trailing newline at this stage.
df = pd.DataFrame(data=reader, columns=["tweet"])

In [155]:
# Preview the first rows of the raw tweets that were just loaded.
df.head()

Unnamed: 0,tweet
0,Squad arriving for Game 2 üöÄ\n
1,Dude is like 5‚Äô8 140 pounds his dick was long ...
2,FOLLOWERSüëá\n
3,I CANT BREATIUHW üíÄüíÄüíÄ\n
4,2Ô∏è‚É£4Ô∏è‚É£ hours 'til our schedule drops!\n


In [156]:
# Report how many tweets (rows) the frame holds.
print("len of dataset ", len(df))

len of dataset  10000


In [11]:
# Remove newline characters from every tweet (plain substring match, no regex).
tweet_text = df['tweet']
df['tweet'] = tweet_text.str.replace('\n', '', regex=False)

In [12]:
# Normalise case, then remove every run of digits.
df['tweet'] = df['tweet'].str.lower()
# regex=True is explicit: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave all digits in
# place. The raw string avoids the invalid '\d' escape in a normal literal.
df['tweet'] = df['tweet'].str.replace(r'\d+', '', regex=True)

In [13]:
# Spot-check one cleaned tweet (row label 7).
df.loc[7, 'tweet']

'i am so scared of birdsü§ß'

In [16]:
import nltk  # required for nltk.download() below
from nltk.corpus import stopwords

# Fetch the stop-word corpus, then load the English list.
nltk.download('stopwords')
stop = stopwords.words('english')

# Membership tests against a set are O(1); `stop` itself is kept as the list
# returned by nltk in case other cells use it.
stop_words = set(stop)

# Tweets are split on whitespace, stop words are dropped, and the remaining
# words are re-joined with single spaces.
df['tweet'] = df['tweet'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)
df.head()

Unnamed: 0,tweet
0,squad arriving game üöÄ
1,dude like ‚Äô pounds dick long strong(always lit...
2,followersüëá
3,cant breatiuhw üíÄüíÄüíÄ
4,Ô∏è‚É£Ô∏è‚É£ hours 'til schedule drops!


In [17]:
# Build the label for each tweet: the first emoji character found in it.
# Every occurrence of that character is then removed from the tweet text.
# Tweets without any recognised emoji get the placeholder label 'NAN' and
# keep their text unchanged.
#
# Tweets are scanned one character at a time, so only emoji that are a single
# character in the lookup table can be detected.

# Lookup table of known emoji characters.
# NOTE(review): this notebook was written against emoji.UNICODE_EMOJI; newer
# releases of the `emoji` package are believed to expose EMOJI_DATA (keyed by
# emoji string) instead -- confirm against the installed version.
emoji_table = getattr(emoji, 'EMOJI_DATA', None)
if emoji_table is None:
    emoji_table = emoji.UNICODE_EMOJI

emojis = []
tweets_without_label = []

for text in df['tweet']:
    first = next((ch for ch in text if ch in emoji_table), None)
    if first is None:
        emojis.append('NAN')
        tweets_without_label.append(text)
    else:
        emojis.append(first)
        tweets_without_label.append(text.replace(first, ''))

# Assign the whole column at once. Writing through df['tweet'][j] = ... is
# chained assignment: it raises SettingWithCopyWarning and does not update
# the frame when pandas copy-on-write is enabled.
df['tweet'] = tweets_without_label

emojis[0:5]

['üöÄ', 'üçÜ', 'üëá', 'üíÄ', 'NAN']

In [18]:
# Store the per-tweet emoji labels (one entry per row) as a new column.
df['emojis'] = emojis 

In [19]:
# Preview the tweets next to their extracted emoji label.
df.head()

Unnamed: 0,tweet,emojis
0,squad arriving game,üöÄ
1,dude like ‚Äô pounds dick long strong(always lit...,üçÜ
2,followers,üëá
3,cant breatiuhw,üíÄ
4,Ô∏è‚É£Ô∏è‚É£ hours 'til schedule drops!,NAN


In [20]:
# Map each distinct emoji label to an integer class id; `le` is kept so the
# ids can be translated back to emoji later.
le = preprocessing.LabelEncoder()
emoji_ids = le.fit_transform(df['emojis'].to_numpy())
df['emojis'] = emoji_ids

In [21]:
# Preview the frame after the emoji column was replaced by integer class ids.
df.head()

Unnamed: 0,tweet,emojis
0,squad arriving game,545
1,dude like ‚Äô pounds dick long strong(always lit...,158
2,followers,293
3,cant breatiuhw,330
4,Ô∏è‚É£Ô∏è‚É£ hours 'til schedule drops!,0


In [22]:
# Sanity check: decode the class ids of rows 8..11 back to emoji, then show
# the raw ids of rows 8 and 11 side by side.
encoded = df['emojis']
print(le.inverse_transform(encoded.iloc[8:12]))
print(encoded.loc[8], encoded.loc[11])

['üòÇ' 'üíñ' '‚ù§' 'üòÇ']
469 469


In [23]:
# Number of distinct labels seen by the encoder (includes the 'NAN' placeholder
# when at least one tweet had no emoji).
print('the number of classes is ', le.classes_.shape[0])

the number of classes is  633


In [24]:
# Fit a word-level tokenizer on the cleaned tweets; num_words=5000 limits
# the vocabulary used when converting texts to sequences.
tokenizer = Tokenizer(num_words=5000, split=" ")
tweet_texts = df['tweet'].values
tokenizer.fit_on_texts(tweet_texts)

# Turn each tweet into a list of word ids from the tokenizer's word index,
# then pad the lists so every row has the same length.
X = pad_sequences(tokenizer.texts_to_sequences(tweet_texts))

X[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        378,    1, 2301, 3705,  826,  250,  720,   56,  162, 1657, 2302])

In [25]:
# Two stacked LSTM layers on top of a learned embedding, ending in a softmax
# over all emoji classes.
model = Sequential([
    # 5000 = vocabulary size (matches the tokenizer's num_words);
    # 256 = size of each word's embedding vector.
    Embedding(5000, 256, input_length=X.shape[1]),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2),
    LSTM(256, dropout=0.3, recurrent_dropout=0.2),
    # One output unit per label known to the encoder.
    Dense(len(le.classes_), activation='softmax'),
])

In [26]:
# Categorical cross-entropy expects one-hot encoded targets.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 22, 256)           1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 22, 256)           525312    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 633)               162681    
Total params: 2,493,305
Trainable params: 2,493,305
Non-trainable params: 0
_________________________________________________________________


In [27]:
# One-hot encode the integer class ids for categorical cross-entropy.
# The dtype is pinned: pandas' get_dummies default changed from uint8 to bool
# in pandas 2.0, and a float array is what the loss ultimately works with.
y = pd.get_dummies(df['emojis'], dtype='float32').to_numpy()

In [28]:
# Hold out 20% of the tweets for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [29]:
# Training hyper-parameters.
batch_size = 32
epochs = 25

# Train on the training split and report loss/accuracy on the held-out split
# after every epoch (verbose=2 prints one line per epoch).
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)

Epoch 1/25
250/250 - 60s - loss: 5.1907 - accuracy: 0.1147 - val_loss: 5.0051 - val_accuracy: 0.1200
Epoch 2/25
250/250 - 59s - loss: 4.9553 - accuracy: 0.1181 - val_loss: 4.8901 - val_accuracy: 0.1510
Epoch 3/25
250/250 - 58s - loss: 4.6337 - accuracy: 0.1640 - val_loss: 4.6974 - val_accuracy: 0.1770
Epoch 4/25
250/250 - 59s - loss: 4.1890 - accuracy: 0.2176 - val_loss: 4.6042 - val_accuracy: 0.2015
Epoch 5/25
250/250 - 60s - loss: 3.7804 - accuracy: 0.2731 - val_loss: 4.5619 - val_accuracy: 0.2410
Epoch 6/25
250/250 - 61s - loss: 3.4397 - accuracy: 0.3260 - val_loss: 4.6681 - val_accuracy: 0.2555
Epoch 7/25
250/250 - 57s - loss: 3.1360 - accuracy: 0.3766 - val_loss: 4.6822 - val_accuracy: 0.2710
Epoch 8/25
250/250 - 62s - loss: 2.8792 - accuracy: 0.4155 - val_loss: 4.8059 - val_accuracy: 0.2785
Epoch 9/25
250/250 - 59s - loss: 2.6606 - accuracy: 0.4521 - val_loss: 4.9876 - val_accuracy: 0.2825
Epoch 10/25
250/250 - 63s - loss: 2.4647 - accuracy: 0.4870 - val_loss: 5.0458 - val_accura

<tensorflow.python.keras.callbacks.History at 0x1ef8e88a730>

In [31]:
# Class scores for every held-out tweet; argmax over a row gives the
# predicted class id.
predictions = model.predict(X_test)

In [32]:
# Emoji predicted for held-out sample 50 (highest-scoring class, decoded).
predicted_id = predictions[50].argmax()
le.inverse_transform([predicted_id])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    8   10 1386]


array(['üòÄ'], dtype=object)

In [33]:
# True emoji for held-out sample 50 (position of the 1 in its one-hot row).
true_id = y_test[50].argmax()
le.inverse_transform([true_id])

array(['üëÄ'], dtype=object)

In [34]:
# Count held-out samples whose predicted class matches the true class (T)
# and those where it does not (F).
predicted_ids = np.argmax(predictions[:len(y_test)], axis=1)
true_ids = np.argmax(y_test, axis=1)

T = int(np.count_nonzero(predicted_ids == true_ids))
F = len(y_test) - T

print('the number of true values is :', T)
print('the number of false values is :', F)

the number of true values is : 686
the number of false values is : 1314


In [None]:
# Save the trained model to a single HDF5 file (.h5) in the working directory.
model.save('text_to_emoji.h5')

In [37]:
# Persist the fitted tokenizer so inference code can rebuild the same word index.
import json

tokenizer_json = tokenizer.to_json()
# NOTE(review): to_json() presumably already returns a JSON string, so this
# writes a JSON-encoded string; a loader would need json.load() before
# rebuilding the tokenizer. Kept as-is in case a consumer relies on it.
with open('tokenizer.json', 'w', encoding='utf-8') as out_file:
    json.dump(tokenizer_json, out_file, ensure_ascii=False)

In [38]:
# Persist the fitted label encoder so class ids can be mapped back to emoji
# at inference time.
# SECURITY: only unpickle le.obj from a trusted source -- pickle.load can
# execute arbitrary code.
import pickle

# The context manager closes the file even if pickle.dump raises.
with open("le.obj", "wb") as filehandler:
    pickle.dump(le, filehandler)