### Dataset
[https://www.kaggle.com/praveengovi/emotions-dataset-for-nlp](https://www.kaggle.com/praveengovi/emotions-dataset-for-nlp)

this project will suggest emojis when provided with a text

### imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras import regularizers
from keras.models import Sequential
from keras.layers import LSTM

from numpy import zeros
from keras.layers import Flatten
from keras.layers.core import Activation, Dropout, Dense

### join datasets
> the dataset was already split into train,test,validation...this block joins the datasets into a single dataset so that the dataset split can be carried out later

In [2]:
#column names
labels=["text","feeling"]
# read datasets
chunk1=pd.read_csv("data/test.txt",delimiter=";",names=labels,skiprows=1)
chunk2=pd.read_csv("data/train.txt",delimiter=";",names=labels,skiprows=1)
chunk3=pd.read_csv("data/val.txt",delimiter=";",names=labels,skiprows=1)

# join datasets
data=pd.concat([chunk1,chunk2,chunk3], axis=0)

# reset index since concatinating keeps the orginal index of the datasets
data.reset_index(drop=True, inplace=True)
data.tail()

Unnamed: 0,text,feeling
19992,im having ssa examination tomorrow in the morn...,sadness
19993,i constantly worry about their fight against n...,joy
19994,i feel its important to share this info for th...,joy
19995,i truly feel that if you are passionate enough...,joy
19996,i feel like i just wanna buy any cute make up ...,joy


### preprocessing

In [3]:
# cleaning
# all text to lower case
data['text'] = data['text'].apply(lambda x: x.lower())
# removing common words - noise
x_words=["i ","im ","feel ","feeling "]
for x_word in x_words:
    data['text'] = data['text'].apply(lambda x: x.replace(x_word, ' '))
# preview cleaned data
data.head()

Unnamed: 0,text,feeling
0,updating my blog because shitty,sadness
1,never make her separate from me because don ...,sadness
2,left with my bouquet of red and yellow tulips...,joy
3,was a little vain when did this one,sadness
4,cant walk into a shop anywhere where do not ...,fear


In [4]:
# encoding

tmp_ct=data["feeling"].astype("category")
#data["feeling"] = tmp_ct.cat.codes

# used to refer to the category codes
# i.e {0: 'anger', 1: 'fear', 2: 'joy', 3: 'love', 4: 'sadness', 5: 'surprise'}
category_map=dict(enumerate(tmp_ct.cat.categories))

# preview encoded data
data.tail()

Unnamed: 0,text,feeling
19992,having ssa examination tomorrow in the mornin...,sadness
19993,constantly worry about their fight against na...,joy
19994,its important to share this info for those t...,joy
19995,truly that if you are passionate enough abou...,joy
19996,like just wanna buy any cute make up see o...,joy


### train test validation split

In [5]:
X=data["text"]
y=data["feeling"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [6]:
def emoji(feeling):
    if feeling=="anger":return "😡"
    if feeling=="joy":return "😃"
    if feeling=="love":return "🥰"
    if feeling=="sadness":return "🙁"
    if feeling=="surprise":return "😲"

In [7]:
max_len = 100
max_words = 5000
tokenizer=Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(list(X_train))
vocab_size = len(tokenizer.word_index) + 1

sequences = tokenizer.texts_to_sequences(X_train.values)
X_train_seq = pad_sequences(sequences,maxlen=max_len)
X_test_seq = pad_sequences(sequences,maxlen=max_len)

In [8]:
model = Sequential()
embedding_layer = Embedding(vocab_size, 100,input_length=100,trainable=False)
model.add(embedding_layer)
model.add(LSTM(15,dropout=0.5))
model.add(Dense(1,activation='softmax'))

#model.add(Dense(1, activation='sigmoid'))

#model.add(Dense(64,activation='relu'))
#model.add(Dense(2,activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])

print(model.summary())
history = model.fit(np.array(X_train_seq), np.array(y_train), epochs=20, verbose=1)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          1405900   
_________________________________________________________________
lstm (LSTM)                  (None, 15)                6960      
_________________________________________________________________
dense (Dense)                (None, 1)                 16        
Total params: 1,412,876
Trainable params: 6,976
Non-trainable params: 1,405,900
_________________________________________________________________
None
Epoch 1/20


UnimplementedError:  Cast string to float is not supported
	 [[node categorical_crossentropy/Cast (defined at /usr/local/lib/python3.9/site-packages/keras/losses.py:1654) ]] [Op:__inference_train_function_3142]

Errors may have originated from an input operation.
Input Source operations connected to node categorical_crossentropy/Cast:
 ExpandDims (defined at /usr/local/lib/python3.9/site-packages/keras/engine/data_adapter.py:1449)

Function call stack:
train_function


In [None]:
X_train.dtypes
