In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# import libraries

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from wordcloud import WordCloud
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.preprocessing import LabelEncoder


from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense,LSTM,Embedding,Bidirectional,Dropout

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import BertTokenizer,BertConfig,AdamW,BertForSequenceClassification,get_linear_schedule_with_warmup

# import datasets

In [20]:
# The three splits share one layout: headerless, ';'-separated rows that are
# read into the columns "comment" and "emotion".
_read_opts = dict(header=None, sep=";", names=["comment", "emotion"], encoding="utf-8")

train_data = pd.read_csv("../input/emotions-dataset-for-nlp/train.txt", **_read_opts)
test_data = pd.read_csv("../input/emotions-dataset-for-nlp/test.txt", **_read_opts)
validate_data = pd.read_csv("../input/emotions-dataset-for-nlp/val.txt", **_read_opts)

# data processing

In [22]:
# Only the last bare expression of a cell is rendered, so collect the three
# shapes into one mapping instead of evaluating (and discarding) the first two.
{
    "train": train_data.shape,
    "test": test_data.shape,
    "validation": validate_data.shape,
}

In [18]:
# Preview the first rows of the training split.
train_data.head()

In [24]:
# Store the character count of every comment next to its text.
train_data["length"] = train_data["comment"].map(len)
train_data.head()

In [27]:
# Length (in characters) of the longest training comment labelled "sadness".
train_data.loc[train_data["emotion"] == "sadness", "length"].max()

In [29]:
# Every training row whose emotion is "love".
train_data.loc[train_data["emotion"].eq("love")]

In [33]:
# Per-emotion density of comment length, one panel per split.
all_data = {"train data": train_data, "validation data": validate_data, "test data": test_data}
fig, ax = plt.subplots(1, 3, figsize=(30, 10))
for axis, (split_name, split_df) in zip(ax, all_data.items()):
    # Work on a derived frame so the splits themselves are not modified here;
    # the validation and test frames have no "length" column of their own.
    with_length = split_df.assign(length=split_df["comment"].map(len))
    sns.kdeplot(data=with_length, x="length", hue="emotion", ax=axis)
    # The dictionary keys name the splits; use them so each panel is identifiable.
    axis.set_title(split_name)
    axis.set_xlabel("comment length (characters)")
plt.show()

# word cloud helper function

In [42]:
def words_cloud(wordcloud, df):
    """Draw a word cloud on a new 10x10 inch figure with the axes hidden.

    Args:
        wordcloud: Image-like object handed unchanged to ``plt.imshow``.
        df: Text placed at the start of the figure title; it is concatenated
            with ``" word cloud "``, so it must be a string. The loop in this
            notebook passes the emotion label here.
    """
    plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud)
    heading = df + " word cloud "
    plt.title(heading, size=16)
    plt.axis("off")

In [43]:
# Distinct emotion labels present in the training split.
emotions_list = pd.unique(train_data["emotion"])
emotions_list

In [44]:
for emotion in emotions_list:
    # Glue every comment of this emotion into one string for the generator.
    comments = train_data.loc[train_data["emotion"] == emotion, "comment"]
    text = '  '.join(comments)
    wordcloud = WordCloud(width=600, height=600).generate(text)

    words_cloud(wordcloud, emotion)

# label encoding (emotion names to integer codes; one-hot conversion happens later)

In [46]:
lb = LabelEncoder()
# Learn the label -> integer mapping from the training split only, then apply
# that same mapping to the other splits. Re-fitting on each split would only
# agree with training if every split happened to contain the same label set,
# and it would leave `lb` describing the last split rather than the training data.
train_data["emotion"] = lb.fit_transform(train_data["emotion"])
# `transform` raises ValueError on a label that was not seen during fitting,
# which surfaces a mismatch instead of silently assigning a different code.
test_data["emotion"] = lb.transform(test_data["emotion"])
validate_data["emotion"] = lb.transform(validate_data["emotion"])


In [47]:
# Preview the training split after the "emotion" column was replaced by integer codes.
train_data.head()

In [48]:
# Fetch the NLTK stop-word corpus so the lookup below can find it.
nltk.download("stopwords")
# NOTE: this rebinds the name `stopwords` (imported above from nltk.corpus) to a
# plain set of English stop words. The cleaning functions test word membership
# against this set, so the corpus reader is reached via `nltk.corpus.stopwords`.
stopwords = set(nltk.corpus.stopwords.words("english"))

In [62]:
# Number of hash buckets used when words are mapped to integer ids.
vocabSize = 11000

# Longest training comment measured in characters (the "length" column is
# len(comment)). It is reused as the padded sequence length, which is safe
# because a comment cannot contain more words than characters.
max_len = train_data["length"].max()
# Keep the display as the last expression so the value is actually rendered.
max_len


# text cleaning

In [164]:
from tensorflow.keras.preprocessing.text import one_hot
def text_cleaning(df, column):
    """Turn a text column into a padded matrix of hashed word ids.

    Every entry of ``df[column]`` has each non-letter character replaced by a
    space, is lower-cased, split on whitespace, filtered against the
    module-level ``stopwords`` collection and stemmed with the Porter stemmer.
    The cleaned text is then encoded with ``one_hot`` using ``vocabSize``
    buckets and padded at the front (``padding='pre'``) to ``max_len``.

    The shape of the resulting array is printed as a side effect.

    Args:
        df: Frame (or mapping) for which ``df[column]`` iterates over strings.
        column: Key of the text column to encode.

    Returns:
        The array produced by ``pad_sequences``, one row per input text.
    """
    porter = PorterStemmer()
    non_letters = re.compile("[^a-zA-Z]")

    cleaned_docs = []
    for raw in df[column]:
        tokens = non_letters.sub(" ", raw).lower().split()
        kept = [porter.stem(token) for token in tokens if token not in stopwords]
        cleaned_docs.append(" ".join(kept))

    # NOTE(review): Keras documents one_hot as relying on a hash that is not
    # stable across interpreter runs - confirm before persisting a model.
    encoded = [one_hot(input_text=doc, n=vocabSize) for doc in cleaned_docs]
    padded = pad_sequences(sequences=encoded, maxlen=max_len, padding='pre')
    print(padded.shape)
    return padded

In [165]:
# Encode the "comment" column of each split, in the order train, test, validation.
x_train, x_test, x_val = (
    text_cleaning(split, "comment")
    for split in (train_data, test_data, validate_data)
)

In [166]:
# Target column of each split, in the order train, test, validation.
y_train, y_test, y_val = (
    split["emotion"] for split in (train_data, test_data, validate_data)
)

In [167]:

# Fix the one-hot width from the fitted encoder: without `num_classes`,
# to_categorical infers the width from the largest code present, so a split
# that lacks the highest class would get a narrower matrix.
num_classes = len(lb.classes_)

# Read the integer codes straight from the frames so that re-running this cell
# does not one-hot encode arrays that are already one-hot.
y_train = to_categorical(train_data["emotion"], num_classes=num_classes)
y_test = to_categorical(test_data["emotion"], num_classes=num_classes)
y_val = to_categorical(validate_data["emotion"], num_classes=num_classes)

# Sequential model

In [168]:
# Embedding -> LSTM -> dense classifier, with dropout after each stage.
model = Sequential([
    # The inputs are padded to `max_len`, so declare that length here instead
    # of a hard-coded 300 that only matches while the longest comment is 300.
    Embedding(input_dim=vocabSize, output_dim=150, input_length=int(max_len)),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(64, activation='sigmoid'),
    Dropout(0.2),
    # Softmax over 6 outputs - presumably one per emotion class; must match
    # the width of the one-hot targets.
    Dense(6, activation='softmax'),
])

In [169]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [170]:
# Stop after 3 epochs without a better validation loss and roll the weights
# back to the best epoch seen.
callback = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True,
)

In [171]:
# Shape of the encoded, padded training inputs.
x_train.shape

In [172]:
# Shape of the one-hot encoded training targets.
y_train.shape


# fit data in model

In [173]:
# Train with early stopping driven by the validation split; `hist` keeps the
# per-epoch metrics for the curves plotted afterwards.
hist = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=64,
    callbacks=[callback],
    verbose=1,
)

In [174]:
# The validation split steered early stopping, so its score is optimistic.
# Also score the test split, which is encoded above but otherwise never used.
val_scores = model.evaluate(x_val, y_val, verbose=1)
test_scores = model.evaluate(x_test, y_test, verbose=1)
# Each entry is what `evaluate` returns for the compiled loss and metrics.
{"validation": val_scores, "test": test_scores}

# accuracy and loss curves

In [175]:
# Per-epoch metrics recorded by `model.fit`.
accuracy = hist.history["accuracy"]
val_acc = hist.history["val_accuracy"]
loss = hist.history["loss"]
val_loss = hist.history["val_loss"]
epochs = range(len(accuracy))

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

ax_acc.plot(epochs, accuracy, "b", label="training accuracy")
ax_acc.plot(epochs, val_acc, "r", label="validation accuracy")
ax_acc.set_title("training and validation accuracy")
ax_acc.set_xlabel("epoch")
ax_acc.legend()

# The loss panel must plot the loss series; it previously re-plotted the
# accuracy curves under loss labels.
ax_loss.plot(epochs, loss, "b", label="training loss")
ax_loss.plot(epochs, val_loss, "r", label="validation loss")
ax_loss.set_title("training and validation loss")
ax_loss.set_xlabel("epoch")
ax_loss.legend()

plt.show()

# sentence cleaning

In [177]:
def sentence_cleaning(sentence):
    """Encode one raw sentence the same way the training text is encoded.

    Non-letter characters become spaces, the text is lower-cased and split on
    whitespace, words found in the module-level ``stopwords`` collection are
    dropped and the rest are Porter-stemmed. The cleaned sentence is hashed
    with ``one_hot`` (``vocabSize`` buckets) and padded at the front to
    ``max_len``.

    Args:
        sentence: The raw text to encode.

    Returns:
        The ``pad_sequences`` result for a batch holding this single sentence.
    """
    porter = PorterStemmer()
    tokens = re.sub("[^a-zA-Z]", " ", sentence).lower().split()
    cleaned = " ".join(porter.stem(token) for token in tokens if token not in stopwords)

    # Wrap in a list: pad_sequences expects a batch of sequences.
    encoded = [one_hot(input_text=cleaned, n=vocabSize)]
    return pad_sequences(sequences=encoded, maxlen=max_len, padding="pre")

# sentence input

In [195]:
# Example inputs for the prediction demo.
sentences = [
    "Don’t you dare tell lies to me?",
    "For God’s sake, leave me alone!",
    "For f*ck’s sake!",
]

In [196]:
# Echo the raw example sentences, one per line.
for sentence in sentences:
    print(sentence)

In [197]:

for sentence in sentences:
    print(sentence)
    # `sentence` is rebound to its encoded form on purpose: code further down
    # may still read it after the loop, so the binding is kept as it was.
    sentence = sentence_cleaning(sentence)
    # Run the network once and reuse the probabilities for both the predicted
    # label and its confidence instead of predicting twice.
    probabilities = model.predict(sentence)
    result = lb.inverse_transform(np.argmax(probabilities, axis=-1))[0]
    proba = np.max(probabilities)
    print(f"{result} : {proba}\n\n")

# predict emotions

In [193]:
# Class probabilities for the last example sentence. Encode it explicitly
# rather than relying on whatever `sentence` was left bound to by an earlier
# loop, so the cell does not depend on hidden kernel state.
model.predict(sentence_cleaning(sentences[-1]))