In [30]:
## This code aims to detect emotions from text using LSTM

!pip install tweet-preprocessor
!pip install nltk
!pip install scikit-learn
!pip install numpy
!pip install tensorflow
!pip install joblib



In [37]:
import pandas as pd
import preprocessor.api as p
import re
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np
from sklearn import metrics

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from google.colab import files
import io
import joblib

import types
import tempfile
import keras.models
import pickle

In [10]:
# Reading the downloaded content and turning it into a pandas dataframe

#df = pd.read_csv("text_emotion.csv")
#df.head()

## Alternatively, upload the file to google colab
uploaded = files.upload()

Saving text_emotion.csv to text_emotion (4).csv


In [12]:
df = pd.read_csv(io.StringIO(uploaded['text_emotion.csv'].decode('utf-8')))
df

Unnamed: 0,tweet_id,sentiment,author,content
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...
...,...,...,...,...
39995,1753918954,neutral,showMe_Heaven,@JohnLloydTaylor
39996,1753919001,love,drapeaux,Happy Mothers Day All my love
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [13]:
## Step 1: Data pre-processing


## Remove mentions and "#" symbol in tweet
df['clean_content'] = df.content.apply(lambda x: re.sub('@(\w+)','',x))
df['clean_content'] = df.clean_content.apply(lambda x: re.sub('#',"", x))

## Clean using the tweet-processing package, removing emojis and urls
df['clean_content'] = df.clean_content.apply(lambda x: p.clean(x))

## Remove unnecessary punctuation in the data, but tag ! and ?

def punctuation(val): 
  
    punctuations = '''()-[]{};:'"\,<>./@#$%^&_~'''
  
    for x in val.lower(): 
        if x in punctuations: 
            val = val.replace(x, " ") 
        elif x == "!":
            val = val.replace(x, " XXEXLMARK ")
        elif x == "?":
            val = val.replace(x, " XXQUESMARK ")
    return val

df['clean_content'] = df.clean_content.apply(lambda x: punctuation(x))

## Remove empty data 

df = df[df.clean_content != ""]

In [14]:
df.sentiment.value_counts()

neutral       8577
worry         8455
happiness     5208
sadness       5162
love          3841
surprise      2187
fun           1776
relief        1526
hate          1323
empty          815
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [15]:
## Step 2: Modelling the data
## We will use a LSTM model and train it on this dataset

## First, we encode the emotion as numbers
sent_id = {"anger":0, "hate":1, "worry":2, "sadness":3, "neutral":4, "empty":5, "boredom":6,
          "relief":7, "happiness":8, "love":9, "enthusiasm":10, "surprise":11, "fun":12}

df["sentiment_id"] = df['sentiment'].map(sent_id)

  
# Encode labels in column 'sent_id'. 
label_encoder = preprocessing.LabelEncoder()
integer_encoded = label_encoder.fit_transform(df.sentiment_id)

onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
Y = onehot_encoder.fit_transform(integer_encoded)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(df.clean_content,Y, random_state=69, test_size=0.2, shuffle=True)


In [17]:
## Train the LSTM model


# Use the tokenizer that comes with Keras.
tokenizer = Tokenizer(num_words=None)
tokenizer.fit_on_texts(list(X_train) + list(X_test))

max_len = 160
Epoch = 5

# Next, convert the text into padded sequences
X_train_pad = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_pad = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

In [18]:
w_idx = tokenizer.word_index

embed_dim = 160
lstm_out = 250

model = Sequential()
model.add(Embedding(len(w_idx) +1 , embed_dim,input_length = X_test_pad.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(13, activation='softmax'))
#adam rmsprop 
model.compile(loss = "categorical_crossentropy", optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 160, 160)          4744000   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 160, 160)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 250)               411000    
_________________________________________________________________
dense (Dense)                (None, 13)                3263      
Total params: 5,158,263
Trainable params: 5,158,263
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
## Fit the LSTM Model

batch_size = 32
model.fit(X_train_pad, y_train, epochs = Epoch, batch_size=batch_size,validation_data=(X_test_pad, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f79790e4710>

In [23]:
def clean_text(val):
    val = p.clean(val)
    val = re.sub('@(\w+)','',val)
    val = re.sub('#',"", val)
    val = punctuation(val)
    return val


def get_sentiment(model,text):
    text = clean_text(text)
    #tokenize
    twt = tokenizer.texts_to_sequences([text])
    twt = sequence.pad_sequences(twt, maxlen=max_len, dtype='int32')
    sentiment = model.predict(twt,batch_size=1,verbose = 2)
    sent = np.round(np.dot(sentiment,100).tolist(),0)[0]
    result = pd.DataFrame([sent_id.keys(),sent]).T
    result.columns = ["sentiment","percentage"]
    result=result[result.percentage !=0]
    return result.sort_values(by = ['percentage'], ascending = False).sentiment.iloc[0]

In [24]:
y_test_pred = model.predict(X_test_pad)

In [26]:
## Calculate the AUC score (% of correct predictions)

from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

auc_lstm = roc_auc_score(y_test, y_test_pred)
auc_lstm

0.6631185755733271

In [27]:
get_sentiment(model, "I am happy")

1/1 - 0s


'happiness'

In [36]:
def make_keras_picklable():
    def __getstate__(self):
        model_str = ""
        with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd:
            keras.models.save_model(self, fd.name, overwrite=True)
            model_str = fd.read()
        d = { 'model_str': model_str }
        return d

    def __setstate__(self, state):
        with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd:
            fd.write(state['model_str'])
            fd.flush()
            model = keras.models.load_model(fd.name)
        self.__dict__ = model.__dict__


    cls = keras.models.Model
    cls.__getstate__ = __getstate__
    cls.__setstate__ = __setstate__

In [38]:
make_keras_picklable()

m = model

pickle.dumps(m)