# Sentence Classification with Keras and ELMO Embeddings

In [2]:
# Base Imports 
import json
import pandas as pd
from pprint import pprint
# Import our dependencies
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
from keras import backend as K
import keras.layers as layers
from keras.models import Model, load_model
from keras.engine import Layer
import numpy as np
from sklearn.model_selection import train_test_split
from keras.utils import multi_gpu_model
# Initialize session and make sure tensorflow doesn't hog all of the gpu for itself by setting the allow growth config
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
sess = tf.Session(config=config)
K.set_session(sess)

W0524 14:06:20.805979 31728 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14
Using TensorFlow backend.


In [3]:
# Ignore all this it's code to run benchmarks
# with open("H:/NLP/intent/NLU-Evaluation-Corpora/Export.json",encoding="latin-1") as f:
#     a = json.loads(f.read())

In [4]:
# dft = pd.DataFrame(columns=["text","intents"])
# for i in a["resource"]["intents"]:
#     intnt =  i["name"]
#     print(len(i["sampleUtterances"]))
#     for j in i["sampleUtterances"]:
#         dft = dft.append({'text': j,'intents':intnt}, ignore_index=True)
# dft

In [None]:
# df = pd.read_json(json.dumps(a["sentences"]))

In [None]:
# df.columns

In [5]:
#Read training data -- replace path with current local path for your files
df1 = pd.read_csv("emotion.csv",names=["text","intents"])

In [13]:
#Multiprocessing - Code to parallelize data preprocessing
import pandas as pd
import dask.dataframe as dd
from dask.multiprocessing import get
# df1["a"] = pd.to_numeric(df["a"])
def label_data(row):
    import string
    #Do any row transformations you need to do in this function. 
    return row["text"].lower().translate(str.maketrans('', '', string.punctuation))
#SPEED BOOST 
data = df1 #<--Input Dataframe
ddata = dd.from_pandas(data, npartitions=30)#You might want to reduce the number of partitions based on availiable threads I had 16 cores
df1['text'] = ddata.map_partitions(lambda df1: df1.apply((lambda row: label_data(row)), axis=1)).compute(scheduler='threads') 

In [14]:
df1.columns

Index(['text', 'intents'], dtype='object')

In [16]:
#15884 sentences classified as 7 intents
df1=df1[1:]

In [17]:
df1.sample(5)

Unnamed: 0,text,intents
2157,i felt something akin to shame after a heavy n...,shame
6528,when my cousin passed away,sadness
6305,my lovely girlfriend doublecrossed me and so t...,anger
6307,i was disgusted when my brother was arrested b...,disgust
2561,i was waiting to receive the participation on ...,anger


In [18]:
# Create a custom layer that allows us to update weights (lambda layers do not have trainable parameters!)
# Essentially the advantage of doing this is that we can have elmo embeddings in our network that can be trained 
# Other methods only allow you load in pre trained elmo embeddings

class ElmoEmbeddingLayer(Layer):
    def __init__(self, **kwargs):
        self.dimensions = 1024
        self.trainable=True
        super(ElmoEmbeddingLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.elmo = hub.Module('https://tfhub.dev/google/elmo/2', trainable=self.trainable,
                               name="{}_module".format(self.name))

        self.trainable_weights += K.tf.trainable_variables(scope="^{}_module/.*".format(self.name))
        super(ElmoEmbeddingLayer, self).build(input_shape)

    def call(self, x, mask=None):
        result = self.elmo(K.squeeze(K.cast(x, tf.string), axis=1),
                      as_dict=True,
                      signature='default',
                      )['default']
        return result

    def compute_mask(self, inputs, mask=None):
        return K.not_equal(inputs, '--PAD--')

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.dimensions)

In [19]:
# One hot encode the intents
df2 = pd.get_dummies(df1,columns=['intents'])

In [20]:
# Shuffle Dataset
df2 =df2.sample(frac=1).reset_index(drop=True)

In [21]:
df2.sample(5)

Unnamed: 0,text,intents_anger,intents_disgust,intents_fear,intents_guilt,intents_joy,intents_sadness,intents_shame
6486,i reached the bus stop and realized that i had...,1,0,0,0,0,0,0
5352,i was trying to have sex with my best friends ...,0,0,0,1,0,0,0
4836,the way my husband and his family treated me ...,0,1,0,0,0,0,0
7011,when i was in the 12th standard i could not sp...,0,0,0,0,0,0,1
4902,my best friend started moving out with my boyf...,1,0,0,0,0,0,0


In [22]:
# Function to build model
# Can be tweaked based on size of dataset to prevent overfitting
# Change pred layer to match the size of your intent set- currenty set to 7
def build_model(): 
  input_text = layers.Input(shape=(1,), dtype="string")
  embedding = ElmoEmbeddingLayer()(input_text)
  dense1 = layers.Dense(1024, activation='relu')(embedding)
  dense2 = layers.Dense(256,activation="relu")(dense1)
  pred = layers.Dense(7, activation='softmax')(dense2)

  model = Model(inputs=[input_text], outputs=pred)
#   model = multi_gpu_model(base_model, gpus=4)
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.summary()
  
  return(model)

In [23]:
# Preprocessing to get the data in a suitable shape for elmo embedding
train_text = df2['text'].tolist()
train_text = [' '.join(t.split()) for t in train_text]
train_text = np.array(train_text, dtype=object)[:, np.newaxis]

In [24]:
df2.columns

Index(['text', 'intents_anger', 'intents_disgust', 'intents_fear',
       'intents_guilt', 'intents_joy', 'intents_sadness', 'intents_shame'],
      dtype='object')

In [25]:
# To features and targets
x = df2[["text"]]
y= df2[['intents_anger', 'intents_disgust', 'intents_fear',
       'intents_guilt', 'intents_joy', 'intents_sadness', 'intents_shame']]

In [26]:
# Test - Train split
x_train, x_test, y_train, y_test = train_test_split(    
    x, y, test_size=0.15, random_state=420)

In [27]:
model = build_model()

Instructions for updating:
Colocations handled automatically by placer.


W0524 14:11:29.388374 31728 deprecation.py:323] From C:\Users\mehul.kumar\AppData\Local\Continuum\anaconda3\envs\local\lib\site-packages\tensorflow\python\ops\control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0524 14:11:30.387258 31728 saver.py:1483] Saver not created because there are no variables in the graph to restore


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1)                 0         
_________________________________________________________________
elmo_embedding_layer_1 (Elmo (None, 1024)              4         
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_2 (Dense)              (None, 256)               262400    
_________________________________________________________________
dense_3 (Dense)              (None, 7)                 1799      
Total params: 1,313,803
Trainable params: 1,313,803
Non-trainable params: 0
_________________________________________________________________


In [None]:
# IMPORTANT : Change batch size to match how much memmory you have left on your GPU 
# I reccomend starting small and looking at how much of your GPU memmory is being used and scaling up from there
# Change epochs based on size of dataset...Early stopping can be incorporated if you dont want to do that
model.fit(x_train, 
          y_train,
          validation_data=(x_test, y_test),
          epochs=10,
          batch_size=32)

Instructions for updating:
Use tf.cast instead.


W0524 14:11:30.704404 31728 deprecation.py:323] From C:\Users\mehul.kumar\AppData\Local\Continuum\anaconda3\envs\local\lib\site-packages\tensorflow\python\ops\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.


Train on 6401 samples, validate on 1130 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

In [None]:
# Save all our hard work
model.save('emot.h5')

### Making predictions on test data and post processing to make sure it can be used for visualizations

In [None]:
a = model.predict(x_test)
pred = np.argmax(a,axis=1)

In [None]:
y_test2 = np.argmax(y_test.values.astype(np.float32),axis=1)

In [None]:
strList = ['intents_anger', 'intents_disgust', 'intents_fear',
       'intents_guilt', 'intents_joy', 'intents_sadness', 'intents_shame']

In [None]:
result = zip(a[0], strList)

In [None]:
set(result)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import confusion_matrix
from sklearn import preprocessing
def plot_confusion_matrix(cm, names, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(names))
    plt.xticks(tick_marks, names, rotation=45)
    plt.yticks(tick_marks, names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_
cm = confusion_matrix(y_test2, pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
intent = encode_text_index(df1,'intents')
plot_confusion_matrix(cm, intent)

# Normalize the confusion matrix by row (i.e by the number of samples
# in each class)
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized confusion matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, intent, title='Normalized confusion matrix')

plt.show()

### Looks like it does a good job but lets try it out by ourselves to make sure

In [5]:
model.load_weights('emo.h5')

In [13]:
res = model.predict(np.array([input()]))
strList = ['intents_anger', 'intents_disgust', 'intents_fear',
       'intents_guilt', 'intents_joy', 'intents_sadness', 'intents_shame']
result = zip(res[0], strList)
set(result)

What a wonderful day to be at the park


{(3.561739e-12, 'intents_anger'),
 (8.854974e-12, 'intents_guilt'),
 (2.7508438e-09, 'intents_shame'),
 (4.4459987e-09, 'intents_disgust'),
 (3.0533151e-06, 'intents_fear'),
 (3.5546036e-06, 'intents_sadness'),
 (0.9999933, 'intents_joy')}

In [None]:
#predicted answers are at the bottom with highest probabiltiy

In [8]:
# CC coordinating conjunction
# CD cardinal digit
# DT determiner
# EX existential there (like: “there is” … think of it like “there exists”)
# FW foreign word
# IN preposition/subordinating conjunction
# JJ adjective ‘big’
# JJR adjective, comparative ‘bigger’
# JJS adjective, superlative ‘biggest’
# LS list marker 1)
# MD modal could, will
# NN noun, singular ‘desk’
# NNS noun plural ‘desks’
# NNP proper noun, singular ‘Harrison’
# NNPS proper noun, plural ‘Americans’
# PDT predeterminer ‘all the kids’
# POS possessive ending parent’s
# PRP personal pronoun I, he, she
# PRP$ possessive pronoun my, his, hers
# RB adverb very, silently,
# RBR adverb, comparative better
# RBS adverb, superlative best
# RP particle give up
# TO, to go ‘to’ the store.
# UH interjection, errrrrrrrm
# VB verb, base form take
# VBD verb, past tense took
# VBG verb, gerund/present participle taking
# VBN verb, past participle taken
# VBP verb, sing. present, non-3d take
# VBZ verb, 3rd person sing. present takes
# WDT wh-determiner which
# WP wh-pronoun who, what
# WP$ possessive wh-pronoun whose
# WRB wh-abverb where, when

In [9]:
topic = ""
for i in a:
    if i[1][0] == "N":
        topic += (i[0]+" ")
        continue
    else:
        if topic != "":
            print(topic)
            topic = ""
            continue
if topic != "":
    print(topic)

joke 


In [10]:
a

[('tell', 'VB'), ('me', 'PRP'), ('a', 'DT'), ('russian', 'JJ'), ('joke', 'NN')]

In [None]:
sentence = input("Sentence in lower case : ")
final_sentence = ""
for i in enumerate(sentence.split()):
    j = sentence.split()
    j1= sentence.split()
    bef = pos_tag(sentence.split())
    j[i[0]] = j[i[0]].capitalize()
    sent = ' '.join(word for word in j)
    aft = pos_tag(word_tokenize(sent))
    #print(bef[i[0]][1][0])
    if bef[i[0]][1] != aft[i[0]][1] or bef[i[0]][1][0]=="N":
        final_sentence += (j[i[0]]+" ")
        #sentence[i[0]] = j[i[0]]
    else:
        final_sentence += (j1[i[0]]+" ")

In [None]:
final_sentence.strip()

In [None]:
a = pos_tag(word_tokenize(final_sentence))
topic = ""
for i in a:
    if i[1][0] == "N":
        topic += (i[0]+" ")
        continue
    else:
        if topic != "":
            print(topic)
            topic = ""
            continue
if topic != "":
    print(topic)

In [None]:
pos_tag(word_tokenize("Yan Goodfellow Works for google brain"))