In [1]:
import re
import torch
from tqdm import tqdm_notebook

import pandas as pd
import numpy as np
import tensorflow as tf

from keras.preprocessing.text import Tokenizer

from tensorflow.keras import regularizers, initializers, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

MAX_NB_WORDS = 20000    # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 128 # max length of each entry (sentence), including padding
VALIDATION_SPLIT = 0.2   # data for validation (not used in training)
EMBEDDING_DIM = 50      # embedding dimensions for word vectors (word2vec/GloVe)
EMBEDDING_DIM_100 = 100      # embedding dimensions for word vectors (word2vec/GloVe)
EMBEDDING_DIM_200 = 200      # embedding dimensions for word vectors (word2vec/GloVe)
EMBEDDING_DIM_300 = 300      # embedding dimensions for word vectors (word2vec/GloVe)


GLOVE_DIR = "glove/glove.6B."+str(EMBEDDING_DIM)+"d.txt"
GLOVE_DIR_100 = "glove/glove.6B."+str(EMBEDDING_DIM_100)+"d.txt"
GLOVE_DIR_200 = "glove/glove.6B."+str(EMBEDDING_DIM_200)+"d.txt"
GLOVE_DIR_300 = "glove/glove.6B."+str(EMBEDDING_DIM_300)+"d.txt"

Using TensorFlow backend.


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce GTX 1080 Ti'

In [4]:
df = pd.read_csv('data/sarcasm_v2.csv')
df.head()

Unnamed: 0,Corpus,Label,ID,Quote Text,Response Text
0,GEN,sarc,GEN_sarc_0000,"First off, That's grade A USDA approved Libera...",Therefore you accept that the Republican party...
1,GEN,sarc,GEN_sarc_0001,watch it. Now you're using my lines. Poet has ...,More chattering from the peanut gallery? Haven...
2,GEN,sarc,GEN_sarc_0002,Because it will encourage teens to engage in r...,"Yep, suppressing natural behavior is always th..."
3,GEN,sarc,GEN_sarc_0003,Obviously you missed the point. So sorry the t...,"I guess we all missed your point Justine, what..."
4,GEN,sarc,GEN_sarc_0004,This is pure paranoia. What evidence do you ha...,"Evidence, I dont need no sticking evidence. Th..."


In [7]:
# Cleaning up dataframe columns
df = df.rename(columns = {'Quote Text': 'context', 'Response Text': 'response', 'Label': 'label'})
df['label'] = df['label'].map({'sarc': 1, 'notsarc': 0})
df.head()

Unnamed: 0,Corpus,label,ID,context,response
0,GEN,1,GEN_sarc_0000,"First off, That's grade A USDA approved Libera...",Therefore you accept that the Republican party...
1,GEN,1,GEN_sarc_0001,watch it. Now you're using my lines. Poet has ...,More chattering from the peanut gallery? Haven...
2,GEN,1,GEN_sarc_0002,Because it will encourage teens to engage in r...,"Yep, suppressing natural behavior is always th..."
3,GEN,1,GEN_sarc_0003,Obviously you missed the point. So sorry the t...,"I guess we all missed your point Justine, what..."
4,GEN,1,GEN_sarc_0004,This is pure paranoia. What evidence do you ha...,"Evidence, I dont need no sticking evidence. Th..."


In [8]:
df = df.reindex(np.random.permutation(df.index))

In [9]:
# Cleaning text
def clean_text(text):
    output = ""
    text = str(text).replace("\n", "")
    text = re.sub(r'[^\w\s]','',text).lower()
    return text

In [10]:
df.loc[:,'clean_context'] = df['context'].apply(clean_text)
df.loc[:,'clean_response'] = df['response'].apply(clean_text)

#### Label Values

In [12]:
labels = df['label'].values

## Tokenizing 

#### Context Dataframe Pre-Processing

In [13]:
# Isolating the columns of interest
context = df[['clean_context', 'label']]

#Tokenizing
tknzr = Tokenizer(num_words=MAX_NB_WORDS)
tknzr.fit_on_texts(context['clean_context'])
sequences = tknzr.texts_to_sequences(context['clean_context'])
context_data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)

In [14]:
context_data.shape

(4692, 128)

In [15]:
word_index = tknzr.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 19048


#### Response Dataframe Pre-Processing

In [16]:
# Isolating the columns of interest
response = df[['clean_response', 'label']]

#Tokenizing
tknzr = Tokenizer(num_words=MAX_NB_WORDS)
tknzr.fit_on_texts(response['clean_response'])
sequences = tknzr.texts_to_sequences(response['clean_response'])
response_data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)

In [17]:
response_data.shape

(4692, 128)

In [18]:
word_index = tknzr.word_index
print('Vocabulary size:', len(word_index))

Vocabulary size: 16990


## Creating Training and Test Set

In [19]:
num_validation_samples = int(VALIDATION_SPLIT*context_data.shape[0])

#### Labels

In [20]:
labels_train = labels[: -num_validation_samples]
labels_test = labels[-num_validation_samples:]

#### Context

In [21]:
context_train = context_data[: -num_validation_samples]
context_test = context_data[-num_validation_samples:]

#### Response

In [22]:
response_train = response_data[: -num_validation_samples]
response_test = response_data[-num_validation_samples:]

## Model

In [23]:
embeddings_index = {}
#f = open(GLOVE_DIR)
#print('Loading GloVe from:', GLOVE_DIR,'...', end='')
f = open(GLOVE_DIR_100)
print('Loading GloVe from:', GLOVE_DIR_100,'...', end='')
#f = open(GLOVE_DIR_300)
#print('Loading GloVe from:', GLOVE_DIR_300,'...', end='')

Loading GloVe from: glove/glove.6B.100d.txt ...

In [24]:
for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32') 
f.close()
print("Done.\n Proceeding with Embedding Matrix...", end="")

#embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM_100))
#embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM_300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print(" Completed!")

Done.
 Proceeding with Embedding Matrix... Completed!


In [25]:
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_layer = Embedding(len(word_index) + 1,
                           #EMBEDDING_DIM,
                           EMBEDDING_DIM_100,
                           #EMBEDDING_DIM_300,  
                           weights = [embedding_matrix],
                           input_length = MAX_SEQUENCE_LENGTH,
                           trainable=False,
                           name = 'embeddings')
embedded_sequences = embedding_layer(sequence_input)

W1212 22:54:31.267709 140456920401728 deprecation.py:506] From /opt/conda/lib/python3.7/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


### Convolution

In [26]:
print('Build model...')
model = tf.keras.Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(MAX_NB_WORDS,
                    #EMBEDDING_DIM,
                    EMBEDDING_DIM_100,
                    #EMBEDDING_DIM_300,
                    input_length=MAX_SEQUENCE_LENGTH))
                    #input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(300, #128
                 3,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(128))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
#model.add(Activation('softmax'))

#model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

print(model.summary())
model.fit(response_train, labels_train,
          batch_size=32,
          #batch_size=20,
          #batch_size=10,
          #epochs=10,
          epochs=10,
          validation_split=0.2)
          #validation_data=(response_test, labels_test))
scores = model.evaluate(response_test, labels_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

W1212 22:54:36.069691 140456920401728 deprecation.py:506] From /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W1212 22:54:36.192432 140456920401728 deprecation.py:323] From /opt/conda/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Build model...
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 128, 100)          2000000   
_________________________________________________________________
dropout (Dropout)            (None, 128, 100)          0         
_________________________________________________________________
conv1d (Conv1D)              (None, 126, 300)          90300     
_________________________________________________________________
global_max_pooling1d (Global (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               38528     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
activation (Activation)      (None, 128) 

In [27]:
predictions = np.round(np.array(model.predict(response_test, batch_size=32)))
actualPredictions = []
actualLabels = []
for i in predictions:
    if (i == 1):
        actualPredictions.append(1)
    else:
        actualPredictions.append(0)
for i in labels_test:
    if (i == 1):
        actualLabels.append(1)
    else:
        actualLabels.append(0)

In [28]:
tn = 0
fp = 0
fn = 0
tp = 0
numCorrect = 0

for i in range(len(actualPredictions)):
    if actualPredictions[i] == actualLabels[i]:
        if actualPredictions[i] == 0:
            tp += 1
        else:
            tn += 1
        numCorrect += 1
    else:
        if actualPredictions[i] == 0:
            fn += 1
        else:
            fp += 1

print("tp " + str(tp))
print("tn " + str(tn))
print("fp " + str(fp))
print("fn " + str(fn))

tp 304
tn 337
fp 150
fn 147


In [29]:
print("acc: " + str((1.0*numCorrect)/(len(actualPredictions))))
precision = (1.0*tp)/(tp+fp)
recall = (1.0*tp)/(tp+fn)

acc: 0.6833688699360341


In [30]:
confusionMatrix = np.zeros((2,2))
temp = confusionMatrix[0][0]
confusionMatrix[0][0] = recall
confusionMatrix[0][1] = 1-recall
confusionMatrix[1][1] = float(tn)/(tn+fp)
confusionMatrix[1][0] = 1-confusionMatrix[1][1]
print(confusionMatrix)
print("precision: " + str(precision))
print("recall: " + str(recall))

[[0.67405765 0.32594235]
 [0.30800821 0.69199179]]
precision: 0.6696035242290749
recall: 0.6740576496674058


In [31]:
#Based on: https://datascience.stackexchange.com/questions/45165/how-to-get-accuracy-f1-precision-and-recall-for-a-keras-model
from keras import backend as K

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [32]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc',f1_m,precision_m, recall_m])

In [33]:
hist = model.fit(response_train, labels_train, validation_split=0.3, epochs=10, verbose=0)

In [34]:
loss_t, accuracy_t, f1_score_t, precision_t, recall_t = model.evaluate(response_test, labels_test, verbose=0)

In [35]:
loss_t, accuracy_t, f1_score_t, precision_t, recall_t

(1.7396234896645617, 0.6759062, 0.6679221, 0.7117318, 0.6451939)

### Vanilla LSTM Model

In [36]:
x = LSTM(300, return_sequences=True,name='lstm_layer')(embedded_sequences)
x = GlobalMaxPool1D()(x)
x = Dropout(0.2)(x)
x = Dense(128)(x)
x = Dropout(0.2)(x)
x = Dense(128, activation="relu")(x)
x = Dropout(0.2)(x)
preds = Dense(2, activation="softmax")(x)

In [54]:
from keras import backend as K

def recall_r(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_r(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

In [55]:
from keras.metrics import categorical_accuracy
model = Model(sequence_input, preds)

model.compile(loss = 'sparse_categorical_crossentropy',              
             optimizer='adam',
             #metrics = ['accuracy'])
              metrics = ['accuracy', precision_r, recall_r])
print(model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embeddings (Embedding)       (None, 128, 100)          1699100   
_________________________________________________________________
lstm_layer (LSTM)            (None, 128, 300)          481200    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 300)               0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               38528     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0   

In [56]:
print('Training progress:')
history = model.fit(response_train, labels_train, epochs = 3, batch_size=32, 
                    validation_split=0.2)
scores = model.evaluate(response_test, labels_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Training progress:
Train on 3003 samples, validate on 751 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 70.26%


In [57]:
loss, accuracy, precision, recall = model.evaluate(response_test, labels_test, verbose=0)

In [58]:
accuracy, precision, recall

(0.70255864, 0.5179167, 1.0)