In [2]:
# imports
import json
import urllib.request


In [3]:
# open the training file
url = "https://raw.githubusercontent.com/rishabhmisra/News-Headlines-Dataset-For-Sarcasm-Detection/master/Sarcasm_Headlines_Dataset.json"
# url = "https://raw.githubusercontent.com/lin-justin/sarcasm-detection/master/Sarcasm_Headlines_Dataset_v2.json"

# need to massage the text to put it into proper json format
with urllib.request.urlopen(url) as url_json:
    json_text = "["

    for line in url_json:
        line_string = line.decode()
#        print("got line with type: {}".format(type(line_string))
        json_text = json_text + line_string + ","

    json_text = json_text[:-1] + "]"
    file_data = json.loads(json_text)
    print("got data of type: {}".format(type(url_json)))
    print("got json data of type: {}".format(type(json_text)))


got data of type: <class 'http.client.HTTPResponse'>
got json data of type: <class 'str'>


In [4]:
# parse the json
sentence_list = []
label_list = []
url_list = []

for item in file_data:
    sentence_list.append(item['headline'])
    label_list.append(item['is_sarcastic'])
    url_list.append(item['article_link'])

print("got {} elements in my training data".format(len(sentence_list)))


got 28619 elements in my training data


In [58]:
def reverse_print(word_index):
    # reverse the word index
    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

    # print
    for i in range(0, 10):
        print("the word at position {} is: {}".format(i, reverse_word_index.get(i, "?")))


In [60]:
# tokenize the text data for the NN
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# tokenize
tokenizer = Tokenizer(oov_token = '<OOV>')
tokenizer.fit_on_texts(sentence_list)

# get the word index
word_index = tokenizer.word_index
print("got word index of size: {}".format(len(word_index)))

# add in space
word_index['<pad>'] = 0

reverse_print(word_index)


got word index of size: 30885
the word at position 0 is: <pad>
the word at position 1 is: <OOV>
the word at position 2 is: to
the word at position 3 is: of
the word at position 4 is: the
the word at position 5 is: in
the word at position 6 is: for
the word at position 7 is: a
the word at position 8 is: on
the word at position 9 is: and


In [49]:
# get the sentence word sequences
sequences = tokenizer.texts_to_sequences(sentence_list)

# pad tomake sure all row sequnces have the same length and print
padded_sequences = pad_sequences(sequences, value= word_index["<pad>"], padding = 'post')
print("the first sequence is: {}".format(padded_sequences[0]))
print("the shape of the padded seuqnces is: {} with type: {}".format(padded_sequences.shape, type(padded_sequences)))

the first sequence is: [16004   355  3167  7474  2644     3   661  1119     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
the shape of the padded seuqnces is: (28619, 152) w

In [50]:
# split into train and test sets
# specify training split percent
train_split = 0.85

# get the index for the split
train_index = int(train_split * padded_sequences.shape[0])
print("the train row index is: {}".format(train_index))



the train row index is: 24326


In [51]:
# split into train and test
feature_train = padded_sequences[0: train_index]
label_train = label_list[0: train_index]
feature_test = padded_sequences[train_index:]
label_test = label_list[train_index:]

print("split data into train of size {} and test of size {}".format(len(label_train), len(label_test)))

split data into train of size 24326 and test of size 4293


In [52]:
# now need to create a new training tokenizer fit only on the traim data
tokenizer_train = Tokenizer(oov_token='<OOV\>', num_words=10000)

# split the train and test senetences
sentence_train = sentence_list[:train_index]
sentence_test = sentence_list[train_index:]

# train this tokenizer on the training features
tokenizer_train.fit_on_texts(sentence_train)

train_word_index = tokenizer_train.word_index
print("the train sentences word index is of size {}".format(len(train_word_index)))


the train sentences word index is of size 28488


In [65]:
# now get the sequences for the train and test sentence lists
train_sequence = tokenizer_train.texts_to_sequences(sentence_train)
test_sequence = tokenizer_train.texts_to_sequences(sentence_test)

# pad (TODO: parameterize the constants)
train_sequence_padded = pad_sequences(train_sequence, padding='post', truncating='post', maxlen=120)
test_sequence_padded = pad_sequences(test_sequence, padding='post', truncating='post', maxlen=120)

# make sure the shapes match on the column dimension (needed for the NN)
print("the train padded sequence is of shape {} and the test padded sequence is of shape {}".format(train_sequence_padded.shape, test_sequence_padded.shape))

the train padded sequence is of shape (24326, 120) and the test padded sequence is of shape (4293, 120)


In [64]:
print("the word index is of type {} and size {}".format(type(train_word_index), len(word_index)))

iterator = iter(word_index.items())
for i in range(10):
    print(next(iterator))

reverse_print(train_word_index)
# print(train_word_index["<pad>"])


the word index is of type <class 'dict'> and size 30886
('<OOV>', 1)
('to', 2)
('of', 3)
('the', 4)
('in', 5)
('for', 6)
('a', 7)
('on', 8)
('and', 9)
('with', 10)
the word at position 0 is: ?
the word at position 1 is: <OOV\>
the word at position 2 is: to
the word at position 3 is: of
the word at position 4 is: the
the word at position 5 is: in
the word at position 6 is: for
the word at position 7 is: a
the word at position 8 is: on
the word at position 9 is: and


In [55]:
import tensorflow as tf 

# build the NN model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

print("the tf model summary is {}".format(model.summary()))

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d_6 ( (None, 16)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 16)                272       
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________
the tf model summary is None


In [71]:
# Need this block to get it to work with TensorFlow 2.x
import numpy as np
training_padded_np = np.array(train_sequence_padded)
training_labels_np = np.array(label_train)
testing_padded_np = np.array(test_sequence_padded)
testing_labels_np = np.array(label_test)

print("train sequence has type {} and shape {}".format(type(train_sequence_padded), train_sequence_padded.shape))
print("train sequence numpy has type {} and shape {}".format(type(training_padded_np), training_padded_np.shape))

print("train labels has type {} and shape {}".format(type(label_train), len(label_train)))
print("train labels numpy has type {} and shape {}".format(type(training_labels_np), training_labels_np.shape))



train sequence has type <class 'numpy.ndarray'> and shape (24326, 120)
train sequence numpy has type <class 'numpy.ndarray'> and shape (24326, 120)
train labels has type <class 'list'> and shape 24326
train labels numpy has type <class 'numpy.ndarray'> and shape (24326,)


In [56]:
print(" train sequence has type {} and length {}".format(type(train_sequence_padded), train_sequence_padded.shape))
print(train_sequence_padded[0])
print(" train label has type {} and length {}".format(type(label_train), len(label_train)))
print(label_train[1:10])


train sequence has type <class 'numpy.ndarray'> and length (24326, 120)
[   1  321 3400 6636 2414    3  662 1013    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]
 train label has type <class 'list'> and length 24326
[0, 0, 1, 1, 0, 0, 1, 1, 0]


In [75]:
# train the tf model
num_epochs = 20

model.fit(training_padded_np, training_labels_np, epochs= num_epochs, validation_data= (testing_padded_np, testing_labels_np), verbose= 1)
# model.fit(train_sequence_padded, training_labels_np, epochs= num_epochs, validation_data= (test_sequence_padded, testing_labels_np), verbose= 1)



Train on 24326 samples, validate on 4293 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fad399fa810>