In [1]:
#!pip install tensorflow==2.0.0b1


In [13]:
import json
import tensorflow as tf
from keras.optimizers import *

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.callbacks import ModelCheckpoint

print(tf.__version__)

2.0.0-beta0


In [15]:
vocab_size = 10000
embedding_dim = 16
max_length = 120
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 1990000
batch_size=32
epochs=50

version=4
download_data=False
load_weights=False
do_training=True
save_model=True
model_name="sarcasm_model"+"_v"+str(version)
model_file=model_name+".h5"

In [4]:
#-------------------
# Get Data
#-------------------

sentences = []
context = []
labels = []
urls = []

# sarcasm_data.json
if(download_data):
    !wget --no-check-certificate \
    https://storage.googleapis.com/laurencemoroney-blog.appspot.com/sarcasm.json \
    -O /tmp/sarcasm.json               
        
def clean_line(line):
    try:
        line.lower()
    except:
        print("failed to lower: "+str(line))            
    
    line = " ".join(line.split())
    line = line.strip()
    line = line.replace("[_-]", ' ')
    line = line.replace("\'", '')
    line = line.replace("at&amp;t", "at&t")
                        
    return line
        
def parseJsonToMemory(filename, labelField, commentField, contextField):
    print(f'Processing {filename}')
    with open("./data/sarcasm_data.json", 'r') as f:
        datastore = json.load(f)

        for item in datastore:
            sentences.append(item[commentField])
            labels.append(int(item[labelField]))
            if contextField == -1 :
                context.append("")
            else:
                context.append(row[contextField])
    print(f'Processed {filename}')
    print("  sentences len: "+str(len(sentences)))
    print("  labels len: "+str(len(labels)))
    print("  context len: "+str(len(context)))
    
def joinArray(arr):
    s = " "
    s = s.join(arr)
    return s

def parseJsonToMemory2(filename, labelField, commentField, contextField):
    print(f'Processing {filename}')
    with open("./data/sarcasm_data.json", 'r') as f:
        datastore = json.load(f)

        for item in datastore:
            sentences.append(item[commentField])
            labels.append(int(item[labelField]))
            context.append(joinArray(item[contextField]))
            
    print(f'Processed {filename}')
    print("  sentences len: "+str(len(sentences)))
    print("  labels len: "+str(len(labels)))
    print("  context len: "+str(len(context)))
  
import csv 
def parseCSVToMemory(filename, labelCol, commentCol, contextCol):
    print(f'Processing {filename}')
    with open(filename) as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            #print(f'\t label: {row[labelCol]} of {type(row[labelCol])} sentence: {row[commentCol]} context: {row[contextCol]}')
            try:
                if line_count == 0:
                    print(f'  Column names are {", ".join(row)}')
                    line_count += 1
                else:
                    
                    labels.append(int(row[labelCol]))
                    sentences.append(row[commentCol])
                    line_count += 1
                    if contextCol == -1 :
                        context.append("")
                    else:
                        context.append(row[contextCol])               
            except:
                print("error parsing label: " + row[labelCol] + " comment: " + row[commentCol])        
        print(f'Processed {filename} w/ {line_count} lines.')
        print("  sentences len: "+str(len(sentences)))
        print("  labels len: "+str(len(labels)))
        print("  context len: "+str(len(context)))



In [5]:
# add class data
parseJsonToMemory('./data/sarcasm_data.json', 'is_sarcastic', 'headline', -1)

#add kaggle data
parseCSVToMemory('./data/sarcasm_kaggle.csv', 0, 1, -1)

#add reddit data
parseCSVToMemory('./data/train-balanced-sarcasm.csv', 0, 1, 9)


#TODO: doesn't parse correctly
#parseJsonToMemory2('./data/sarcasm_data_mustard.json', "sarcasm", "utterance", "context") 

# shuffle

# combine x and y cols
def concat_cols(x,y):
    y = y.reshape(y.shape[0],1)
    return np.concatenate((x,y), axis=1)

# split x and y
def split_cols(a):
    x_res = a[:,0:a.shape[1]-1]
    y_res = a[:,a.shape[1]-1:a.shape[1]]
    return x_res, y_res

# shuffle rows of a
def shuffle(a, subset_size):
    subset = a[np.random.choice(a.shape[0], subset_size, replace=False), :]
    return subset


                  
print("done processing")

Processing ./data/sarcasm_data.json
Processed ./data/sarcasm_data.json
  sentences len: 26709
  labels len: 26709
  context len: 26709
Processing ./data/sarcasm_kaggle.csv
  Column names are label, comment
Processed ./data/sarcasm_kaggle.csv w/ 1010819 lines.
  sentences len: 1037527
  labels len: 1037527
  context len: 1037527
Processing ./data/train-balanced-sarcasm.csv
  Column names are label, comment, author, subreddit, score, ups, downs, date, created_utc, parent_comment
Processed ./data/train-balanced-sarcasm.csv w/ 1010827 lines.
  sentences len: 2048353
  labels len: 2048353
  context len: 2048353
done processing


In [6]:
training_sentences = sentences[0:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[0:training_size]
testing_labels = labels[training_size:]
print(testing_sentences[0:5])
print(testing_labels[0:5])

tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

def sentencesToPaddedSequences(sentences):
    sequences = tokenizer.texts_to_sequences(sentences)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
    return padded_sequences

training_padded = sentencesToPaddedSequences(training_sentences)
print("training_padded.shape: "+str(training_padded.shape))
print(str(training_padded))
print(type(training_padded))

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = sentencesToPaddedSequences(testing_sentences)
print("testing_padded.shape: "+str(testing_padded.shape))
print(type(testing_padded))

print("type of test labels: " + str(type(testing_labels)))
print("type of training labels: " + str(type(training_labels)))


["I think I'm ok with a very spartan environment, tbh.", 'Lawl', "I know people who didn't stick with MD/PhD programs (eg only ended up with a MD/MS) who see themselves as quitters or failures.", 'Today, I miss Ba.', 'Lol yeah he tends to do that sometimes.']
[0, 0, 0, 0, 0]
training_padded.shape: (1990000, 120)
[[2129    1  875 ...    0    0    0]
 [   2    1    1 ...    0    0    0]
 [ 813 1019    4 ...    0    0    0]
 ...
 [  31 1011  218 ...    0    0    0]
 [  13 2769    1 ...    0    0    0]
 [2834   20   83 ...    0    0    0]]
<class 'numpy.ndarray'>
testing_padded.shape: (58353, 120)
<class 'numpy.ndarray'>
type of test labels: <class 'list'>
type of training labels: <class 'list'>


In [7]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [8]:
if(load_weights):
    print("loading weights from: "+model_file)
    model.load_weights(model_file)
    print("weights: ")
    #print(str(model.get_weights()))
    print("loading weights complete!")

In [9]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
bidirectional (Bidirectional (None, 120, 256)          148480    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 735,745
Trainable params: 735,745
Non-trainable params: 0
_________________________________________________________________


In [None]:
checkpointer = ModelCheckpoint(filepath='./'+model_name+'-weights.hdf5', verbose=2, save_best_only=True)

if(do_training):
    history = model.fit(training_padded, 
                        training_labels, 
                        batch_size=batch_size, 
                        epochs=epochs, 
                        validation_data=(testing_padded, testing_labels),
                        callbacks=[checkpointer],
                        verbose=1)


Train on 1990000 samples, validate on 58353 samples
Epoch 1/50
  23040/1990000 [..............................] - ETA: 4:28:19 - loss: 0.6074 - accuracy: 0.6679

In [None]:
if(do_training):
    import matplotlib.pyplot as plt

    #print(str(history.history))
    def plot_graphs(history, string):
      plt.plot(history.history[string])
      plt.plot(history.history['val_'+string])
      plt.xlabel("Epochs")
      plt.ylabel(string)
      plt.legend([string, 'val_'+string])
      plt.show()

    plot_graphs(history, 'accuracy')
    plot_graphs(history, 'loss')

In [None]:
if(save_model):
    model.save(model_file)

In [None]:
# prediction

x_pred_sentences = ["that dress looks so cute on you", 
                    "jeans that make my butt look good", 
                    "phantom menace was the best movie ever",
                    "i am so over it",
                    "babies are ugly",
                    "you are so badass"]
x_padded_sequence = sentencesToPaddedSequences(x_pred_sentences)
pred = model.predict(x_padded_sequence)
print(pred)