In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import keras
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding,Bidirectional,GRU
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
%matplotlib inline
import os
import re

### Analyzing Data

In [None]:
# Remove the column-width limit so that full headlines are displayed
pd.set_option('display.max_colwidth', None)

In [None]:
df = pd.read_json("data/Sarcasm_Headlines_Dataset.json", lines=True)
df.head()

In [None]:
df.info()

- Target column - `is_sarcastic`, having values 0 and 1
- Headline column carries the headline of the article; its text holds the important words that distinguish sarcastic headlines from serious ones
- Article_link column carries the link to the article; the link doesn't contain much additional information compared to the headline, mostly just repeating its words to form the URL
- Total 26,709 data points available

In [None]:
df.isna().sum()

- No null values found

- Removing the article link column as it's not adding value to the dataframe

In [None]:
del df['article_link']

In [None]:
df.info()

#### Preprocessing and Visualization

In [None]:
# Bar chart of the class balance. The column is passed by keyword because recent
# seaborn releases treat the first positional argument as `data`, not as `x`.
sns.countplot(x='is_sarcastic', data=df);

In [None]:
df.is_sarcastic.value_counts()

- We can notice we have more non-sarcastic data points than sarcastic ones
- The difference is still not very significant, as we have a large data set

In [None]:
df.sample(10)

###### Cleaning Headline column

In [None]:
# Normalise the headlines: lower-case them, then drop every character that is not
# a letter, digit or whitespace.
# The character class is written `a-z` on purpose: the range `A-z` would also match
# the ASCII symbols between 'Z' and 'a' ([ \ ] ^ _ `) and leave them in the text.
df['headline'] = df['headline'].str.lower()
df['headline'] = df['headline'].str.replace(r'[^a-z0-9\s]', '', regex=True)

###### Setting up tokenizer and max words we will be using to speed up the model building process

In [None]:
# NOTE: an earlier row-by-row loop that replaced 'rt' with a space was dropped here.
# df.iterrows() yields copies of each row, so the loop never modified `df`; had it
# worked, it would also have corrupted ordinary words containing "rt".

# Cap on the tokenizer vocabulary: only the most frequent words are kept.
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['headline'].values)

# Convert each headline to a list of word indices, then pad to a common length
# (the length of the longest sequence, since no maxlen is given).
X = tokenizer.texts_to_sequences(df['headline'].values)
X = pad_sequences(X)

###### Splitting data into train and test sets

In [None]:
# One-hot encode the binary target so it matches the 2-unit softmax output of the model.
# NOTE(review): the prediction cells treat column 0 as "non-sarcastic" and column 1 as
# "sarcastic" — this presumes get_dummies orders the columns 0, 1; confirm.
Y = pd.get_dummies(df['is_sarcastic']).values
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

##### LSTM + RNN model

In [None]:
# Size of the learned word embeddings and of the LSTM hidden state.
embed_dim = 128
lstm_out = 196

# Embedding -> spatial dropout -> single LSTM -> 2-way softmax (one unit per class).
model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
# categorical_crossentropy pairs with the one-hot encoded targets.
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

In [None]:
batch_size = 32
history = model.fit(X_train, Y_train, epochs = 25, batch_size=batch_size, verbose = 2)

In [None]:
# Carve the last `validation_size` rows off the test split as a separate validation
# slice; the remaining rows stay in X_test / Y_test and are used for evaluation.
# NOTE(review): this cell rebinds X_test / Y_test, so running it more than once
# shrinks the test set again each time.
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
# evaluate() returns the loss followed by the compiled metrics (here: accuracy).
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

In [None]:
# Training curves. Only the training metrics are available because fit() was run
# without validation data, so the legend lists a single 'train' series.
# Each figure is saved BEFORE plt.show(): show() releases the current figure, so
# calling savefig() afterwards would write an empty image.

# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.savefig('model_accuracy.png')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train'], loc='upper left')
plt.savefig('model_loss.png')
plt.show()

In [None]:
# Per-class accuracy on the validation slice.
# All rows are scored in one batched predict() call instead of one call per sample,
# which gives the same class decisions at a fraction of the cost.
val_pred = np.argmax(model.predict(X_validate, verbose = 0), axis=1)
val_true = np.argmax(Y_validate, axis=1)

# Class 0 = non-sarcastic ("neg"), any other class = sarcastic ("pos").
is_neg = val_true == 0
is_hit = val_pred == val_true

neg_cnt = int(np.sum(is_neg))
pos_cnt = int(np.sum(~is_neg))
neg_correct = int(np.sum(is_hit & is_neg))
pos_correct = int(np.sum(is_hit & ~is_neg))

print("Sarcasm_acc", pos_correct/pos_cnt*100, "%")
print("Non-Sarcasm_acc", neg_correct/neg_cnt*100, "%")

In [None]:
# Score a single hand-written headline with the LSTM model.
headline = ['Chowkidar hi chor hai']
headline = tokenizer.texts_to_sequences(headline)
# Pad to the exact sequence length the model was built with (X.shape[1]) rather
# than a hard-coded number, so the cell keeps working if the data changes.
headline = pad_sequences(headline, maxlen=X.shape[1], dtype='int32', value=0)

sentiment = model.predict(headline,batch_size=1,verbose = 2)[0]
# Index 0 of the softmax output is the non-sarcastic class, index 1 the sarcastic one.
if(np.argmax(sentiment) == 0):
    print("Non-sarcastic")
elif (np.argmax(sentiment) == 1):
    print("Sarcasm")

In [None]:
# Score a second hand-written headline with the LSTM model.
headline = ['unaware sons eats lipstick as lunch']
headline = tokenizer.texts_to_sequences(headline)
# Pad to the exact sequence length the model was built with (X.shape[1]) rather
# than a hard-coded number, so the cell keeps working if the data changes.
headline = pad_sequences(headline, maxlen=X.shape[1], dtype='int32', value=0)

sentiment = model.predict(headline,batch_size=1,verbose = 2)[0]
# Index 0 of the softmax output is the non-sarcastic class, index 1 the sarcastic one.
if(np.argmax(sentiment) == 0):
    print("Non-sarcastic")
elif (np.argmax(sentiment) == 1):
    print("Sarcasm")

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("model2.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model2.h5")

##### Some more data preparation steps

#### Removing stop words

In [None]:
from nltk.corpus import stopwords
from string import punctuation
import re,string,unicodedata

stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

In [None]:
#Removing the stopwords from text
def remove_stopwords(text):
    """Return ``text`` with every stop-word token removed.

    The text is split on whitespace; a token is dropped when its stripped,
    lower-cased form is found in the module-level ``stop`` collection. The
    surviving tokens keep their original casing and are re-joined with single
    spaces.
    """
    kept_tokens = [
        token.strip()
        for token in text.split()
        if token.strip().lower() not in stop
    ]
    return " ".join(kept_tokens)

In [None]:
# Drop stop words, then normalise: lower-case and keep only letters, digits and
# whitespace. The class uses `a-z` (not `A-z`, which would also match the ASCII
# symbols [ \ ] ^ _ ` and leave them in the text).
df['headline'] = df['headline'].apply(remove_stopwords)
df['headline'] = df['headline'].str.lower()
df['headline'] = df['headline'].str.replace(r'[^a-z0-9\s]', '', regex=True)

In [None]:
# Whitespace-tokenise every headline: one list of tokens per headline.
words = [headline_text.split() for headline_text in df.headline.values]
# Peek at the first few token lists.
words[:5]

In [None]:
!pip install gensim

In [None]:
import gensim
#Dimension of vectors we are generating
EMBEDDING_DIM = 200

#Creating Word Vectors by Word2Vec Method (takes time...)
w2v_model = gensim.models.Word2Vec(sentences = words , vector_size=EMBEDDING_DIM , window = 5 , min_count = 1)

In [None]:
#vocab size
#https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4
vocab_len = len(w2v_model.wv)
print(vocab_len)

In [None]:
from keras.preprocessing import text, sequence

#tokenizer = text.Tokenizer(num_words=max_fatures, split=' ') #keeping same as earier will update later
tokenizer = text.Tokenizer(num_words=28000, split=' ') 
tokenizer.fit_on_texts(words)
tokenized_train = tokenizer.texts_to_sequences(words)
x = sequence.pad_sequences(tokenized_train, maxlen = 20)

In [None]:
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Function to create weight matrix from word2vec gensim model
def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix from a trained Word2Vec model.

    Parameters
    ----------
    model : object exposing ``wv`` (word -> vector lookup with ``vector_size``)
        The trained gensim Word2Vec model.
    vocab : dict
        Mapping of word -> integer row index (e.g. a Keras ``word_index``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, vector_size)``. Row ``i`` holds the
        vector of the word mapped to index ``i``. Row 0, and the row of any word
        the Word2Vec model does not know, is left as zeros.
    """
    word_vectors = model.wv
    # One extra row: index 0 is not assigned to any word in `vocab`.
    weight_matrix = np.zeros((len(vocab) + 1, word_vectors.vector_size))
    for word, i in vocab.items():
        # Skip out-of-vocabulary words instead of failing with a KeyError.
        if word in word_vectors:
            weight_matrix[i] = word_vectors[word]
    return weight_matrix

In [None]:

embedding_vectors = get_weight_matrix(w2v_model, tokenizer.word_index)

In [None]:
#Defining Neural Network
model = Sequential()
# Embedding layer initialised with the Word2Vec weight matrix. trainable=True means
# the vectors are fine-tuned during training (they are NOT frozen).
model.add(Embedding(vocab_size, output_dim=EMBEDDING_DIM, weights=[embedding_vectors], input_length=20, trainable=True))
# Bidirectional LSTM that returns the full sequence, feeding a bidirectional GRU
# that reduces it to a single vector.
model.add(Bidirectional(LSTM(units=128 , recurrent_dropout = 0.3 , dropout = 0.3,return_sequences = True)))
model.add(Bidirectional(GRU(units=32 , recurrent_dropout = 0.1 , dropout = 0.1)))
# Single sigmoid unit: probability that the headline is sarcastic.
model.add(Dense(1, activation='sigmoid'))
# `learning_rate` is the supported argument name (`lr` is a deprecated alias) and
# matches the Adam(...) calls further down in this notebook.
model.compile(optimizer=keras.optimizers.Adam(learning_rate = 0.01), loss='binary_crossentropy', metrics=['acc'])

# The matrix has been copied into the layer; release the reference to free memory.
del embedding_vectors

In [None]:
model.summary()

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, df.is_sarcastic , test_size = 0.3 , random_state = 0)

In [None]:
history = model.fit(x_train, y_train, batch_size = 128 , validation_data = (x_test,y_test) , epochs = 25)

In [None]:
print("Accuracy of the model on Training Data is - " , model.evaluate(x_train,y_train)[1]*100)
print("Accuracy of the model on Testing Data is - " , model.evaluate(x_test,y_test)[1]*100)

In [None]:
import gc
gc.collect()

In [None]:
from sklearn.model_selection import train_test_split

train_sentences, val_sentences, train_labels, val_labels = train_test_split(df['headline'], df['is_sarcastic'], test_size=0.2, random_state=0)

print(train_sentences.shape)
print(val_sentences.shape)
print(train_labels.shape)
print(val_labels.shape)

In [None]:
# Tokenize and pad
vocab_size = 10000
oov_token = '<00V>'
max_length = 120
padding_type = 'post'
trunc_type = 'post'
embedding_dim = 16
num_epochs = 10

In [None]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

val_sequences = tokenizer.texts_to_sequences(val_sentences)
val_padded = pad_sequences(val_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

#### Creating multiple models

#### Dense layer with GlobalAveragePooling

In [None]:
import tensorflow as tf
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
model.summary()
history = model.fit(train_padded, 
                    train_labels, 
                    validation_data=(val_padded, val_labels), 
                    epochs=num_epochs, 
                    verbose=2)

#### Dense layer with Flatten

In [None]:
model2 = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model2.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
model2.summary()
history_flatten = model2.fit(train_padded, 
                    train_labels, 
                    validation_data=(val_padded, val_labels), 
                    epochs=num_epochs, 
                    verbose=2)

#### Single Bidirectional LSTM layer 

In [None]:
model_lstm = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_lstm.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
model_lstm.summary()
history_lstm = model_lstm.fit(train_padded, 
                    train_labels, 
                    validation_data=(val_padded, val_labels), 
                    epochs=num_epochs, 
                    verbose=2)

#### Multi layer bidirectional lstm

In [None]:
model_mul_lstm = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_mul_lstm.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])
model_mul_lstm.summary()
history_mul_lstm = model_mul_lstm.fit(train_padded, 
                    train_labels, 
                    validation_data=(val_padded, val_labels), 
                    epochs=num_epochs, 
                    verbose=2)

#### Data Prep with GloVe 200d

In [None]:
df = pd.read_json("data/Sarcasm_Headlines_Dataset.json", lines=True)

In [None]:
import re

def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The input is converted with ``str()`` first, so non-string values are
    accepted. Rules are applied in order: the two irregular forms ("won't",
    "can't") come before the generic suffix rules so that they are not caught
    by the broader "n't" pattern. Matching is case-sensitive, and "'s" is
    always expanded to " is" (possessives included).

    Returns the expanded string.
    """
    # (pattern, replacement) pairs; the order is significant.
    rules = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = str(phrase)
    for pattern, replacement in rules:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

In [None]:
import spacy
#https://spacy.io/usage/models
nlp = spacy.load('en_core_web_sm')
def preprocessing(text):
  """Clean a single piece of text and return its lemmatised content words.

  Steps, in order: strip '#', expand contractions via ``decontracted``, remove
  e-mail-like tokens and http(s) URLs, lower-case, replace every non-letter
  with a space, then run the spaCy pipeline ``nlp`` and keep the lemma of each
  token that is not a stop word and is longer than two characters.

  Parameters: ``text`` - the string to clean.
  Returns: the kept lemmas joined by single spaces (may be an empty string).
  """
  text = text.replace('#','')
  text = decontracted(text)
  # Drop anything that looks like an e-mail address / @mention.
  text = re.sub(r'\S*@\S*\s?','',text)
  # Drop http/https URLs.
  text = re.sub(r'http[s]?:(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','',text)

  # Keep letters only. The text is lower-cased first, so `a-z` is sufficient; the
  # range `A-z` would also let the ASCII symbols [ \ ] ^ _ ` through.
  text = re.sub(r'[^a-z]', ' ', text.lower())

  doc = nlp(text)
  lemmas = [t.lemma_ for t in doc if not t.is_stop and len(t) > 2]
  return ' '.join(lemmas).strip()

In [None]:
#df.text = df.headline.apply(lambda x : preprocessing(x))

In [None]:

# Normalise the freshly reloaded headlines: lower-case, then drop everything except
# letters, digits and whitespace. The class uses `a-z` (not `A-z`, which would also
# match the ASCII symbols [ \ ] ^ _ ` and leave them in the text).
df['headline'] = df['headline'].str.lower()
df['headline'] = df['headline'].str.replace(r'[^a-z0-9\s]', '', regex=True)


In [None]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
stop=set(stopwords.words('english'))
def create_corpus(df):
    """Tokenise every headline in ``df['headline']`` into a list of clean words.

    Each headline is split with ``word_tokenize``; a token is kept only if it is
    purely alphabetic and its lower-cased form is not in the module-level
    ``stop`` set. Kept tokens are lower-cased.

    Returns a list with one list of words per headline, in row order.
    """
    corpus = []
    for tweet in tqdm(df['headline']):
        # Compare the lower-cased token against the stop list so capitalised stop
        # words ("The", "And", ...) are filtered as well.
        words = [
            word.lower()
            for word in word_tokenize(tweet)
            if word.isalpha() and word.lower() not in stop
        ]
        corpus.append(words)
    return corpus

In [None]:
corpus=create_corpus(df)

In [None]:
import numpy as np
# Load the pre-trained GloVe vectors into a {word: float32 vector} lookup.
# Each line of the file is: <word> <v1> <v2> ... (whitespace separated).
embedding_dict = {}
with open('data/glove.6B.200d.txt', 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vectors = np.asarray(values[1:], 'float32')
        embedding_dict[word] = vectors
# No explicit f.close() is needed: the `with` block closes the file on exit.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
MAX_LEN=40
tokenizer_obj=Tokenizer()
tokenizer_obj.fit_on_texts(corpus)
sequences=tokenizer_obj.texts_to_sequences(corpus)

tweet_pad=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

In [None]:
word_index=tokenizer_obj.word_index
print('Number of unique words:',len(word_index))

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer index i. Row 0 and rows of words missing from GloVe stay all-zero.
# num_words is len(word_index) + 1, so every tokenizer index is a valid row and no
# bounds check is needed inside the loop.
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 200))

for word, i in tqdm(word_index.items()):
    emb_vec = embedding_dict.get(word)
    if emb_vec is not None:
        embedding_matrix[i] = emb_vec


#### Single layer LSTM

In [None]:
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense,SpatialDropout1D,Dropout
from keras.initializers import Constant
from keras.optimizers import Adam
model=Sequential()

embedding=Embedding(num_words,200,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64,dropout=0.2, recurrent_dropout=0.2))


model.add(Dense(1, activation='sigmoid'))



optimzer=Adam(learning_rate=0.001)

model.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])
model.summary()

In [None]:
from keras.callbacks import ModelCheckpoint,EarlyStopping
filepath="weights-model.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
es = EarlyStopping(monitor='val_accuracy', mode='max', verbose=1, patience=30)
callbacks_list = [checkpoint,es]

In [None]:
tweet=df.iloc[:,:]
X=tweet_pad[:tweet.shape[0]]

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the padded sequences and labels. A fixed random_state makes the
# split (and therefore the reported metrics) reproducible across runs.
X_train,X_test,y_train,y_test=train_test_split(X,tweet['is_sarcastic'].values,test_size=0.2,random_state=42)
print('Shape of train',X_train.shape)
print("Shape of Validation ",X_test.shape)

In [None]:
# Train the GloVe-initialised LSTM. `callbacks_list` (checkpointing of the best
# val_accuracy + early stopping, defined in an earlier cell) is passed explicitly;
# without it those callbacks are created but never run.
history=model.fit(X_train,y_train,batch_size=100,epochs=25,validation_data=(X_test,y_test),callbacks=callbacks_list,verbose=1)

In [None]:
#history=model.fit(X_train,y_train,batch_size=100,epochs=100,validation_data=(X_test,y_test),verbose=1)

In [None]:
#headline = ['\'nice to meet you,\' coworkers tell new employee they\'ve studied online for hours']
#headline = pd.DataFrame(headline)
#headline = preprocessing(headline)

headline = ['\'nice to meet you,\' coworkers tell new employee they\'ve studied online for hours']
sequences=tokenizer_obj.texts_to_sequences(headline)

headline=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

sentiment = model.predict(headline,batch_size=1,verbose = 2)[0]
if(sentiment < 0.45):
    print("Non-sarcastic")
else:
    print("Sarcasm")

In [None]:
sentiment

In [None]:
#headline = ['\'nice to meet you,\' coworkers tell new employee they\'ve studied online for hours']
#headline = pd.DataFrame(headline)
#headline = preprocessing(headline)

headline = ['dog is smoking hot']
sequences=tokenizer_obj.texts_to_sequences(headline)

headline=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

sentiment = model.predict(headline,batch_size=1,verbose = 2)[0]
if(sentiment < 0.45):
    print("Non-sarcastic")
else:
    print("Sarcasm")

In [None]:
sentiment

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Persist the trained network with Keras' own serializer.
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases, and
# pickling a Keras model is not a supported way to store it. model.save() writes
# the architecture, weights and optimizer state to a single HDF5 file.
model.save('model_embd_2.h5')

#### Multi layer bidirectional lstm

In [None]:
model2=Sequential()

embedding=Embedding(num_words,200,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model2.add(embedding)
model2.add(SpatialDropout1D(0.2))
model2.add(Bidirectional(LSTM(64,dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model2.add(Bidirectional(LSTM(32,dropout=0.2, recurrent_dropout=0.2)))
#model2.add(Dense(24, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))



optimzer=Adam(learning_rate=0.001)

model2.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])
model2.summary()

In [None]:
history=model2.fit(X_train,y_train,batch_size=100,epochs=25,validation_data=(X_test,y_test),verbose=1)

In [None]:
model2=Sequential()

embedding=Embedding(num_words,200,embeddings_initializer=Constant(embedding_matrix),
                   input_length=MAX_LEN,trainable=False)

model2.add(embedding)
model2.add(SpatialDropout1D(0.5))
#model2.add(Bidirectional(LSTM(64,dropout=0.2, recurrent_dropout=0.2, return_sequences=True)))
model2.add(Bidirectional(LSTM(64,dropout=0.2, recurrent_dropout=0.2)))
#model2.add(Dense(128, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))



optimzer=Adam(learning_rate=0.001)

model2.compile(loss='binary_crossentropy',optimizer=optimzer,metrics=['accuracy'])
model2.summary()

In [None]:
history=model2.fit(X_train,y_train,batch_size=100,epochs=25,validation_data=(X_test,y_test),verbose=1)

In [None]:
# Persist the trained network with Keras' own serializer.
# `sklearn.externals.joblib` no longer exists in current scikit-learn releases, and
# pickling a Keras model is not a supported way to store it. model.save() writes
# the architecture, weights and optimizer state to a single HDF5 file.
model2.save('model_embd_3.h5')

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
headline = 'nice to meet you, coworkers tell new employee they\'ve studied online for hours'
# preprocessing() works on a plain string. Its result is wrapped in a list because
# texts_to_sequences() expects a list of texts; a bare string would be iterated
# character by character.
headline = [preprocessing(headline)]

sequences=tokenizer_obj.texts_to_sequences(headline)

headline=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

sentiment = model2.predict(headline,batch_size=1,verbose = 2)[0]
# NOTE(review): the 0.1 cut-off differs from the 0.45 used for the earlier model —
# confirm it is intentional.
if(sentiment < 0.1):
    print("Non-sarcastic")
else:
    print("Sarcasm")

In [None]:
sentiment

In [None]:
headline = 'Silence is golden. Duct tape is silver.'
# preprocessing() works on a plain string. Its result is wrapped in a list because
# texts_to_sequences() expects a list of texts; a bare string would be iterated
# character by character.
headline = [preprocessing(headline)]

sequences=tokenizer_obj.texts_to_sequences(headline)

headline=pad_sequences(sequences,maxlen=MAX_LEN,truncating='post',padding='post')

sentiment = model2.predict(headline,batch_size=1,verbose = 2)[0]
# NOTE(review): the 0.1 cut-off differs from the 0.45 used for the earlier model —
# confirm it is intentional.
if(sentiment < 0.1):
    print("Non-sarcastic")
else:
    print("Sarcasm")

In [None]:
sentiment

- With multiple models and results we can observe that increasing the number of layers does not necessarily increase the predictive capacity, and can sometimes decrease it if the model is not properly fine-tuned.
- We were able to achieve close to 83% accuracy on the test sets.

###### Visualizing the words in the json

In [None]:
!pip install stylecloud

In [None]:
import stylecloud

In [None]:
stylecloud.gen_stylecloud(file_path='data/Sarcasm_Headlines_Dataset.json',icon_name='fas fa-apple-alt')

In [None]:
from PIL import Image

Image.open('stylecloud.png')