# **Load Dataset**

# **REMOVE UNWANTED SYMBOLS**

In [None]:
import pandas as pd

# The file holds one JSON object per line, hence lines=True.
DATASET_PATH = "Sarcasm_Headlines_Dataset.json"
df = pd.read_json(DATASET_PATH, lines=True)
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [None]:
import string
from string import digits, punctuation

# Build the deletion table once: it does not depend on the headline, and a
# single translate pass that drops punctuation and digits together produces
# the same text as two consecutive passes.
# Characters are deleted, not replaced by a space, so removed numbers can
# leave a double space behind (handled later by str.split()).
strip_table = str.maketrans('', '', punctuation + digits)
hl_cleansed = [hl.translate(strip_table) for hl in df['headline']]

# View comparison
print('Original texts :')
print(df['headline'][37])
print('\nAfter cleansed :')
print(hl_cleansed[37])

Original texts :
'moana' sails straight to the top of the box office with massive $81.1 million opening

After cleansed :
moana sails straight to the top of the box office with massive  million opening


# **TOKENS**

In [None]:
# Tokenization process
# Whitespace tokenization: str.split() without arguments also collapses
# consecutive spaces, so gaps left by removed symbols yield no empty tokens.
hl_tokens = [headline.split() for headline in hl_cleansed]

# Show one headline before and after splitting
index = 100
print('Before tokenization :')
print(hl_cleansed[index])
print('\nAfter tokenization :')
print(hl_tokens[index])

Before tokenization :
demi lovato drops emotional nightingale music vid

After tokenization :
['demi', 'lovato', 'drops', 'emotional', 'nightingale', 'music', 'vid']


# **LEMMATIZER**

In [None]:
# Lemmatize with appropriate POS Tag
# Credit : www.machinelearningplus.com/nlp/lemmatization-examples-python/

import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Resources needed below: the perceptron tagger backs nltk.pos_tag and the
# WordNet corpus backs WordNetLemmatizer.
for resource in ('averaged_perceptron_tagger', 'wordnet'):
    nltk.download(resource)

from functools import lru_cache


@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Map a word's POS tag to the constant that lemmatize() accepts.

    The word is tagged on its own (a one-element token list), so the result
    depends only on the word itself. That makes it safe to memoize: each
    distinct word is sent to the tagger once, however often it occurs in
    the corpus.

    Args:
        word: a single token (hashable, i.e. a str).

    Returns:
        wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV according to
        the first letter of the tag; wordnet.NOUN for any other tag.
    """
    # pos_tag returns [(word, tag)]; only the tag's first letter is needed.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# One WordNet lemmatizer instance shared by the whole corpus
lemmatizer = WordNetLemmatizer()

# Lemmatize every token of every headline, each with its own POS tag
hl_lemmatized = [
    [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    for tokens in hl_tokens
]

# Example comparison
word_1 = ['skyrim','dragons', 'are', 'having', 'parties']
word_2 = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in word_1]
print('Before lemmatization :\t',word_1)
print('After lemmatization :\t',word_2)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Before lemmatization :	 ['skyrim', 'dragons', 'are', 'having', 'parties']
After lemmatization :	 ['skyrim', 'dragon', 'be', 'have', 'party']


# **TEXT TO VECTOR**

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Vectorize and convert text into sequences
max_features = 2000  # passed to Tokenizer as num_words (vocabulary cap)

# NOTE(review): max() on a list of token lists compares them lexicographically,
# so this is the length of the alphabetically-last headline, not of the longest
# one. pad_sequences below therefore cuts every headline longer than max_token.
# Using max(len(t) for t in hl_lemmatized) instead changes the model's input
# length, so the model and any later padding of new inputs must use that same
# value.
last_in_order = max(hl_lemmatized)
max_token = len(last_in_order)

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(hl_lemmatized)
sequences = tokenizer.texts_to_sequences(hl_lemmatized)
X = pad_sequences(sequences, maxlen=max_token)

# Show one headline at each stage of the conversion
index = 10
for caption, shown in (
    ('Before :', (hl_lemmatized[index], '\n')),
    ('After sequences convertion :', (sequences[index], '\n')),
    ('After padding :', (X[index],)),
):
    print(caption)
    print(*shown)

Using TensorFlow backend.


Before :
['airline', 'passenger', 'tackle', 'man', 'who', 'rush', 'cockpit', 'in', 'bomb', 'threat'] 

After sequences convertion :
[840, 1011, 1987, 13, 36, 1241, 4, 1689, 629] 

After padding :
[   0    0    0    0    0  840 1011 1987   13   36 1241    4 1689  629]


# **MODEL**

In [None]:
from sklearn.model_selection import train_test_split

# Labels stacked into an (n, 1) column, one row per padded headline in X
Y = np.vstack(df['is_sarcastic'].values)

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

In [None]:
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 64

# Embedding -> single LSTM layer -> one sigmoid unit (binary classifier)
model = Sequential([
    Embedding(max_features, embed_dim, input_length=max_token),
    LSTM(96, dropout=0.2, recurrent_dropout=0.2, activation='relu'),
    # Optional dense head, currently disabled:
    # Dense(128),
    # Activation('relu'),
    # Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# **TRAIN**

In [None]:
# Training hyper-parameters
epoch, batch_size = 10, 128

# verbose=2 logs one summary line per epoch
model.fit(x=X_train, y=Y_train, batch_size=batch_size, epochs=epoch, verbose=2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
 - 6s - loss: 0.5379 - accuracy: 0.7171
Epoch 2/10
 - 5s - loss: 0.3767 - accuracy: 0.8401
Epoch 3/10
 - 5s - loss: 0.3372 - accuracy: 0.8532
Epoch 4/10
 - 5s - loss: 0.3182 - accuracy: 0.8638
Epoch 5/10
 - 5s - loss: 0.3089 - accuracy: 0.8675
Epoch 6/10
 - 5s - loss: 0.2924 - accuracy: 0.8757
Epoch 7/10
 - 5s - loss: 0.2818 - accuracy: 0.8792
Epoch 8/10
 - 5s - loss: 0.2678 - accuracy: 0.8851
Epoch 9/10
 - 5s - loss: 0.2608 - accuracy: 0.8861
Epoch 10/10
 - 5s - loss: 0.2491 - accuracy: 0.8946


<keras.callbacks.callbacks.History at 0x7f68e53b8748>

# **TEST**

In [None]:
# evaluate() returns the loss followed by the compiled metrics (accuracy here)
eval_scores = model.evaluate(X_test, Y_test, verbose=2)
loss, acc = eval_scores

print("Overall scores")
for label, value in (("Loss\t\t: ", loss), ("Accuracy\t: ", acc)):
    print(label, round(value, 3))

Overall scores
Loss		:  0.409
Accuracy	:  0.834


# ACCURACY SCORE

In [None]:
# Per-class hit counts on the test set.
# The whole test set is predicted in one batched call instead of one
# model.predict() call per row; the rounding rule (np.around) and the
# resulting counters are the same as in the row-by-row version.
pred_labels = np.around(model.predict(X_test, verbose=0)).ravel()
true_labels = np.around(Y_test).ravel()

is_negative = true_labels == 0          # label 0 -> non-sarcastic
is_hit = pred_labels == true_labels     # prediction matches the label

# Plain ints so the counters behave like the original loop counters.
neg_cnt = int(is_negative.sum())
pos_cnt = int((~is_negative).sum())
neg_correct = int((is_hit & is_negative).sum())
pos_correct = int((is_hit & ~is_negative).sum())

In [None]:
# Share of correctly classified rows per class, as a percentage (3 decimals)
sarcasm_pct = round(pos_correct/pos_cnt*100, 3)
non_sarcasm_pct = round(neg_correct/neg_cnt*100, 3)

print("Sarcasm accuracy\t: ", sarcasm_pct,"%")
print("Non-sarcasm accuracy\t: ", non_sarcasm_pct,"%")

Sarcasm accuracy	:  84.477 %
Non-sarcasm accuracy	:  82.606 %


# **INPUT PUNCTUATION**

In [None]:
"""input2 = ["world crowd applauds for 'dolphin' playfully 98979801 spraying blood from blowhole"]  #sarcastic
input3 = ["former versace store clerk sues over secret 'black code' for minority shoppers"]      #not
"""
inputt=["My name is Akilesh"]

input2=["It’s okay if you don’t like me. Not everyone has good taste."]                          #sarcastic

# Lower-case before cleaning: the dataset headlines are lower-case (see the
# df.head() sample), and the lemmatizer applied afterwards is case-sensitive,
# so capitalised user input would otherwise be lemmatized differently from the
# training text.
# A single translate pass removes punctuation and digits together; every
# entry of inputt is cleaned, not only the first one.
strip_table = str.maketrans('', '', punctuation + digits)
input_cleansed = [text.lower().translate(strip_table) for text in inputt]
print(input_cleansed)

['My name is Akilesh']


# **INPUT TOKEN**

In [None]:
# Split each cleansed input on whitespace.
# The loop variable is deliberately not called `input`: a module-level loop
# would rebind that name and shadow the builtin input() for the rest of the
# kernel session. A comprehension variable does not leak at all.
input_tokens = [text.split() for text in input_cleansed]

# View Comparison
index = 0
print('Before tokenization :')
print(input_cleansed[index])
print('\nAfter tokenization :')
print(input_tokens[index])

Before tokenization :
My name is Akilesh

After tokenization :
['My', 'name', 'is', 'Akilesh']


# INPUT LEMMA 

In [None]:
# Init Lemmatizer
lemmatizer = WordNetLemmatizer()

input_lemmatized = []
for tokens in input_tokens:
    lemm = [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in tokens]
    input_lemmatized.append(lemm)

print(input_lemmatized)

[['My', 'name', 'be', 'Akilesh']]


# **INPUT VECTOR**

In [None]:
# Vectorize and convert text into sequences

# Reuse the tokenizer already fitted on the training headlines and the
# max_token value the model was built with. Building and re-fitting a second
# Tokenizer here repeats the work on the whole corpus and duplicates settings
# that must never drift from the ones used for training.
# `sequences` and `X` are rebound on purpose: the prediction cell reads X.
sequences = tokenizer.texts_to_sequences(input_lemmatized)
X = pad_sequences(sequences, maxlen=max_token)


print(sequences)
print(X)

[[85, 178, 5]]
[[  0   0   0   0   0   0   0   0   0   0   0  85 178   5]]


# **INPUT PREDICTION**

In [None]:
# One sigmoid score per input row; above 0.5 is reported as sarcastic.
res=model.predict(X)
print(res)
# Compare row by row: `if res > 0.5` on the whole array is only valid when
# exactly one input was supplied and raises for a batch of several inputs.
for score in res.ravel():
  if score>0.5:
    print("Sarcastic")
  else:
    print("Not Sarcastic")

[[0.10508372]]
Not Sarcastic
