In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pickle
from collections import Counter
from transformers import BertTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
  
from sklearn.metrics import classification_report, confusion_matrix 

In [2]:
import pandas as pd
from sklearn.model_selection import KFold

# Read the data
df = pd.read_csv('Sarcasm data.txt', sep='\t', header=None, usecols=[0, 1])
df.columns = ['text', 'category']

# Drop rows with missing values and empty text
df = df.dropna()
df = df[df['text'] != '']

In [3]:
df.shape

(5250, 2)

In [4]:
# Split the data using KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(df['text']):
    break

train_df = df.iloc[train_index]

# Further split the test set into validation and test sets using KFold
kf2 = KFold(n_splits=2, shuffle=True, random_state=42)
for val_index, test_index in kf2.split(df.iloc[test_index]['text']):
    break

val_df = df.iloc[val_index]
test_df = df.iloc[test_index]

# Display the first few rows of each split
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())

Train Set:
                                                text category
0          Triple Talaq par Burbak Kuchh nahi bolega       NO
1  Batao ye uss site pr se akki sir ke verdict ni...      YES
2  Hindu baheno par julam bardas nahi hoga @Tripl...       NO
3  Naa bhai.. aisa nhi hai.. mere handle karne se...       NO
4  #RememberingRajiv aaj agar musalman auraten tr...       NO

Validation Set:
                                                 text category
1   Batao ye uss site pr se akki sir ke verdict ni...      YES
4   #RememberingRajiv aaj agar musalman auraten tr...       NO
8   Bachcho ki death par politics ke bajay unke li...       NO
11  #Bollywood @amitdey10510709   Bhaag Milkha Bhaag!       NO
13  Bhai kuchh bhi karna iss @SimplySajidK ke saat...       NO

Test Set:
                                                text category
0          Triple Talaq par Burbak Kuchh nahi bolega       NO
2  Hindu baheno par julam bardas nahi hoga @Tripl...       NO
3  Naa bhai.. aisa nhi ha

In [5]:
import pandas as pd
import re

# Define the cleaning functions
def clean_tweets(text):
    text = text.lower()
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'\d+', '', text)
    return text.strip()

def remove_html(text):
    text = text.replace("\n", " ")
    pattern = re.compile('<.*?>')  # all the HTML tags
    return pattern.sub(r'', text)

def remove_email(text):
    text = re.sub(r'[\w.<>]\w+@\w+[\w.<>]', " ", text)
    return text

def remove_all_special_chars(text):
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    return text

def replace_mult_spaces(text):
    text = text.replace("&quot", "")
    pattern = re.compile(' +')
    text = pattern.sub(r' ', text)
    text = text.strip()
    return text

def replace_chars(text, pattern):
    pattern = re.compile(pattern)
    text = pattern.sub(r'', text)
    return text

# Load or define the DataFrames train_df, val_df, and test_df here...

# Apply cleaning functions to the 'Tweet' column of each DataFrame
train_df['text'] = train_df['text'].apply(clean_tweets)
train_df['text'] = train_df['text'].apply(remove_html)
train_df['text'] = train_df['text'].apply(remove_email)
train_df['text'] = train_df['text'].apply(remove_all_special_chars)
train_df['text'] = train_df['text'].apply(replace_mult_spaces)
train_df['text'] = train_df['text'].apply(lambda x: replace_chars(x, '[()!@&;]'))

val_df['text'] = val_df['text'].apply(clean_tweets)
val_df['text'] = val_df['text'].apply(remove_html)
val_df['text'] = val_df['text'].apply(remove_email)
val_df['text'] = val_df['text'].apply(remove_all_special_chars)
val_df['text'] = val_df['text'].apply(replace_mult_spaces)
val_df['text'] = val_df['text'].apply(lambda x: replace_chars(x, '[()!@&;]'))

test_df['text'] = test_df['text'].apply(clean_tweets)
test_df['text'] = test_df['text'].apply(remove_html)
test_df['text'] = test_df['text'].apply(remove_email)
test_df['text'] = test_df['text'].apply(remove_all_special_chars)
test_df['text'] = test_df['text'].apply(replace_mult_spaces)
test_df['text'] = test_df['text'].apply(lambda x: replace_chars(x, '[()!@&;]'))

# Display the first few rows of each DataFrame to verify the changes
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())

Train Set:
                                                text category
0          triple talaq par burbak kuchh nahi bolega       NO
1  batao ye uss site pr se akki sir ke verdict ni...      YES
2  hindu baheno par julam bardas nahi hoga hindu ...       NO
3  naa bhai aisa nhi hai mere handle karne se bhi...       NO
4  aaj agar musalman auraten triple talaq ki waja...       NO

Validation Set:
                                                 text category
1   batao ye uss site pr se akki sir ke verdict ni...      YES
4   aaj agar musalman auraten triple talaq ki waja...       NO
8   bachcho ki death par politics ke bajay unke li...       NO
11                                 bhaag milkha bhaag       NO
13  bhai kuchh bhi karna iss ke saath movie mat ka...       NO

Test Set:
                                                text category
0          triple talaq par burbak kuchh nahi bolega       NO
2  hindu baheno par julam bardas nahi hoga hindu ...       NO
3  naa bhai aisa nhi hai 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df['text'].apply(clean_tweets)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df['text'].apply(remove_html)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df['text'].apply(remove_email)
A value is trying to be set on a copy 

In [6]:
from collections import Counter

def _get_unique(elems):
    if type(elems[0]) == list:
        corpus = [item for sublist in elems for item in sublist]
    else:
        corpus = elems
    elems, freqs = zip(*Counter(corpus).most_common())
    return list(elems)

def convert_categorical_label_to_int(labels):
    if type(labels[0]) == list:
        uniq_labels = _get_unique(labels)
    else:
        uniq_labels = _get_unique(labels)

    label_to_id = {}
    if type(labels[0]) == list:
        label_to_id = {w: i+1 for i, w in enumerate(uniq_labels)}
    else:
        label_to_id = {w: i for i, w in enumerate(uniq_labels)}

    new_labels = []
    if type(labels[0]) == list:
        for i in labels:
            new_labels.append([label_to_id[j] for j in i])
    else:
        new_labels = [label_to_id[j] for j in labels]

    return new_labels, label_to_id

# Convert categorical labels to integer values
train_df['category'], label2idx = convert_categorical_label_to_int(train_df['category'].values)

val_df['category'], _ = convert_categorical_label_to_int(val_df['category'].values)

test_df['category'], _ = convert_categorical_label_to_int(test_df['category'].values)

# Display the first few rows of each DataFrame to verify the changes
print("Train Set:")
train_df.head()


Train Set:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['category'], label2idx = convert_categorical_label_to_int(train_df['category'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['category'], _ = convert_categorical_label_to_int(val_df['category'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['category'], _ = 

Unnamed: 0,text,category
0,triple talaq par burbak kuchh nahi bolega,0
1,batao ye uss site pr se akki sir ke verdict ni...,1
2,hindu baheno par julam bardas nahi hoga hindu ...,0
3,naa bhai aisa nhi hai mere handle karne se bhi...,0
4,aaj agar musalman auraten triple talaq ki waja...,0


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit(train_df['text'])

train_tfidf = tfidf_vectorizer.transform(train_df['text'])
val_tfidf = tfidf_vectorizer.transform(val_df['text'])
test_tfidf = tfidf_vectorizer.transform(test_df['text'])

# Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])

# Convert text data to sequences
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
val_sequences = tokenizer.texts_to_sequences(val_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])

# Padding sequences to ensure uniform length
max_len = max([len(seq) for seq in train_sequences])
train_sequences = pad_sequences(train_sequences, maxlen=max_len, padding='post')
val_sequences = pad_sequences(val_sequences, maxlen=max_len, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Display shape of the data to verify
print("Train TF-IDF Shape:", train_tfidf)
print("Validation TF-IDF Shape:", val_tfidf.shape)
print("Test TF-IDF Shape:", test_tfidf.shape)

print("Train Sequences Shape:", train_sequences)
print("Validation Sequences Shape:", val_sequences.shape)
print("Test Sequences Shape:", test_sequences.shape)

# Now, you can proceed with training your LSTM model using these preprocessed data.


Train TF-IDF Shape:   (0, 4628)	0.24722599243672708
  (0, 4483)	0.24484836196955587
  (0, 3463)	0.33138722732960807
  (0, 3071)	0.2515423489851793
  (0, 2565)	0.5021339612553597
  (0, 806)	0.6735718413712931
  (1, 4932)	0.1216209708168168
  (1, 4777)	0.2783739246220301
  (1, 4735)	0.2518095301176143
  (1, 4327)	0.30493831912644587
  (1, 4321)	0.16057071090266997
  (1, 4179)	0.09875741022155622
  (1, 3702)	0.20115413605910926
  (1, 3256)	0.3159635389914152
  (1, 3075)	0.18314156646933635
  (1, 2990)	0.22524513561319853
  (1, 2484)	0.08637668337476942
  (1, 2357)	0.0963613600406013
  (1, 2002)	0.2661694018567466
  (1, 1760)	0.27385996984755306
  (1, 1720)	0.16057071090266997
  (1, 1366)	0.14253046032923378
  (1, 757)	0.2379014237694122
  (1, 583)	0.24522348584200004
  (1, 193)	0.2698220997073872
  :	:
  (4195, 2580)	0.22806915470252515
  (4196, 4604)	0.15680410915740645
  (4196, 3991)	0.3695741542148875
  (4196, 3863)	0.49084177738539736
  (4196, 3176)	0.5149815123797418
  (4196, 3075)	0

In [8]:

vocab_size = 10000
  
# Embedding dimension value 
embedding_dim = 200
  
# Max length of sentence 
max_length = 60
  
# pad_sequences arg 
padding_type = 'post'# Import the TensorFlow library 
import tensorflow as tf 
  
# Define a sequential neural network model 
model = tf.keras.Sequential([ 
    # Embedding layer for creating word embeddings 
    tf.keras.layers.Embedding( 
        vocab_size, embedding_dim, input_length=max_length), 
  
    # GlobalMaxPooling layer to extract relevant features 
    tf.keras.layers.GlobalMaxPool1D(), 
  
    # First Dense layer with 40 neurons and ReLU activation 
    tf.keras.layers.Dense(40, activation='relu'), 
  
    # Dropout layer to prevent overfitting 
    tf.keras.layers.Dropout(0.5), 
  
    # Second Dense layer with 20 neurons and ReLU activation 
    tf.keras.layers.Dense(20, activation='relu'), 
  
    # Dropout layer to prevent overfitting 
    tf.keras.layers.Dropout(0.5), 
  
    # Third Dense layer with 10 neurons and ReLU activation 
    tf.keras.layers.Dense(10, activation='relu'), 
  
    # Dropout layer to prevent overfitting 
    tf.keras.layers.Dropout(0.2), 
  
    # Final Dense layer with 1 neuron and sigmoid activation for binary classification 
    tf.keras.layers.Dense(1, activation='sigmoid') 
]) 
  
model.summary() 



In [9]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_sequences, train_df['category'], 
                    epochs=7, batch_size=32, 
                    validation_data=(val_sequences, val_df['category']))

# Evaluate the model on test set
loss, accuracy = model.evaluate(test_sequences, test_df['category'])
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Epoch 1/7
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 32ms/step - accuracy: 0.8064 - loss: 0.5212 - val_accuracy: 0.9048 - val_loss: 0.2776
Epoch 2/7
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.9017 - loss: 0.3065 - val_accuracy: 0.9048 - val_loss: 0.1519
Epoch 3/7
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 38ms/step - accuracy: 0.9040 - loss: 0.1811 - val_accuracy: 0.9048 - val_loss: 0.1108
Epoch 4/7
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 41ms/step - accuracy: 0.9180 - loss: 0.1261 - val_accuracy: 0.9810 - val_loss: 0.0762
Epoch 5/7
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.9752 - loss: 0.0761 - val_accuracy: 0.9810 - val_loss: 0.0559
Epoch 6/7
[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.9883 - loss: 0.0455 - val_accuracy: 0.9771 - val_loss: 0.0524
Epoch 7/7
[1m132/132[0m [

In [29]:
def predict_sarcasm(text):
    # Clean and preprocess the input text
    cleaned_text = clean_tweets(text)
    cleaned_text = remove_html(cleaned_text)
    cleaned_text = remove_email(cleaned_text)
    cleaned_text = remove_all_special_chars(cleaned_text)
    cleaned_text = replace_mult_spaces(cleaned_text)
    cleaned_text = replace_chars(cleaned_text, '[()!@&;]')
    
    # Tokenize and pad the input sequence
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    
    # Make prediction
    prediction = model.predict(sequence)
    
    # Convert prediction to binary label
    binary_prediction = 1 if prediction > 0.5 else 0
    
    # Return prediction
    if binary_prediction == 1:
        return "Sarcasm"
    else:
        return "Not Sarcasm"

# Example usage
input_text = "Hindu Married men to Mr. Modi: triple talaq toh nipat gaya ab woh saat janmoon wala masla bhi nipata dijeyega  @narendramodi"
prediction = predict_sarcasm(input_text)
print("Prediction:", prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Prediction: Not Sarcasm


In [22]:
model.save('model.h5')

