In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
import os
import pickle
from collections import Counter

# Import the required tokenizer
from transformers import BertTokenizer

In [2]:
import pandas as pd
from sklearn.model_selection import KFold

# Read the data
df = pd.read_csv('Sarcasm data.txt', sep='\t', header=None, usecols=[0, 1])
df.columns = ['text', 'category']

# Drop rows with missing values and empty text
df = df.dropna()
df = df[df['text'] != '']

# Split the data using KFold
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(df['text']):
    break

train_df = df.iloc[train_index]

# Further split the test set into validation and test sets using KFold
kf2 = KFold(n_splits=2, shuffle=True, random_state=42)
for val_index, test_index in kf2.split(df.iloc[test_index]['text']):
    break

val_df = df.iloc[val_index]
test_df = df.iloc[test_index]

# Display the first few rows of each split
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())

Train Set:
                                                text category
0          Triple Talaq par Burbak Kuchh nahi bolega       NO
1  Batao ye uss site pr se akki sir ke verdict ni...      YES
2  Hindu baheno par julam bardas nahi hoga @Tripl...       NO
3  Naa bhai.. aisa nhi hai.. mere handle karne se...       NO
4  #RememberingRajiv aaj agar musalman auraten tr...       NO

Validation Set:
                                                 text category
1   Batao ye uss site pr se akki sir ke verdict ni...      YES
4   #RememberingRajiv aaj agar musalman auraten tr...       NO
8   Bachcho ki death par politics ke bajay unke li...       NO
11  #Bollywood @amitdey10510709   Bhaag Milkha Bhaag!       NO
13  Bhai kuchh bhi karna iss @SimplySajidK ke saat...       NO

Test Set:
                                                text category
0          Triple Talaq par Burbak Kuchh nahi bolega       NO
2  Hindu baheno par julam bardas nahi hoga @Tripl...       NO
3  Naa bhai.. aisa nhi ha

In [3]:
print (train_df.shape, val_df.shape, test_df.shape)

(4200, 2) (525, 2) (525, 2)


In [4]:
train_df.head(5)

Unnamed: 0,text,category
0,Triple Talaq par Burbak Kuchh nahi bolega,NO
1,Batao ye uss site pr se akki sir ke verdict ni...,YES
2,Hindu baheno par julam bardas nahi hoga @Tripl...,NO
3,Naa bhai.. aisa nhi hai.. mere handle karne se...,NO
4,#RememberingRajiv aaj agar musalman auraten tr...,NO


In [5]:
import pandas as pd
import re

# Define the cleaning functions
def clean_tweets(text):
    text = text.lower()
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'\d+', '', text)
    return text.strip()

def remove_html(text):
    text = text.replace("\n", " ")
    pattern = re.compile('<.*?>')  # all the HTML tags
    return pattern.sub(r'', text)

def remove_email(text):
    text = re.sub(r'[\w.<>]\w+@\w+[\w.<>]', " ", text)
    return text

def remove_all_special_chars(text):
    text = re.sub("[^a-zA-Z0-9]", " ", text)
    return text

def replace_mult_spaces(text):
    text = text.replace("&quot", "")
    pattern = re.compile(' +')
    text = pattern.sub(r' ', text)
    text = text.strip()
    return text

def replace_chars(text, pattern):
    pattern = re.compile(pattern)
    text = pattern.sub(r'', text)
    return text

# Load or define the DataFrames train_df, val_df, and test_df here...

# Apply cleaning functions to the 'Tweet' column of each DataFrame
train_df['text'] = train_df['text'].apply(clean_tweets)
train_df['text'] = train_df['text'].apply(remove_html)
train_df['text'] = train_df['text'].apply(remove_email)
train_df['text'] = train_df['text'].apply(remove_all_special_chars)
train_df['text'] = train_df['text'].apply(replace_mult_spaces)
train_df['text'] = train_df['text'].apply(lambda x: replace_chars(x, '[()!@&;]'))

val_df['text'] = val_df['text'].apply(clean_tweets)
val_df['text'] = val_df['text'].apply(remove_html)
val_df['text'] = val_df['text'].apply(remove_email)
val_df['text'] = val_df['text'].apply(remove_all_special_chars)
val_df['text'] = val_df['text'].apply(replace_mult_spaces)
val_df['text'] = val_df['text'].apply(lambda x: replace_chars(x, '[()!@&;]'))

test_df['text'] = test_df['text'].apply(clean_tweets)
test_df['text'] = test_df['text'].apply(remove_html)
test_df['text'] = test_df['text'].apply(remove_email)
test_df['text'] = test_df['text'].apply(remove_all_special_chars)
test_df['text'] = test_df['text'].apply(replace_mult_spaces)
test_df['text'] = test_df['text'].apply(lambda x: replace_chars(x, '[()!@&;]'))

# Display the first few rows of each DataFrame to verify the changes
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())

Train Set:
                                                text category
0          triple talaq par burbak kuchh nahi bolega       NO
1  batao ye uss site pr se akki sir ke verdict ni...      YES
2  hindu baheno par julam bardas nahi hoga hindu ...       NO
3  naa bhai aisa nhi hai mere handle karne se bhi...       NO
4  aaj agar musalman auraten triple talaq ki waja...       NO

Validation Set:
                                                 text category
1   batao ye uss site pr se akki sir ke verdict ni...      YES
4   aaj agar musalman auraten triple talaq ki waja...       NO
8   bachcho ki death par politics ke bajay unke li...       NO
11                                 bhaag milkha bhaag       NO
13  bhai kuchh bhi karna iss ke saath movie mat ka...       NO

Test Set:
                                                text category
0          triple talaq par burbak kuchh nahi bolega       NO
2  hindu baheno par julam bardas nahi hoga hindu ...       NO
3  naa bhai aisa nhi hai 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df['text'].apply(clean_tweets)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df['text'].apply(remove_html)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df['text'].apply(remove_email)
A value is trying to be set on a copy 

In [6]:
train_df.text.apply(lambda x: len(x)).describe()

count    4200.000000
mean       94.653333
std        33.336076
min         4.000000
25%        69.000000
50%        99.000000
75%       124.000000
max       167.000000
Name: text, dtype: float64

In [7]:
train_df.text.apply(lambda x: len(x.split())).describe()

count    4200.000000
mean       18.534048
std         6.682988
min         1.000000
25%        14.000000
50%        19.000000
75%        24.000000
max        34.000000
Name: text, dtype: float64

In [8]:
# Convert values in the 'category' column to uppercase
train_df['category'] = train_df['category'].str.upper()
val_df['category'] = val_df['category'].str.upper()
test_df['category'] = test_df['category'].str.upper()

# Display the first few rows of each DataFrame to verify the changes
print("Train Set:")
print(train_df.head())
print("\nValidation Set:")
print(val_df.head())
print("\nTest Set:")
print(test_df.head())

Train Set:
                                                text category
0          triple talaq par burbak kuchh nahi bolega       NO
1  batao ye uss site pr se akki sir ke verdict ni...      YES
2  hindu baheno par julam bardas nahi hoga hindu ...       NO
3  naa bhai aisa nhi hai mere handle karne se bhi...       NO
4  aaj agar musalman auraten triple talaq ki waja...       NO

Validation Set:
                                                 text category
1   batao ye uss site pr se akki sir ke verdict ni...      YES
4   aaj agar musalman auraten triple talaq ki waja...       NO
8   bachcho ki death par politics ke bajay unke li...       NO
11                                 bhaag milkha bhaag       NO
13  bhai kuchh bhi karna iss ke saath movie mat ka...       NO

Test Set:
                                                text category
0          triple talaq par burbak kuchh nahi bolega       NO
2  hindu baheno par julam bardas nahi hoga hindu ...       NO
3  naa bhai aisa nhi hai 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['category'] = train_df['category'].str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['category'] = val_df['category'].str.upper()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['category'] = test_df['category'].str.upper()


In [9]:
from collections import Counter

def _get_unique(elems):
    if type(elems[0]) == list:
        corpus = [item for sublist in elems for item in sublist]
    else:
        corpus = elems
    elems, freqs = zip(*Counter(corpus).most_common())
    return list(elems)

def convert_categorical_label_to_int(labels):
    if type(labels[0]) == list:
        uniq_labels = _get_unique(labels)
    else:
        uniq_labels = _get_unique(labels)

    label_to_id = {}
    if type(labels[0]) == list:
        label_to_id = {w: i+1 for i, w in enumerate(uniq_labels)}
    else:
        label_to_id = {w: i for i, w in enumerate(uniq_labels)}

    new_labels = []
    if type(labels[0]) == list:
        for i in labels:
            new_labels.append([label_to_id[j] for j in i])
    else:
        new_labels = [label_to_id[j] for j in labels]

    return new_labels, label_to_id

# Convert categorical labels to integer values
train_df['category'], label2idx = convert_categorical_label_to_int(train_df['category'].values)

val_df['category'], _ = convert_categorical_label_to_int(val_df['category'].values)

test_df['category'], _ = convert_categorical_label_to_int(test_df['category'].values)

# Display the first few rows of each DataFrame to verify the changes
print("Train Set:")
train_df.head()


Train Set:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['category'], label2idx = convert_categorical_label_to_int(train_df['category'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df['category'], _ = convert_categorical_label_to_int(val_df['category'].values)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['category'], _ = 

Unnamed: 0,text,category
0,triple talaq par burbak kuchh nahi bolega,0
1,batao ye uss site pr se akki sir ke verdict ni...,1
2,hindu baheno par julam bardas nahi hoga hindu ...,0
3,naa bhai aisa nhi hai mere handle karne se bhi...,0
4,aaj agar musalman auraten triple talaq ki waja...,0


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm import tqdm

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit(train_df['text'])

train_tfidf = tfidf_vectorizer.transform(train_df['text'])
val_tfidf = tfidf_vectorizer.transform(val_df['text'])
test_tfidf = tfidf_vectorizer.transform(test_df['text'])

# Tokenizer
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_df['text'])

# Convert text data to sequences
train_sequences = tokenizer.texts_to_sequences(train_df['text'])
val_sequences = tokenizer.texts_to_sequences(val_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])

# Padding sequences to ensure uniform length
max_len = max([len(seq) for seq in train_sequences])
train_sequences = pad_sequences(train_sequences, maxlen=max_len, padding='post')
val_sequences = pad_sequences(val_sequences, maxlen=max_len, padding='post')
test_sequences = pad_sequences(test_sequences, maxlen=max_len, padding='post')

# Display shape of the data to verify
print("Train TF-IDF Shape:", train_tfidf.shape)
print("Validation TF-IDF Shape:", val_tfidf.shape)
print("Test TF-IDF Shape:", test_tfidf.shape)

print("Train Sequences Shape:", train_sequences.shape)
print("Validation Sequences Shape:", val_sequences.shape)
print("Test Sequences Shape:", test_sequences.shape)

# Now, you can proceed with training your LSTM model using these preprocessed data.


Train TF-IDF Shape: (4200, 5000)
Validation TF-IDF Shape: (525, 5000)
Test TF-IDF Shape: (525, 5000)
Train Sequences Shape: (4200, 32)
Validation Sequences Shape: (525, 32)
Test Sequences Shape: (525, 32)


In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Define LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_sequences, train_df['category'], 
                    epochs=5, batch_size=32, 
                    validation_data=(val_sequences, val_df['category']))

# Evaluate the model on test set
loss, accuracy = model.evaluate(test_sequences, test_df['category'])
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.13834980130195618
Test Accuracy: 0.961904764175415


In [15]:
def predict_sarcasm(text):
    # Clean and preprocess the input text
    cleaned_text = clean_tweets(text)
    cleaned_text = remove_html(cleaned_text)
    cleaned_text = remove_email(cleaned_text)
    cleaned_text = remove_all_special_chars(cleaned_text)
    cleaned_text = replace_mult_spaces(cleaned_text)
    cleaned_text = replace_chars(cleaned_text, '[()!@&;]')
    
    # Tokenize and pad the input sequence
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    sequence = pad_sequences(sequence, maxlen=max_len, padding='post')
    
    # Make prediction
    prediction = model.predict(sequence)
    
    # Convert prediction to binary label
    binary_prediction = 1 if prediction > 0.5 else 0
    
    # Return prediction
    if binary_prediction == 1:
        return "Sarcasm"
    else:
        return "Not Sarcasm"

# Example usage
input_text = "nikitaa tu patli ho gayi hai"
prediction = predict_sarcasm(input_text)
print("Prediction:", prediction)

Prediction: Not Sarcasm
