<a href="https://colab.research.google.com/github/lokeshshekapuram/Sarcasm_Detection/blob/main/Source_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Required Libraries


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score

Loading data


In [None]:
def load_data(file_path):
    """Read the sarcasm dataset from a JSON file.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON content (whatever top-level object the file holds).
    """
    # JSON text is UTF-8 by specification; do not rely on the platform's
    # default text encoding, which differs between systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

Training Linear SVC Model


In [None]:
def train_models(df):
    """Fit a bag-of-words + TF-IDF + LinearSVC classifier and print test metrics.

    Each row's context sentences are joined with spaces and concatenated with
    the utterance to form the input text; the 'sarcasm' column, cast to int,
    is the target. The data is split 80/20 with random_state=42, and accuracy
    and F1 on the 20% test part are printed.

    Args:
        df: DataFrame with 'context' (iterable of strings per row),
            'utterance' (string) and 'sarcasm' (castable to int) columns.

    Returns:
        The fitted scikit-learn Pipeline.
    """
    # One text feature per row: joined context followed by the response.
    context_text = df['context'].apply(' '.join)
    features = context_text + ' ' + df['utterance']
    target = df['sarcasm'].astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Token counts -> TF-IDF weighting -> linear support vector classifier.
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC()),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out rows.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print(f"F1 Score: {f1:.2f}")

    return pipeline



ACCURACY CALCULATION

In [None]:

# Location of the dataset file; adjust if the JSON file lives elsewhere.
file_path = '/content/sarcasm_data.json'

# Load JSON data and convert to DataFrame.
# The transpose turns the top-level JSON keys into rows
# (presumably one row per dataset example; verify against the file).
data = load_data(file_path)
df = pd.DataFrame(data).T

# Train models using the DataFrame; train_models also prints accuracy and F1
# for its held-out split.
trained_model = train_models(df)

Accuracy: 0.70
F1 Score: 0.72


In [None]:
def predict_sarcasm(sentence, model):
    """Classify one sentence with a fitted text-classification model.

    Args:
        sentence: Raw text to classify.
        model: Fitted estimator whose ``predict`` accepts a list of raw
            strings (for example the pipeline returned by ``train_models``).

    Returns:
        True when the predicted label is truthy (1 = sarcastic), else False.
    """
    # predict() expects a collection of documents, so wrap the single sentence.
    return bool(model.predict([sentence])[0])


# Example usage: Detect sarcasm in a given sentence
input_sentence = "Since it's not bee season, you can have my epinephrine."
is_sarcastic = predict_sarcasm(input_sentence, trained_model)

if is_sarcastic:
    print("sarcastic.")
else:
    print("not sarcastic.")

not sarcastic.


In [None]:
# Example: classify a single sentence and print the verdict
input_sentence = "I think I'm gonna go. Thank you for the burrito and the pork rinds and the 20-minute lecture on why monster trucks are better than regular trucks."
is_sarcastic = predict_sarcasm(input_sentence, trained_model)

print("sarcastic." if is_sarcastic else "not sarcastic.")

sarcastic.


This approach uses only text vectorization (token counts with TF-IDF weighting) and a classifier trained on the dataset above. It can also classify sentences that are not part of the dataset. For example:

In [None]:
# Example: classify a sentence that does not come from the dataset
input_sentence = "Oh, this is exactly what I need today."
is_sarcastic = predict_sarcasm(input_sentence, trained_model)

print("sarcastic." if is_sarcastic else "not sarcastic.")

sarcastic.


In [None]:
# Example: classify another sentence that does not come from the dataset
input_sentence = "I am a good student.I really like to submit assignments on time"
is_sarcastic = predict_sarcasm(input_sentence, trained_model)

print("sarcastic." if is_sarcastic else "not sarcastic.")

sarcastic.


SVM

In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:

# Step 1: Read JSON file and extract relevant information
def read_json_file(data):
    """Load and return the parsed content of a JSON file.

    Args:
        data: Path of the JSON file to read. The parameter name is kept for
            backward compatibility even though it holds a path.

    Returns:
        The decoded JSON content.
    """
    # Keep the path and the parsed content in separate names, and read the
    # file as UTF-8 rather than with the platform default encoding.
    with open(data, 'r', encoding='utf-8') as file:
        parsed = json.load(file)
    return parsed

# Parse the dataset once; the following cells iterate over this `data` mapping.
data = read_json_file('sarcasm_data.json')


In [None]:

# Collect the utterance text and the sarcasm label of every dataset entry
utterances = [entry['utterance'] for entry in data.values()]
labels = [entry['sarcasm'] for entry in data.values()]

# TF-IDF features over the utterances; max_features caps the vocabulary size
# and can be adjusted as needed
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(utterances)

In [None]:
from sklearn.metrics import accuracy_score

# Split the raw text *before* vectorizing so that the TF-IDF vocabulary and
# IDF weights are learned from the training portion only. Fitting the
# vectorizer on every utterance and splitting afterwards leaks test-set
# statistics into the training features.
train_texts, test_texts, y_train, y_test = train_test_split(
    utterances, labels, test_size=0.2, random_state=42
)
X_train = vectorizer.fit_transform(train_texts)  # re-fit on training text only
X_test = vectorizer.transform(test_texts)        # reuse the training vocabulary

svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Step 5: Evaluate the model
predictions = svm_classifier.predict(X_test)
print(classification_report(y_test, predictions))

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


              precision    recall  f1-score   support

       False       0.44      0.62      0.52        53
        True       0.68      0.51      0.58        85

    accuracy                           0.55       138
   macro avg       0.56      0.56      0.55       138
weighted avg       0.59      0.55      0.56       138

Accuracy: 0.5507246376811594


Random Forest and Logistic Regression

In [None]:
import json
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report



Load and preprocess the data

In [None]:

# Step 1: Load and preprocess the data
def load_data(file_path):
    """Read the sarcasm dataset from a JSON file.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON content.
    """
    # Read explicitly as UTF-8 instead of the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def preprocess_data(data):
    """Build model inputs and integer labels from the raw dataset mapping.

    Args:
        data: Mapping whose values each hold 'context' (iterable of strings),
            'utterance' (string) and 'sarcasm' (castable to int).

    Returns:
        A tuple ``(X, y)``: ``X`` is a list of strings made of the
        space-joined context followed by the utterance, and ``y`` is a NumPy
        array of the labels converted to int.
    """
    entries = list(data.values())
    texts = [
        " ".join(entry['context']) + " " + entry['utterance']
        for entry in entries
    ]
    targets = [int(entry['sarcasm']) for entry in entries]
    return texts, np.array(targets)



Extracting features using TF-IDF

In [None]:

# Step 2: Extract features using TF-IDF
def extract_features(X_train, X_val, max_features=5000):
    """Turn raw text into TF-IDF matrices.

    The vectorizer is fitted on the training texts only and then applied to
    the validation texts.

    Args:
        X_train: Training documents (iterable of strings).
        X_val: Validation documents (iterable of strings).
        max_features: Vocabulary size limit passed to TfidfVectorizer.

    Returns:
        A tuple ``(train_matrix, val_matrix)`` of TF-IDF feature matrices.
    """
    tfidf = TfidfVectorizer(max_features=max_features)
    train_matrix = tfidf.fit_transform(X_train)
    return train_matrix, tfidf.transform(X_val)


In [None]:

# Step 3: Train and evaluate classifiers
def train_and_evaluate_classifier(X_train, y_train, X_val, y_val, classifier='random_forest'):
    """Fit the chosen classifier and print its validation classification report.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.
        classifier: Either 'random_forest' or 'logistic_regression'.

    Raises:
        ValueError: If ``classifier`` is not one of the two supported names.
    """
    # Reject unknown names up front.
    if classifier not in ('random_forest', 'logistic_regression'):
        raise ValueError("Invalid classifier specified.")

    clf = (
        RandomForestClassifier(n_estimators=100, random_state=42)
        if classifier == 'random_forest'
        else LogisticRegression(max_iter=1000, random_state=42)
    )
    clf.fit(X_train, y_train)

    # Evaluation
    print("Classification Report:")
    print(classification_report(y_val, clf.predict(X_val)))


Loading and preprocessing data

In [None]:

# Load and preprocess data
data = load_data('sarcasm_data.json')
X, y = preprocess_data(data)

# Split data into train and validation sets (80/20, fixed seed so the split
# is repeatable)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Extract features using TF-IDF. Because the split happens first,
# extract_features fits its vectorizer on the training texts only.
X_train_tfidf, X_val_tfidf = extract_features(X_train, X_val)


Training and Evaluating Classifiers

Random Forest


In [None]:


# Fit the random-forest baseline on the TF-IDF features and print its
# validation classification report.
print("Using Random Forest Classifier:")
train_and_evaluate_classifier(X_train_tfidf, y_train, X_val_tfidf, y_val, classifier='random_forest')


Using Random Forest Classifier:
Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.79      0.59        53
           1       0.77      0.44      0.56        85

    accuracy                           0.57       138
   macro avg       0.62      0.61      0.57       138
weighted avg       0.65      0.57      0.57       138



Logistic Regression

In [None]:

# Fit logistic regression on the same TF-IDF features and print its
# validation classification report.
print("\nUsing Logistic Regression Classifier:")
train_and_evaluate_classifier(X_train_tfidf, y_train, X_val_tfidf, y_val, classifier='logistic_regression')



Using Logistic Regression Classifier:
Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.94      0.70        53
           1       0.94      0.53      0.68        85

    accuracy                           0.69       138
   macro avg       0.75      0.74      0.69       138
weighted avg       0.79      0.69      0.69       138



BERT

In [None]:
import json
import numpy as np
from sklearn.model_selection import StratifiedKFold
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score, classification_report


Loading and Preprocessing data


In [None]:

# Define the device (GPU if available, otherwise CPU).
# This is a module-level name: the fine-tuning and evaluation functions in
# this section read it directly instead of taking it as a parameter.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data
def load_data(file_path):
    """Read the sarcasm dataset from a JSON file.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON content.
    """
    # Read explicitly as UTF-8 instead of the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Preprocess data
def preprocess_data(data):
    """Extract utterance texts and integer labels from the dataset mapping.

    Unlike the context-aware variant defined earlier in this notebook, only
    the 'utterance' field is used as input text here.

    Args:
        data: Mapping whose values each hold 'utterance' and 'sarcasm'.

    Returns:
        A tuple ``(X, y)`` of two lists: the utterance strings and the labels
        converted to int.
    """
    texts, targets = [], []
    for entry in data.values():
        texts.append(entry['utterance'])
        targets.append(int(entry['sarcasm']))
    return texts, targets


BERT Implementation

In [None]:

# Tokenize data using BERT tokenizer
def tokenize_data(texts, labels, tokenizer, max_length):
    """Encode texts with the tokenizer and stack the results into tensors.

    Every text is encoded on its own with special tokens added, padded or
    truncated to ``max_length``, and returned as PyTorch tensors; the
    per-text tensors are then concatenated along dimension 0.

    Args:
        texts: Iterable of strings to encode.
        labels: Labels accepted by ``torch.tensor``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_length: Length every sequence is padded/truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of tensors.
    """
    encodings = [
        tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        for text in texts
    ]
    input_ids = torch.cat([enc['input_ids'] for enc in encodings], dim=0)
    attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim=0)
    return input_ids, attention_masks, torch.tensor(labels)

# Define BERT model and optimizer
def initialize_model():
    """Create a fresh BERT sequence classifier and its optimizer.

    Returns:
        A tuple ``(model, optimizer)``: a ``bert-base-uncased``
        BertForSequenceClassification with two output labels, and an AdamW
        optimizer over all of its parameters with learning rate 2e-5.
    """
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False
    )
    # The AdamW class shipped with transformers is deprecated in favour of the
    # PyTorch implementation. eps and weight_decay are set explicitly to the
    # defaults of the deprecated class (they differ from the PyTorch
    # defaults) so the optimisation setup stays comparable.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr = 2e-5, eps = 1e-6, weight_decay = 0.0
    )
    return model, optimizer

# Fine-tune BERT model
def fine_tune_BERT_model(model, optimizer, train_dataloader, val_dataloader, epochs=3):
    """Fine-tune the model and print the mean training loss of every epoch.

    Args:
        model: Model called with input ids, ``token_type_ids``,
            ``attention_mask`` and ``labels``; its output must expose ``loss``.
        optimizer: Optimizer stepping the model's parameters.
        train_dataloader: Iterable of batches indexed as
            (input ids, attention mask, labels).
        val_dataloader: Accepted for interface compatibility; not used here.
        epochs: Number of passes over ``train_dataloader``.
    """
    model.train()
    for epoch in range(epochs):
        batch_losses = []
        for batch in train_dataloader:
            # Move the three batch tensors to the module-level device.
            b_input_ids, b_input_mask, b_labels = (
                batch[0].to(device),
                batch[1].to(device),
                batch[2].to(device),
            )
            model.zero_grad()
            loss = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
                labels=b_labels,
            ).loss
            batch_losses.append(loss.item())
            loss.backward()
            # Clip the gradient norm to 1.0 before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
        avg_train_loss = sum(batch_losses) / len(train_dataloader)
        print(f'Epoch {epoch + 1}/{epochs}, Average training loss: {avg_train_loss}')


Function for Model Evaluation

In [None]:

# Evaluate model
def evaluate_model(model, val_dataloader):
    """Print validation accuracy and a classification report for the model.

    Args:
        model: Model whose output exposes ``logits`` when called with input
            ids, ``token_type_ids`` and ``attention_mask``.
        val_dataloader: Iterable of batches indexed as
            (input ids, attention mask, labels).
    """
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            logits = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask,
            ).logits
            # Predicted class = index of the largest logit in each row.
            val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy().tolist())
            val_labels.extend(b_labels.cpu().numpy().tolist())
    accuracy = accuracy_score(val_labels, val_preds)
    print(f'Validation accuracy: {accuracy }')
    print(classification_report(val_labels, val_preds))


In [None]:

# Main function
def main():
    """Run 5-fold stratified cross-validation of the BERT classifier.

    Loads 'sarcasm_data.json', tokenizes the utterances to 128 tokens and, for
    every fold, builds the data loaders (batch size 32), creates a fresh
    model and optimizer, fine-tunes it and prints the validation metrics.
    """
    # Load and preprocess the dataset
    data = load_data('sarcasm_data.json')
    X, y = preprocess_data(data)

    # Tokenize once; the folds below only index into these tensors
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    max_length = 128
    input_ids, attention_masks, labels = tokenize_data(X, y, tokenizer, max_length)

    num_folds = 5
    batch_size = 32
    skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(input_ids, labels), start=1):
        print(f'Fold {fold}/{num_folds}')

        # Datasets for this fold
        train_data = TensorDataset(
            input_ids[train_index], attention_masks[train_index], labels[train_index]
        )
        val_data = TensorDataset(
            input_ids[val_index], attention_masks[val_index], labels[val_index]
        )

        # Shuffle training batches; keep validation batches in order
        train_dataloader = DataLoader(
            train_data, sampler=RandomSampler(train_data), batch_size=batch_size
        )
        val_dataloader = DataLoader(
            val_data, sampler=SequentialSampler(val_data), batch_size=batch_size
        )

        # Every fold starts from a freshly initialised model and optimizer
        model, optimizer = initialize_model()
        model.to(device)

        fine_tune_BERT_model(model, optimizer, train_dataloader, val_dataloader)
        evaluate_model(model, val_dataloader)


if __name__ == "__main__":
    main()


Fold 1/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average training loss: 0.6943073140250312
Epoch 2/3, Average training loss: 0.6459419942564435
Epoch 3/3, Average training loss: 0.5975324908892313
Validation accuracy: 0.7749999999999999
              precision    recall  f1-score   support

           0       0.67      0.70      0.68        69
           1       0.68      0.65      0.67        69

    accuracy                           0.67       138
   macro avg       0.67      0.67      0.67       138
weighted avg       0.67      0.67      0.67       138

Fold 2/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average training loss: 0.6721483402782016
Epoch 2/3, Average training loss: 0.5972778068648444
Epoch 3/3, Average training loss: 0.46029050317075515
Validation accuracy: 0.725
              precision    recall  f1-score   support

           0       0.59      0.83      0.69        69
           1       0.71      0.43      0.54        69

    accuracy                           0.63       138
   macro avg       0.65      0.63      0.62       138
weighted avg       0.65      0.63      0.62       138

Fold 3/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average training loss: 0.6864429844750298
Epoch 2/3, Average training loss: 0.6479190256860521
Epoch 3/3, Average training loss: 0.5039657900730768
Validation accuracy: 0.6833333333333332
              precision    recall  f1-score   support

           0       0.57      0.75      0.65        69
           1       0.64      0.43      0.52        69

    accuracy                           0.59       138
   macro avg       0.60      0.59      0.58       138
weighted avg       0.60      0.59      0.58       138

Fold 4/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average training loss: 0.6896953781445821
Epoch 2/3, Average training loss: 0.5917832354704539
Epoch 3/3, Average training loss: 0.480231084757381
Validation accuracy: 0.7
              precision    recall  f1-score   support

           0       0.58      0.75      0.66        69
           1       0.65      0.46      0.54        69

    accuracy                           0.61       138
   macro avg       0.62      0.61      0.60       138
weighted avg       0.62      0.61      0.60       138

Fold 5/5


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average training loss: 0.6904185911019644
Epoch 2/3, Average training loss: 0.6311278541882833
Epoch 3/3, Average training loss: 0.536032486293051
Validation accuracy: 0.7416666666666666
              precision    recall  f1-score   support

           0       0.69      0.52      0.60        69
           1       0.62      0.77      0.68        69

    accuracy                           0.64       138
   macro avg       0.65      0.64      0.64       138
weighted avg       0.65      0.64      0.64       138



BERT AND GLOVE

In [None]:
# Use the explicit %pip magic so the package is installed into the running
# kernel's environment and the cell does not depend on IPython automagic.
%pip install transformers




Importing Required Libraries


In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import accuracy_score, f1_score


Load and Preprocess the Dataset

In [None]:
class SarcasmDataset(Dataset):
    """Torch dataset that tokenizes context + utterance rows on access.

    Args:
        data: DataFrame with 'context', 'utterance' and 'sarcasm' columns.
        tokenizer: Callable tokenizer returning 'input_ids' and
            'attention_mask' tensors (e.g. a BERT tokenizer).
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded example at position ``idx``.

        Returns:
            Dict with 1-D 'input_ids' and 'attention_mask' tensors and a
            'label' tensor built from the row's 'sarcasm' value.
        """
        row = self.data.iloc[idx]

        # 'context' holds a list of sentences. Calling str() on the list would
        # feed its Python repr (brackets, quotes, commas) to the tokenizer, so
        # join the sentences with spaces instead.
        context = row['context']
        if isinstance(context, (list, tuple)):
            context = ' '.join(str(sentence) for sentence in context)
        else:
            context = str(context)

        utterance = str(row['utterance'])
        text = context + ' ' + utterance
        label = row['sarcasm']
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        item = {
            # The tokenizer returns a leading batch dimension; flatten it away.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label)
        }
        return item


In [None]:

def load_data(file_path):
    """Read the sarcasm dataset from a JSON file.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The decoded JSON content.
    """
    # Read explicitly as UTF-8 instead of the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

class SarcasmClassifier(torch.nn.Module):
    """BERT encoder followed by dropout and a single-logit linear head."""

    def __init__(self):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        hidden_size = self.bert.config.hidden_size
        self.dropout = torch.nn.Dropout(0.1)
        self.linear = torch.nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one raw logit per example, computed from BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.linear(self.dropout(encoded.pooler_output))


In [None]:

# Location of the dataset file; adjust if the JSON file lives elsewhere.
file_path = '/content/sarcasm_data.json'
data = load_data(file_path)
# Transpose so that each top-level JSON key becomes one row.
df = pd.DataFrame(data).T
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
dataset = SarcasmDataset(df, tokenizer)
# NOTE(review): the Dataset object itself is handed to train_test_split.
# Presumably it is indexed through __getitem__, so every example is tokenized
# here and both splits become plain lists of item dicts. Confirm.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Training

In [None]:

def train_model(model, train_loader, optimizer, criterion, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning one logit per example.
        train_loader: Sized iterable of batches, each a dict with
            'input_ids', 'attention_mask' and 'label' tensors.
        optimizer: Optimizer stepping the model's parameters.
        criterion: Loss taking ``(logits, float_labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].float().to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # Align the logits with the labels explicitly. A bare squeeze() also
        # removes the batch dimension when a batch holds a single example,
        # which makes the loss fail with a shape mismatch.
        loss = criterion(outputs.reshape(labels.shape), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)


Evaluation

In [None]:

# Training setup: device, model, optimizer, loss and shuffled training batches.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SarcasmClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# BCEWithLogitsLoss works on raw logits, matching the single-logit output of
# SarcasmClassifier.
criterion = torch.nn.BCEWithLogitsLoss()
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)

# Five passes over the training data, printing the mean batch loss of each.
for epoch in range(5):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    print(f'Epoch {epoch + 1}, Train Loss: {train_loss:.4f}')

def evaluate_model(model, test_loader, device):
    """Print accuracy and F1 of a single-logit classifier on the test batches.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` and
            returning one logit per example.
        test_loader: Iterable of batches, each a dict with 'input_ids',
            'attention_mask' and 'label' tensors.
        device: Device the input tensors are moved to.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask)
            # Threshold the sigmoid probability at 0.5, then flatten so that
            # predictions and labels are both flat lists of ints. Previously
            # the predictions were collected as length-1 arrays while the
            # labels were scalars.
            predicted_labels = (torch.sigmoid(outputs) > 0.5).int().reshape(-1)
            predictions.extend(predicted_labels.cpu().tolist())
            true_labels.extend(batch['label'].int().reshape(-1).tolist())

    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    print(f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}')

# Evaluate on the held-out split; no shuffling is needed for evaluation.
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)
evaluate_model(model, test_loader, device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1, Train Loss: 0.6726
Epoch 2, Train Loss: 0.6436
Epoch 3, Train Loss: 0.5722
Epoch 4, Train Loss: 0.4572
Epoch 5, Train Loss: 0.3299
Accuracy: 0.8384, F1 Score: 0.8607
