In [28]:
pip install nltk comet-ml emoji unbabel-comet

Collecting unbabel-comet
  Downloading unbabel_comet-2.2.2-py3-none-any.whl.metadata (15 kB)
Collecting entmax<2.0,>=1.1 (from unbabel-comet)
  Downloading entmax-1.3-py3-none-any.whl.metadata (348 bytes)
Collecting jsonargparse==3.13.1 (from unbabel-comet)
  Downloading jsonargparse-3.13.1-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5.0.0,>=4.24.4 (from unbabel-comet)
  Downloading protobuf-4.25.5-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting pytorch-lightning<3.0.0,>=2.0.0 (from unbabel-comet)
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting sacrebleu<3.0.0,>=2.0.0 (from unbabel-comet)
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece<0.2.0,>=0.1

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/MyDrive/SarcasmNLP')

Mounted at /content/drive


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import emoji

df_isarcasm = pd.read_csv('/content/isarcasm2022.csv', sep=',')
df_gpt = pd.read_csv('/content/GPT_pairs.tsv', sep='\t')

print(df_isarcasm.head())
print(df_gpt.head())

# Add 'is_sarcastic' column with value 1 for all rows in df_gpt
df_gpt['is_sarcastic'] = 1

# Adjust the column names if necessary to match the actual names in your dataset.
# Concatenate the two DataFrames with renamed columns for consistency
df = pd.concat([
    df_isarcasm[['tweet', 'rephrase', 'sarcastic']].rename(
        columns={'tweet': 'text', 'rephrase': 'translation', 'sarcastic': 'is_sarcastic'}
    ),
    df_gpt[['Sarcastic', 'Translation' , 'is_sarcastic']].rename(
        columns={'Sarcastic': 'text', 'Translation': 'translation', 'is_sarcastic': 'is_sarcastic'}
    )
], ignore_index=True)

# Display the first 5 rows of the final DataFrame
print(df.head())

   Unnamed: 0                                              tweet  sarcastic  \
0           0  The only thing I got from college is a caffein...          1   
1           1  I love it when professors draw a big question ...          1   
2           2  Remember the hundred emails from companies whe...          1   
3           3  Today my pop-pop told me I was not “forced” to...          1   
4           4  @VolphanCarol @littlewhitty @mysticalmanatee I...          1   

                                            rephrase  sarcasm  irony  satire  \
0  College is really difficult, expensive, tiring...      0.0    1.0     0.0   
1  I do not like when professors don’t write out ...      1.0    0.0     0.0   
2  I, at the bare minimum, wish companies actuall...      0.0    1.0     0.0   
3  Today my pop-pop told me I was not "forced" to...      1.0    0.0     0.0   
4  I would say Ted Cruz is an asshole and doesn’t...      1.0    0.0     0.0   

   understatement  overstatement  rhetorical

In [4]:
# Remove rows with null values in the 'text' column
df = df.dropna(subset=['text'])

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5780 entries, 0 to 5780
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          5780 non-null   object
 1   translation   3180 non-null   object
 2   is_sarcastic  5780 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 180.6+ KB


In [5]:
# Function to convert emojis to text, handling float values
def convert_emojis(text):
    # Check if text is a float (potentially NaN) and convert to string
    if isinstance(text, float):
        text = str(text)
    return emoji.demojize(text, delimiters=(" ", " "))

# Apply emoji conversion to both input (sarcastic) and output (literal) text
df['text'] = df['text'].apply(convert_emojis)
df[:10]

Unnamed: 0,text,translation,is_sarcastic
0,The only thing I got from college is a caffein...,"College is really difficult, expensive, tiring...",1
1,I love it when professors draw a big question ...,I do not like when professors don’t write out ...,1
2,Remember the hundred emails from companies whe...,"I, at the bare minimum, wish companies actuall...",1
3,Today my pop-pop told me I was not “forced” to...,"Today my pop-pop told me I was not ""forced"" to...",1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,I would say Ted Cruz is an asshole and doesn’t...,1
5,"@jimrossignol I choose to interpret it as ""XD""...",It's a terrible name and the product sounds aw...,1
6,Why would Alexa's recipe for Yorkshire pudding...,Great recipe from Alexa,1
7,someone hit me w a horse tranquilizer istg ive...,Simply “I’m miserable.”,1
8,Loving season 4 of trump does America. Funnies...,this last year of trumps presidency is not goi...,1
9,Holly Arnold ??? Who #ImACeleb #MBE nope not ...,"Holly Arnold seem like a nice lady, just feel ...",1


In [10]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,EarlyStoppingCallback
from torch.utils.data import Dataset
import torch

# Define tokenizer and encode text function
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def encode_texts(texts, targets=None):
    if targets is None:  # For single input (sarcasm detection)
        return tokenizer(
            texts.tolist(),
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
    else:  # For two inputs (sarcasm interpretation)
        inputs = interpretation_tokenizer(
            texts.tolist(),
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        targets = interpretation_tokenizer(
            targets.tolist(),
            padding=True,
            truncation=True,
            max_length=128,  # Adjust max length if needed
            return_tensors="pt"
        )
        return inputs, targets



In [21]:
# Prepare text and labels for sarcasm detection
X = df['text']
y = df['is_sarcastic']  # Assume this column has binary labels (1 for sarcastic, 0 for non-sarcastic)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom dataset class for sarcasm classification
class SarcasmClassificationDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = encode_texts(texts)
        self.labels = torch.tensor(labels.values)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': self.texts['input_ids'][idx],
            'attention_mask': self.texts['attention_mask'][idx],
            'labels': self.labels[idx]
        }

# Initialize train and test datasets
train_dataset = SarcasmClassificationDataset(X_train, y_train)
test_dataset = SarcasmClassificationDataset(X_test, y_test)

# Load BERT model for binary classification
classifier_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
classifier_model.to(device)  # Move model to GPU

# Define training arguments
training_args = TrainingArguments(
    output_dir='./sarcasm_classifier_results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    weight_decay=0.01,
    learning_rate=2e-5,
    save_strategy="epoch"
)

# Initialize Trainer for sarcasm classification
trainer = Trainer(
    model=classifier_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Train the sarcasm classifier
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3779,0.356477
2,0.3195,0.36291
3,0.2471,0.707633


TrainOutput(global_step=1734, training_loss=0.3016655271853543, metrics={'train_runtime': 403.0029, 'train_samples_per_second': 114.739, 'train_steps_per_second': 14.342, 'total_flos': 891083144520000.0, 'train_loss': 0.3016655271853543, 'epoch': 3.0})

In [22]:
model_path = '/content/drive/MyDrive/SarcasmNLP/classification'

# Save the model CBOW2 for hotel reviews dataset
trainer.save_model(model_path)

print(f'Model saved to {model_path}')

Model saved to /content/drive/MyDrive/SarcasmNLP/classification


In [17]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Initialize T5 tokenizer and model for text generation
interpretation_tokenizer = T5Tokenizer.from_pretrained('t5-small')
interpretation_model = T5ForConditionalGeneration.from_pretrained('t5-small')

# Custom dataset for sarcasm interpretation
class SarcasmInterpretationDataset(Dataset):
    def __init__(self, texts, targets):
        self.inputs, self.targets = encode_texts(texts, targets)

    def __len__(self):
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs['input_ids'][idx],
            'attention_mask': self.inputs['attention_mask'][idx],
            'labels': self.targets['input_ids'][idx]
        }

# Prepare sarcastic texts and translations for training
sarcastic_texts = df[df['is_sarcastic'] == 1]['text']
translations = df[df['is_sarcastic'] == 1]['translation']

# Train-test split
X_sarcastic_train, X_sarcastic_test, y_translation_train, y_translation_test = train_test_split(
    sarcastic_texts, translations, test_size=0.2, random_state=42
)

# Initialize train and test datasets for sarcasm interpretation
train_interpretation_dataset = SarcasmInterpretationDataset(X_sarcastic_train, y_translation_train)
test_interpretation_dataset = SarcasmInterpretationDataset(X_sarcastic_test, y_translation_test)

# Training arguments for interpretation model
interpretation_training_args = TrainingArguments(
    output_dir='./sarcasm_interpreter_results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    report_to="none",
    weight_decay=0.01,
    learning_rate=2e-5,
)

# Trainer for the interpretation model
interpretation_trainer = Trainer(
    model=interpretation_model,
    args=interpretation_training_args,
    train_dataset=train_interpretation_dataset,
    eval_dataset=test_interpretation_dataset
)

# Train the sarcasm interpretation model
interpretation_trainer.train()

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Epoch,Training Loss,Validation Loss
1,No log,0.600275
2,1.899800,0.462893
3,1.899800,0.387038
4,0.358800,0.373107
5,0.303800,0.366396
6,0.303800,0.362012
7,0.292500,0.359108
8,0.286000,0.357153
9,0.286000,0.356039
10,0.282200,0.35567


TrainOutput(global_step=3180, training_loss=0.5542079913541206, metrics={'train_runtime': 603.7921, 'train_samples_per_second': 42.134, 'train_steps_per_second': 5.267, 'total_flos': 860773857361920.0, 'train_loss': 0.5542079913541206, 'epoch': 10.0})

In [19]:
model_path = '/content/drive/MyDrive/SarcasmNLP/interpretation'

# Save the model CBOW2 for hotel reviews dataset
interpretation_trainer.save_model(model_path)

print(f'Model saved to {model_path}')

Model saved to /content/drive/MyDrive/SarcasmNLP/interpretation


In [23]:
def classify_sarcasm(text):
    inputs = tokenizer(text, return_tensors="pt").to(device)  # Move inputs to the same device as the model
    outputs = classifier_model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return prediction == 1  # Return True if sarcastic

def generate_interpretation(sarcastic_text):
    inputs = interpretation_tokenizer(sarcastic_text, return_tensors="pt").to(device)
    outputs = interpretation_model.generate(inputs['input_ids'], max_length=50, num_beams=5, early_stopping=True)
    return interpretation_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test on a few examples
test_examples = X_test.sample(10)
for text in test_examples:
    is_sarcastic = classify_sarcasm(text)
    print(f"Original: {text} {is_sarcastic}")
    if is_sarcastic:
        interpretation = generate_interpretation(text)
        print(f"Interpretation: {interpretation}\n")
    else:
        print("Not Sarcastic\n")


Original: How amazing! A surprise fire drill right as I sit down to work! True
Interpretation: Fire drills can be frustrating.

Original: simping for guys that don’t even know you exist is a different type of heart break pensive_face  False
Not Sarcastic

Original: Fantastic! More paperwork to fill out for something I didn't even sign up for! True
Interpretation: I don't need more paperwork to fill out for something I didn't sign up for.

Original: How charming! A surprise guest at dinner that I didn't invite. True
Interpretation: I didn't invite a surprise guest at dinner.

Original: if trees would simply stop producing flammable leaves climate change would not be an issue False
Not Sarcastic

Original: first game not starting when it's supposed to? a UCF tradition False
Not Sarcastic

Original: what i love about mine and summers work schedules is that she will finish at 8pm and then i’ll start work at 9pm and then when i get home at 7am she’ll start work at 8am  red_heart  False
Not 

In [26]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Define BLEU scoring function
def compute_bleu(reference, hypothesis):
    reference = [reference.split()]
    hypothesis = hypothesis.split()
    smooth_fn = SmoothingFunction().method1  # Apply smoothing to handle small variations
    return sentence_bleu(reference, hypothesis, smoothing_function=smooth_fn)

# Evaluate BLEU on the test set
bleu_scores = []

# Use enumerate to get the index and value for the test set
for idx, row in enumerate(X_sarcastic_test):  # Change here
    generated_interpretation = generate_interpretation(row)
    reference = y_translation_test.iloc[idx]  # idx now refers to the index in y_translation_test
    bleu_score = compute_bleu(reference, generated_interpretation)
    bleu_scores.append(bleu_score)

average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score: {average_bleu}")

Average BLEU Score: 0.059792736497491415


In [32]:
from comet import download_model, load_from_checkpoint

# Load COMET model
comet_model_path = download_model("wmt20-comet-da")
comet_model = load_from_checkpoint(comet_model_path)

# Prepare data for COMET evaluation
comet_data = []

for idx, row in enumerate(X_sarcastic_test):
    generated_interpretation = generate_interpretation(row)
    reference = y_translation_test.iloc[idx]
    comet_data.append({"src": row, "mt": generated_interpretation, "ref": reference})

# Compute COMET scores
comet_scores = comet_model.predict(comet_data, batch_size=8, gpus=1)

# Access the actual scores from the 'scores' key within the Prediction object
actual_scores = comet_scores['scores']

# Calculate the average COMET score using the actual scores
average_comet = sum(actual_scores) / len(actual_scores)
print(f"Average COMET Score: {average_comet}")

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../root/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt`
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 80/80 [00:11<00:00,  7.12it/s]


Average COMET Score: -0.5109139076644942
