In [345]:
import pandas as pd
import json
import os

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding
from transformers.modeling_outputs import TokenClassifierOutput

In [208]:
# Directory and file name of the sarcasm-headlines dataset (one JSON object per line).
data_path = "../data/archive/"
data = "Sarcasm_Headlines_Dataset_v2.json"

# lines=True makes pandas parse the file as JSON Lines rather than a single JSON document.
source_file = os.path.join(data_path, data)
df = pd.read_json(source_file, lines=True)

In [209]:
# Show the loaded frame to inspect its columns (is_sarcastic, headline, article_link).
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [210]:
# Seed passed to both train_test_split calls below so the splits are reproducible.
SEED = 25

In [211]:
# Load the same JSON file through Hugging Face `datasets` and discard the URL column,
# which is not used as a feature.
raw_splits = load_dataset(path=data_path, data_files=data).remove_columns(['article_link'])

# Switch the output format to pandas so the whole 'train' split can be pulled out as a
# DataFrame and de-duplicated on the headline text.
raw_splits.set_format('pandas')
headlines_df = (
    raw_splits['train'][:]
    .drop_duplicates(subset=['headline'])
    .reset_index(drop=True)
)

# Back to a Hugging Face Dataset for splitting and tokenization.
dataset_HF = Dataset.from_pandas(headlines_df)

In [212]:
# Inspect the de-duplicated dataset (features and row count).
dataset_HF

Dataset({
    features: ['is_sarcastic', 'headline'],
    num_rows: 28503
})

In [213]:
# Hold out 20% of the de-duplicated headlines; the remaining 80% is the training split.
train_test_split = dataset_HF.train_test_split(test_size=0.2, seed=SEED)

In [214]:
# Inspect the sizes of the 80/20 split.
train_test_split

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 5701
    })
})

In [215]:
# Split the 20% hold-out in half: its 'train' part is used as the validation set and
# its 'test' part as the final test set when the DatasetDict is assembled below.
valid_test_split = train_test_split['test'].train_test_split(test_size=0.5, seed=SEED)

In [216]:
# Inspect the sizes of the validation/test halves.
valid_test_split

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 2850
    })
    test: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 2851
    })
})

In [217]:
# Collect the three splits under conventional names. Note that `dataset_HF` is rebound
# here from a single Dataset to a DatasetDict.
dataset_HF = DatasetDict({
    'train': train_test_split['train'],
    # the 'train' half of the hold-out acts as the validation set
    'validation': valid_test_split['train'],
    'test': valid_test_split['test']
})

In [218]:
# Inspect the final train/validation/test split sizes.
dataset_HF

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 22802
    })
    validation: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 2850
    })
    test: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 2851
    })
})

In [264]:
# Hugging Face checkpoint name used for both the tokenizer and the encoder.
CHECKPOINT="distilbert-base-uncased"
# Upper bound on tokens per example; used as the tokenizer's truncation limit.
MAX_SEQUENCE_LENGTH=512
# NOTE(review): presumably the encoder's hidden size; this constant is not referenced
# in any other visible cell -- confirm whether it is still needed.
EMBED_VECTOR_SIZE=768

# Load the tokenizer matching the checkpoint and cap its maximum sequence length.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
tokenizer.model_max_length = MAX_SEQUENCE_LENGTH

In [286]:
def tokenize(batch):
    """Tokenize the ``headline`` entries of a batch.

    Args:
        batch: Mapping with a ``'headline'`` key holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and ``max_length`` set to ``MAX_SEQUENCE_LENGTH``.
    """
    headlines = batch['headline']
    encoding_options = {'truncation': True, 'max_length': MAX_SEQUENCE_LENGTH}
    return tokenizer(headlines, **encoding_options)

# Apply `tokenize` to every split in batches; the tokenizer's outputs
# (input_ids, attention_mask) are added as new columns next to the originals.
tokenized_dataset = dataset_HF.map(tokenize, batched=True)

Map:   0%|          | 0/22802 [00:00<?, ? examples/s]

Map:   0%|          | 0/2850 [00:00<?, ? examples/s]

Map:   0%|          | 0/2851 [00:00<?, ? examples/s]

In [287]:
# Inspect the columns added by tokenization.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    validation: Dataset({
        features: ['is_sarcastic', 'headline', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
    test: Dataset({
        features: ['is_sarcastic', 'headline', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
})

In [288]:
# Return torch tensors, and only for the listed columns; the raw 'headline' text is
# left out of the formatted output so batches contain just model inputs and the label.
tokenized_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'is_sarcastic'])

In [290]:
# Collate function for the DataLoaders: pads the examples of each batch using the tokenizer.
dataCollator = DataCollatorWithPadding(tokenizer=tokenizer)

In [301]:
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 128

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    tokenized_dataset['train'],
    shuffle=True,
    batch_size=BATCH_SIZE,
    collate_fn=dataCollator,
)

# Validation: batch the examples as well (DataLoader's default batch_size is 1, which
# meant one forward pass per example) and keep a fixed order, since shuffling has no
# effect on the aggregated metric.
eval_dataloader = DataLoader(
    tokenized_dataset['validation'],
    shuffle=False,
    batch_size=BATCH_SIZE,
    collate_fn=dataCollator,
)

In [346]:
class Customized_Network(nn.Module):
    """Pretrained transformer encoder with a dropout + linear classification head.

    The representation of the first token of each sequence is fed through dropout
    and a single linear layer to produce ``num_labels`` logits.

    Args:
        checkpoint (str): Name or path of the pretrained checkpoint to load.
        num_labels (int): Number of output classes.
    """

    def __init__(self, checkpoint, num_labels):
        super(Customized_Network, self).__init__()
        self.num_labels = num_labels

        # The config flags are spelled `output_attentions` / `output_hidden_states`
        # (plural). With the singular spellings the encoder never returned either,
        # so the corresponding output fields were always None.
        config = AutoConfig.from_pretrained(
            checkpoint,
            output_attentions=True,
            output_hidden_states=True,
        )
        self.model = AutoModel.from_pretrained(checkpoint, config=config)

        # New layers on top of the encoder. The input width is read from the encoder
        # config instead of hard-coding 768, so other checkpoints work as well.
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, is_sarcastic=None):
        """Run the encoder and the classification head.

        Args:
            input_ids (torch.Tensor, optional): Token ids, passed to the encoder.
            attention_mask (torch.Tensor, optional): Attention mask, passed to the encoder.
            is_sarcastic (torch.Tensor, optional): Class labels. When given, a
                cross-entropy loss is computed; when omitted, ``loss`` is ``None``.

        Returns:
            TokenClassifierOutput: Always returned (also when no labels are given), with
            - loss: cross-entropy loss, or ``None`` without labels.
            - logits: classification scores before softmax, one row per sequence.
            - hidden_states / attentions: forwarded unchanged from the encoder output.

            The output container is kept as ``TokenClassifierOutput`` for backward
            compatibility even though the logits are per sequence, not per token.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # outputs[0] is the last hidden state; position 0 along the sequence axis is
        # the first token, whose vector summarises the sequence for classification.
        first_token_state = outputs[0][:, 0, :]
        logits = self.classifier(self.dropout(first_token_state))

        loss = None
        if is_sarcastic is not None:
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(logits.view(-1, self.num_labels), is_sarcastic.view(-1))

        # Returned unconditionally so that label-free inference gets logits too.
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        

In [347]:
# Build the classifier (2 classes: sarcastic / not sarcastic) and move it to DEVICE.
# The lowercase name `device` is not defined in any visible cell; DEVICE is the
# constant created alongside the dataloaders.
model = Customized_Network(checkpoint=CHECKPOINT, num_labels=2).to(DEVICE)

In [348]:
from transformers import get_scheduler

# torch.optim.AdamW replaces transformers.AdamW, which is deprecated and no longer
# shipped by recent transformers releases. eps and weight_decay are pinned to the
# defaults transformers.AdamW used, so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epoch = 3
# One scheduler step is taken per optimizer step, i.e. per training batch.
num_training_steps = num_epoch * len(train_dataloader)
num_warmup_steps = 0 * num_training_steps  # You can adjust the warm-up proportion as needed

# Create the scheduler: linear decay of the learning rate over all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)



In [349]:
# F1 metric accumulated batch by batch during validation and testing.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of
# the separate `evaluate` package -- confirm against the installed version.
from datasets import load_metric
metric = load_metric("f1")

In [350]:
from tqdm.auto import tqdm

# One progress bar per phase, each sized for all epochs.
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))


for epoch in range(num_epoch):
    # ---- training pass ----
    model.train()
    for batch in train_dataloader:
        # Move every tensor of the batch to DEVICE. The lowercase `device` used
        # before is not defined in any visible cell and fails on a fresh kernel.
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        progress_bar_train.update(1)

    # ---- validation pass ----
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch['is_sarcastic'])
        progress_bar_eval.update(1)

    # Validation F1 for this epoch, computed over the batches added above.
    print(metric.compute())

  0%|          | 0/6843 [00:00<?, ?it/s]

  0%|          | 0/8550 [00:00<?, ?it/s]

{'f1': 0.8979433449747768}
{'f1': 0.9313653136531366}
{'f1': 0.9307295504789979}


In [359]:
# Final evaluation on the held-out test split.
model.eval()

test_dataloader = DataLoader(
    tokenized_dataset['test'], batch_size=32, collate_fn=dataCollator
)


for batch in test_dataloader:
    # Use DEVICE, the constant defined with the dataloaders; the lowercase `device`
    # is not defined in any visible cell.
    batch = {k: v.to(DEVICE) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['is_sarcastic'])

# Test-set F1 over all batches added above.
metric.compute()


{'f1': 0.9263000374111485}

In [360]:
# Persist only the weights (state_dict). Loading them back requires constructing a
# Customized_Network first and then calling load_state_dict on it.
torch.save(model.state_dict(), 'model.pth')

In [361]:
# Write the tokenizer files into the local 'tokenizer' directory so inference can
# reload the tokenizer together with the saved weights.
tokenizer.save_pretrained('tokenizer')

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')