In [190]:
import pandas as pd
import json
import os

import torch
import torch.nn as nn

from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding

In [49]:
# Directory and file name of the sarcasm-headlines dump (one JSON object per line).
data_path = "../data/archive/"
data = "Sarcasm_Headlines_Dataset_v2.json"

# lines=True makes pandas parse the JSON-lines file directly, so no manual
# json.loads loop over the file is needed.
json_file = os.path.join(data_path, data)
df = pd.read_json(json_file, lines=True)

In [50]:
# Preview the DataFrame parsed from the JSON-lines file.
df

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [95]:
# Seed passed to every train_test_split call so the splits are reproducible.
SEED = 25

In [192]:
# Load the same JSON file as a Hugging Face dataset and drop the URL column,
# which is not used as a feature in the cells that follow.
raw_splits = load_dataset(path=data_path, data_files=data).remove_columns(['article_link'])

# Go through pandas to drop repeated headlines (pandas keeps the first occurrence
# by default), then renumber the rows from 0 before rebuilding the Dataset.
headlines_df = (
    raw_splits['train']
    .to_pandas()
    .drop_duplicates(subset=['headline'])
    .reset_index(drop=True)
)

dataset_HF = Dataset.from_pandas(headlines_df)

In [193]:
# Show the deduplicated Dataset (feature names and row count).
dataset_HF

Dataset({
    features: ['is_sarcastic', 'headline'],
    num_rows: 28503
})

In [194]:
# Hold out 20% of the deduplicated headlines; the fixed seed keeps the split reproducible.
train_test_split = dataset_HF.train_test_split(
    seed=SEED,
    test_size=0.2,
)

In [195]:
# Show the two splits produced by the 80/20 split.
train_test_split

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 5701
    })
})

In [196]:
# Cut the held-out 20% in half; the two halves later become validation and test.
held_out = train_test_split['test']
valid_test_split = held_out.train_test_split(seed=SEED, test_size=0.5)

In [197]:
# Show the two halves of the held-out split.
valid_test_split

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 2850
    })
    test: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 2851
    })
})

In [198]:
# Assemble the final three-way split: the 80% part is the training set, and the two
# halves of the held-out part become validation and test respectively.
train_split = train_test_split['train']
validation_split = valid_test_split['train']
test_split = valid_test_split['test']

dataset_HF = DatasetDict({
    'train': train_split,
    'validation': validation_split,
    'test': test_split,
})

In [199]:
# Show the final train/validation/test DatasetDict.
dataset_HF

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 22802
    })
    validation: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 2850
    })
    test: Dataset({
        features: ['is_sarcastic', 'headline'],
        num_rows: 2851
    })
})

In [200]:
# Pretrained checkpoint whose tokenizer is loaded below.
CHECKPOINT = "distilbert-base-uncased"
# Upper bound on tokens per example; set on the tokenizer and reused for truncation.
MAX_SEQUENCE_LENGTH = 512
# NOTE(review): presumably the hidden size of the checkpoint; not used in the visible cells.
EMBED_VECTOR_SIZE = 768

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# Pin the tokenizer's maximum length to the notebook-level constant.
tokenizer.model_max_length = MAX_SEQUENCE_LENGTH

In [206]:
def tokenize(batch):
    """Tokenize the 'headline' entries of a batch.

    Args:
        batch: Mapping with a 'headline' key holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        truncation enabled and ``max_length`` set to ``MAX_SEQUENCE_LENGTH``.
    """
    headlines = batch['headline']
    encoded = tokenizer(
        headlines,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
    )
    return encoded

# Tokenize every split in batches; the tokenizer outputs are added as new columns.
tokenized_dataset = dataset_HF.map(function=tokenize, batched=True)

Map:   0%|          | 0/22802 [00:00<?, ? examples/s]

Map:   0%|          | 0/2850 [00:00<?, ? examples/s]

Map:   0%|          | 0/2851 [00:00<?, ? examples/s]

In [185]:
# Show the tokenized splits and their feature names.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    validation: Dataset({
        features: ['is_sarcastic', 'headline', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
    test: Dataset({
        features: ['is_sarcastic', 'headline', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
})

In [186]:
# `set_format` changes the DatasetDict in place and returns None, so assigning its
# result would replace `tokenized_dataset` with None. `with_format` returns a new
# formatted DatasetDict instead, which also keeps this cell safe to re-run.
# Only the listed columns are exposed (as torch tensors); the raw 'headline' strings
# are left out of the formatted output.
tokenized_dataset = tokenized_dataset.with_format(
    'torch',
    columns=['input_ids', 'attention_mask', 'is_sarcastic'],
)

In [191]:
# Collator that pads each batch with the notebook's tokenizer.
# padding=True is the library default, written out here for clarity.
dataCollator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)