# Libraries

In [1]:
# Libraries

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
#from pandas_profiling import ProfileReport


In [2]:
import torch
#from transformers.file_utils import is_tf_available, is_torch_available, is_torch_tpu_available
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

# Data

In [3]:
# Data 
path = '/Users/lucialarraona/Desktop/projects_sem2/dtudeeplearning22/Files/Sarcasm_Headlines_Dataset_v2.json'
df = pd.read_json(path, lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


### Split into train and test

In [4]:
def train_test_split(df, frac=0.2):
    # Get random sample of the dataframe
    test = df.sample(frac=frac, axis=0, random_state= 42) # random state to ensure reproductibility of results
    # Get everything but the test sample
    train = df.drop(index=test.index)
    return train, test

df_train, df_test = train_test_split(df,frac=0.3)
print(f'The training set has a shape of {df_train.shape}')
print(f'The testing set has a shape of {df_test.shape}')

The training set has a shape of (20033, 3)
The testing set has a shape of (8586, 3)


In [5]:
X_train = df_train['headline']
X_test = df_test['headline']

y_train = df_train['is_sarcastic']
y_test = df_test['is_sarcastic']

In [6]:
target_names = list(df['is_sarcastic'].unique())
target_names

[1, 0]

# Transformers
- Distilled-Bert (a smaller version of Bert)

####  Define model and max_lenght

In [7]:
# Define model-name (based on hugging-face library)
model_name = "bert-base-uncased"
# max sequence length for each document/sentence sample (headlines are much shorter, we'll change it)
max_length = 512 

#### Preprocessing

In [8]:
# Hugging Face has its own tokenizer for the transformer: Load the tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# tokenize the dataset, truncate when passed `max_length`, and pad with 0's when less than `max_length`
train_encodings = X_train.apply(lambda x: tokenizer(x, truncation=True,padding=True, max_length=max_length))
test_encodings = X_test.apply(lambda x: tokenizer(x, truncation=True,padding=True, max_length=max_length))

In [10]:
# Create a new dataset with the tokenized input(headlines) and the labels
class NewsHeadlinesDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor([self.labels[idx]])
        return item

    def __len__(self):
        return len(self.labels)

# convert our tokenized data into a torch Dataset
train_dataset = NewsHeadlinesDataset(train_encodings, y_train)
test_dataset = NewsHeadlinesDataset(test_encodings, y_test)

### Train

In [11]:
# Training with Trainer function from HuggingFace

# Load the model and pass to CUDA
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

: 

: 

In [None]:
# Define metrics for evaluating the classification model and pass it to the Trainer object
def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  # calculate accuracy using sklearn's function
  acc = accuracy_score(labels, preds)
  return {
      'accuracy': acc,
  }

In [None]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=20,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    load_best_model_at_end=True,     # load the best model when finished training (default metric is loss)
    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
    logging_steps=400,               # log & save weights each logging_steps
    save_steps=400,
    evaluation_strategy="steps",     # evaluate each `logging_steps`
)

trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,          # evaluation dataset
    compute_metrics=compute_metrics,     # the callback that computes metrics of interest
)

trainer.train()

#### Evaluation

In [None]:
# Evaluate the model after training
trainer.evaluate()

In [None]:
def get_prediction(text):
    # prepare our text into tokenized sequence
    inputs = tokenizer(text, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to("cuda")
    # perform inference to our model
    outputs = model(**inputs)
    # get output probabilities by doing softmax
    probs = outputs[0].softmax(1)
    # executing argmax function to get the candidate label
    return target_names[probs.argmax()]

    

In [None]:
# Save the model and tokenizer
model_path = "Files/saved_models/model_name"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)