# CS: Natural Language Processing

## Hands-on Workshop - Second Session

### Fake News Detection
["Fake News Detection is a *Natural Language Processing* task that involves identifying and classifying news articles or other types of text as Real or Fake. The goal of Fake News Detection is to develop algorithms that can automatically identify and flag fake news articles, which can be used to combat misinformation and promote the dissemination of accurate information."](https://paperswithcode.com/task/fake-news-detection)
<br><br/>
This part of the notebook goes through the following topics, in order:
- [Load & Prepare Data, Setup Datasets, and Create DataLoaders](#Load-&-Prepare-Data,-Setup-Datasets,-and-Create-DataLoaders)

- [Config the Model and Optimizer](#Config-the-Model-and-Optimizer)

- [Trainer](#Trainer)

- [Fit](#Fit)

- [TensorBoard Logs](#TensorBoard-Logs)

---

In [1]:
import pandas as pd
import torch, evaluate, time, os

from lightning import LightningDataModule, LightningModule, Trainer, seed_everything
from datasets import DatasetDict, Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from lightning.pytorch.callbacks import LearningRateFinder, EarlyStopping, ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# @title Hyperparameters
# Seed passed to `seed_everything` and used as the shuffle `random_state` of the data module.
SEED = 42   # @param {type:"integer"}

# Name of the pretrained BERT checkpoint loaded by both the tokenizer and the classifier.
CASING = "bert-base-uncased"    # @param ["bert-base-uncased", "bert-large-uncased"]

# Every text is padded/truncated to this many tokens by the tokenizer.
MAX_LENGTH = 128    # @param {type:"slider", min:64, max:256, step:64}

# Upper bound on training epochs (`Trainer(max_epochs=...)`).
EPOCHS = 7  # @param {type:"slider", min:1, max:7, step:1}

# Mini-batch size of the train/validation DataLoaders, and the number of output classes (0 = fake, 1 = true).
BATCH_SIZE = 16
NUM_LABELS = 2

#### Load & Prepare Data, Setup Datasets, and Create DataLoaders

In [3]:
class DataModule(LightningDataModule):
    """Data module for the fake/true news corpus.

    Reads ``fake.csv`` and ``true.csv`` from ``data_dir``, labels their rows 0 and 1 respectively, shuffles them,
    splits them into a "train" and a "validate" partition and tokenizes the ``text`` column with a BERT tokenizer.

    Args:
        data_dir: Directory that contains ``fake.csv`` and ``true.csv`` (a trailing separator is optional).
        random_state: Seed used when shuffling the concatenated rows.
        tr_ratio: Fraction of the shuffled rows that goes to the "train" split; the rest goes to "validate".
        model_name_or_path: Pretrained checkpoint whose tokenizer is loaded.
        max_length: Token length every text is padded/truncated to.
        batch_size: Batch size of both DataLoaders.
    """
    def __init__(self, data_dir:str="../../data/", random_state:int=SEED, tr_ratio:float=0.75, model_name_or_path:str=CASING,
                 max_length:int=MAX_LENGTH, batch_size:int=BATCH_SIZE):
        super().__init__()
        self.data_dir = data_dir
        self.random_state = random_state
        self.tr_ratio = tr_ratio
        self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=True)
        self.max_length = max_length
        self.batch_size = batch_size
        # Filled by `prepare_data`; `None` lets `setup` detect that loading has not happened yet.
        self.__text, self.__labels = None, None


    def prepare_data(self):
        """Load both CSV files, attach the labels and shuffle the rows.

        Raises:
            FileNotFoundError: If either CSV file is missing (raised by ``pd.read_csv``).
        """
        # os.path.join works whether or not `data_dir` ends with a path separator.
        fake_df = pd.read_csv(os.path.join(self.data_dir, "fake.csv"))
        true_df = pd.read_csv(os.path.join(self.data_dir, "true.csv"))
        df = pd.concat([fake_df, true_df])
        # One label per ROW: `len(df)` is the row count, whereas `DataFrame.size` is rows x columns and only
        # coincides with it for single-column files.
        df["labels"] = len(fake_df)*[0] + len(true_df)*[1]
        df = df.sample(frac=1, replace=False, random_state=self.random_state)
        self.__text, self.__labels = df.text.to_list(), df.labels.to_list()


    def __text_encoder(self, batch):
        """Tokenize a batch of texts, padding/truncating each one to ``max_length``.

        Token type ids are not requested; the attention mask is.
        """
        encoded_batch = self.tokenizer(text=batch["text"], padding="max_length", truncation=True, max_length=self.max_length,
                                       return_token_type_ids=False, return_attention_mask=True)
        return encoded_batch


    def setup(self, stage:str="validate"):
        """Build the tokenized "train" and "validate" datasets from the shuffled rows.

        Args:
            stage: Accepted for hook compatibility; the same two splits are built regardless of its value.
        """
        # NOTE(review): Lightning advises against keeping state from `prepare_data` because that hook may not
        # run in every process; loading here on demand keeps `setup` usable on its own as well.
        if self.__text is None:
            self.prepare_data()

        n_tr_samples, n_samples = int(self.tr_ratio*len(self.__text)), len(self.__text)
        self.dataset = DatasetDict()
        f_idx = 0
        # Consecutive, non-overlapping slices: [0, n_tr_samples) -> train, [n_tr_samples, n_samples) -> validate.
        for split, l_idx in zip(("train","validate"), (n_tr_samples,n_samples)):
            self.dataset[split] = Dataset.from_dict({"text":self.__text[f_idx:l_idx], "labels":self.__labels[f_idx:l_idx]})
            self.dataset[split] = self.dataset[split].map(function=self.__text_encoder, batched=True, batch_size=100,
                                                          drop_last_batch=False, remove_columns=["text"])
            self.dataset[split].set_format(type="torch", columns=self.dataset[split].column_names)
            f_idx = l_idx


    def train_dataloader(self):
        """Return the shuffled DataLoader over the "train" split."""
        return DataLoader(self.dataset["train"], batch_size=self.batch_size, shuffle=True, drop_last=False)


    def val_dataloader(self):
        """Return the in-order DataLoader over the "validate" split."""
        return DataLoader(self.dataset["validate"], batch_size=self.batch_size, shuffle=False, drop_last=False)

#### Config the Model and Optimizer

In [4]:
class TransformerModule(LightningModule):
    """BERT sequence classifier with partially frozen weights, per-epoch loss/accuracy/F1 logging.

    Args:
        model_name_or_path: Pretrained checkpoint to load the classifier from.
        num_labels: Number of output classes.
        learning_rate: Learning rate handed to Adam (stored in ``self.hparams``).
        adam_epsilon: ``eps`` handed to Adam.
        warmup_ratio: Fraction of all training steps used for the linear warm-up.
    """
    def __init__(self, model_name_or_path:str=CASING, num_labels:int=NUM_LABELS, learning_rate:float=2e-5, adam_epsilon:float=1e-8,
                 warmup_ratio:float=0.05):
        super().__init__()
        self.save_hyperparameters(ignore=["model_name_or_path","num_labels"], logger=False)
        self.model = BertForSequenceClassification.from_pretrained(model_name_or_path, num_labels=num_labels)
        # Freeze every "embeddings" parameter and every "encoder" parameter whose 4th dotted name component is
        # below 4 (assumes names shaped like `bert.encoder.layer.<idx>. ...` — TODO confirm for other checkpoints).
        for name, prm in self.model.named_parameters():
            if ("embeddings" in name) or ("encoder" in name and int(name.split('.')[3])<4):
                prm.requires_grad = False
        # Summed (not averaged) batch loss, so that epoch totals can be divided by the exact sample count.
        self.loss_criterion = torch.nn.CrossEntropyLoss(reduction="sum")
        self.metric = evaluate.combine(["accuracy","f1"])
        self.__training_step_outputs, self.__validation_step_outputs = [], []


    def configure_optimizers(self):
        """Create Adam over the trainable parameters plus a per-step linear warm-up/decay schedule."""
        num_training_steps = self.trainer.max_epochs * len(self.trainer.datamodule.train_dataloader())
        # Only parameters that were left trainable in __init__ are handed to the optimizer.
        optimizer = torch.optim.Adam(params=filter(lambda prm:prm.requires_grad, self.model.parameters()),
                                     lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(self.hparams.warmup_ratio*num_training_steps),
                                                    num_training_steps=num_training_steps, last_epoch=-1)
        return [optimizer], [dict(scheduler=scheduler, interval="step", frequency=1)]


    def __run_batch(self, batch):
        """Run the model on one batch; return the differentiable loss and a record for epoch-level statistics."""
        targets = batch["labels"]
        # Call the module (not `.forward`) so that any registered hooks are honoured.
        outputs = self.model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], return_dict=True)
        loss = self.loss_criterion(outputs.logits, targets)
        # The buffered loss is detached: keeping the attached tensor would retain the whole autograd graph
        # of every step until the buffer is released.
        record = {"loss":loss.detach(), "targets":targets, "preds":outputs.logits.argmax(dim=1)}
        return loss, record


    def __summarize_epoch(self, step_outputs):
        """Reduce buffered step records to ``(mean_loss, accuracy, f1)`` and empty the buffer.

        Returns ``None`` when the buffer holds no samples, so callers can skip logging instead of dividing by zero.
        """
        total_loss, n_smpls = 0, 0
        for entry in step_outputs:
            total_loss += entry["loss"].item()
            n_smpls += entry["targets"].numel()
            self.metric.add_batch(predictions=entry["preds"], references=entry["targets"])
        # Without this the records of earlier epochs would leak into every later epoch's statistics.
        step_outputs.clear()
        if n_smpls == 0:
            return None
        measures = self.metric.compute()
        return total_loss/n_smpls, measures["accuracy"], measures["f1"]


    def on_train_epoch_start(self):
        """Discard records buffered outside a regular epoch (presumably e.g. by learning-rate-finder steps — TODO confirm)."""
        self.__training_step_outputs.clear()


    def training_step(self, batch):
        """Compute the summed cross-entropy loss of one training batch and buffer its record."""
        loss, record = self.__run_batch(batch)
        self.__training_step_outputs.append(record)
        return loss


    def on_train_epoch_end(self):
        """Log the per-sample training loss, accuracy and F1 of the epoch that just finished."""
        summary = self.__summarize_epoch(self.__training_step_outputs)
        if summary is None:
            return
        tr_loss, accuracy, f1 = summary
        self.log_dict(dict(tr_loss=tr_loss, tr_accuracy=accuracy, tr_f1=f1),
                      on_epoch=True, prog_bar=False, logger=True)


    def on_validation_epoch_start(self):
        """Start every validation run with an empty record buffer."""
        self.__validation_step_outputs.clear()


    def validation_step(self, batch, batch_idx):
        """Evaluate one validation batch and buffer its record (nothing is returned)."""
        _, record = self.__run_batch(batch)
        self.__validation_step_outputs.append(record)


    def on_validation_epoch_end(self):
        """Log validation loss/accuracy/F1 and the "monitor" value watched by the callbacks."""
        summary = self.__summarize_epoch(self.__validation_step_outputs)
        if summary is None:
            return
        val_loss, accuracy, f1 = summary
        # "monitor" is kept out of the logger; it is the key EarlyStopping/ModelCheckpoint are configured to watch.
        self.log(name="monitor", value=val_loss, on_epoch=True, prog_bar=False, logger=False)
        self.log_dict(dict(val_loss=val_loss, val_accuracy=accuracy, val_f1=f1),
                      on_epoch=True, prog_bar=False, logger=True)

#### Trainer

In [5]:
# Seed every RNG (including DataLoader workers) before anything stochastic is created.
seed_everything(SEED, workers=True)

# Every run is stored under ../../logs/<LOGS_DIR_NAME>/<timestamp>/ so that runs never overwrite each other.
LOGS_DIR_NAME = "Lightning"
version = time.strftime("%y-%m-%d_%H-%M-%S")
ckpt_dir = os.path.join("../../logs/", LOGS_DIR_NAME, version, "checkpoints")

# Callbacks: learning-rate search, early stopping and best-checkpoint saving.
# The last two watch the "monitor" value logged at the end of each validation epoch.
lrFinder_callback = LearningRateFinder(
    min_lr=1e-6,
    max_lr=1e-2,
    num_training_steps=99,
    mode="exponential",
)
early_stop_callback = EarlyStopping(
    monitor="monitor",
    mode="min",
    min_delta=1e-3,
    patience=2,
)
checkpoint_callback = ModelCheckpoint(
    dirpath=ckpt_dir,
    filename=f"[{CASING}] "+"{epoch:02d} {monitor:.3f}",
    monitor="monitor",
    mode="min",
    save_top_k=1,
)

trainer = Trainer(
    # --- hardware ---
    accelerator="gpu",
    devices=1,
    # --- schedule ---
    max_epochs=EPOCHS,
    accumulate_grad_batches=1,
    val_check_interval=None,
    check_val_every_n_epoch=1,
    num_sanity_val_steps=0,
    # --- reproducibility ---
    deterministic="warn",
    # --- callbacks & logging ---
    callbacks=[lrFinder_callback, early_stop_callback, checkpoint_callback],
    logger=TensorBoardLogger(save_dir="../../logs/", name=LOGS_DIR_NAME, version=version, default_hp_metric=False),
    # --- console output ---
    enable_progress_bar=True,
    enable_model_summary=False,
    # Debugging aids, disabled by default:
    # fast_dev_run=2, detect_anomaly=True, limit_train_batches=0.99, precision=32,
)

Global seed set to 42
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


#### Fit

In [6]:
# Train with a freshly constructed model and data module, both using the hyperparameter defaults defined above.
trainer.fit(model=TransformerModule(), datamodule=DataModule())


# Optional: run `fit` again starting from the best checkpoint recorded by `checkpoint_callback`.
# ckpt_path = checkpoint_callback.best_model_path # os.path.join(ckpt_dir, f"[{CASING}] "+"epoch=* monitor=*.ckpt")

# trainer.fit(model=TransformerModule(), datamodule=DataModule(), ckpt_path=ckpt_path)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 6: 100%|██████████| 186/186 [00:19<00:00,  9.59it/s, v_num=7-40]

`Trainer.fit` stopped: `max_epochs=7` reached.


Epoch 6: 100%|██████████| 186/186 [00:22<00:00,  8.32it/s, v_num=7-40]


In [7]:
# Show the learning rate stored in the fitted module's hyperparameters
# (presumably the value written there by the LearningRateFinder callback — verify against the callback's docs).
print(f"The lrFinder_callback set the lr as: {trainer.model.hparams.learning_rate:9.3e}")

The lrFinder_callback set the lr as: 7.925e-05


#### TensorBoard Logs

In [None]:
# Load the TensorBoard notebook extension; switch to the %reload_ext line if it is already loaded in this kernel.
# %reload_ext tensorboard
%load_ext tensorboard
# Uncomment to open the dashboard on the directory the trainer's TensorBoardLogger writes to.
# %tensorboard --logdir ../../logs