In [1]:
from collections import Counter
import os
from pathlib import Path

from sklearnex import patch_sklearn

patch_sklearn()

from huggingface_hub import notebook_login
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import seaborn as sns
from tqdm.notebook import tqdm
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification


from jjuoda_dl4 import utils
from jjuoda_dl4.utils import BASE_DATA_DIR, BASE_MODEL_DIR

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
nela_gt_2018_articles_df = pd.read_csv(
    BASE_DATA_DIR / "interim/nela-gt-2018-articles.csv", index_col=0
)
nela_gt_2018_scores_df = pd.read_csv(
    BASE_DATA_DIR / "interim/nela-gt-2018-scores.csv", index_col=0
)

In [3]:
nela_gt_2018_articles_df = utils.split_dataframe(
    nela_gt_2018_articles_df, nela_gt_2018_scores_df
)

We have:
33 reliable sources in train
13 unreliable sources in train
For a total of 133215+55210=188425 articles in train
3 reliable sources in val
2 unreliable sources in val
For a total of 6091+5937=12028 articles in val



In [4]:
nela_gt_2018_articles_df = utils._make_dataframe(nela_gt_2018_articles_df)

  0%|          | 0/697538 [00:00<?, ?it/s]

### AutoNLP from HuggingFace

#### DataModules

In [4]:
# I tried using hugging face api, my machine runs out of ram, so we'll use pytorch lightning


class AutoNLPNELADataset(Dataset):
    """Recreates processeing I did for AutoNLP on nela gt 2018 data."""

    def __init__(self, articles_df, tokenizer, root_dir=BASE_DATA_DIR):
        self.articles_df = articles_df
        self.root_dir = root_dir
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.articles_df)

    def __getitem__(self, index):
        article = self.articles_df.iloc[index]
        with open(self.root_dir / article.path, "r") as f:
            text = f.read().replace("\n", " ")
        text = "<TITLE> " + article.title + " </TITLE> " + text
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            return_tensors="pt",
            padding="max_length",
            truncation="longest_first",
        )
        return {
            "model_inputs": inputs,
            "source_score": article.source_score,
            "is_fake": article.source_score < 0,
        }


class AutoNLPNelaDataModule(pl.LightningDataModule):
    def __init__(
        self,
        articles_df,
        tokenizer,
        root_dir=BASE_DATA_DIR,
        batch_size=32,
    ):
        super().__init__()
        self.articles_df = articles_df
        self.root_dir = root_dir
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def prepare_data(self):
        self.train_dataset = AutoNLPNELADataset(
            self.articles_df[self.articles_df.split == "train"],
            self.tokenizer,
            self.root_dir,
        )
        self.val_dataset = AutoNLPNELADataset(
            self.articles_df[self.articles_df.split == "val"],
            self.tokenizer,
            self.root_dir,
        )
        self.test_dataset = AutoNLPNELADataset(
            self.articles_df[self.articles_df.split == "test"],
            self.tokenizer,
            self.root_dir,
        )
        self.pred_dataset = AutoNLPNELADataset(
            self.articles_df[self.articles_df.split == "pred"],
            self.tokenizer,
            self.root_dir,
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4
        )

    def pred_dataloader(self):
        return DataLoader(
            self.pred_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4
        )

I just hope to get quick results that I can look at and refine.

In [5]:
notebook_login()

VBox(children=(HTML(value='<center>\n<img src=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
model_names = [
    "mutusfa/autonlp-Fake_News_Analysis-528914957",
    "mutusfa/autonlp-Fake_News_Analysis-528914958",
    "mutusfa/autonlp-Fake_News_Analysis-528914959",
    "mutusfa/autonlp-Fake_News_Analysis-528914960",
]

In [7]:
y = np.random.randn(1000)
x = np.random.randint(0, 10, size=(1000,))

In [8]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def plot_predictions(articles_df, model_name):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, use_auth_token=True
    )
    model.to(device)

    data_module = AutoNLPNelaDataModule(articles_df, tokenizer, batch_size=64)
    data_module.prepare_data()
    dataloaders = [data_module.pred_dataloader()]

    model.eval()
    preds = []
    source_scores = []
    with torch.no_grad():
        for dataloader in dataloaders:
            for batch in dataloader:
                # I'm not sure why do I get 32 * 1 * 512 tensors
                for k in batch["model_inputs"]:
                    batch["model_inputs"][k] = (
                        batch["model_inputs"][k].squeeze(1).to(device)
                    )
                preds.extend(
                    sigmoid(model(**batch["model_inputs"]).logits[:, 1].cpu().numpy())
                )
                source_scores.extend(batch["source_score"].numpy())

    plt.figure()
    ax = sns.regplot(
        x=source_scores,
        y=preds,
        x_jitter=0.1,
        line_kws={"color": "#859900"},
        scatter_kws={"alpha": 0.5},
    )
    plt.suptitle(model_name)
    plt.ylabel("Predicted probability of being fake")
    plt.xlabel("Source score")
    plt.savefig(
        BASE_DATA_DIR / f"processed/{model_name.replace('/', '_')}_reg_predictions.png"
    )

    return preds, source_scores


# for model_name in model_names:
#     preds, scores = plot_predictions(
#         nela_gt_2018_articles_df[nela_gt_2018_articles_df.split == "pred"].sample(
#             n=1000, random_state=42
#         ),
#         model_name,
#     )

These models achieved much better auc than knn models (all above .95 versus .86 I had with knn) on their validation set. The only problem - training set was quite imbalanced and I have changed both training set and validation set, so they are not directly comparable.

Looking at the data both models didn't see, we see interesting differences.
1. KNNs predict most news as having high probability of being fake news, while AutoNLP models predict most news as having low probability of being fake news.
1. It also seems that AutoNLP models generalize pretty badly - some can't distinguish between sources with -1 score and sources with 1 score. I guess I have to check if I can distinguish that, but knns seem to be able to do that.

Let's try tuning a few models and see if I can get better results.

### My work

In [9]:
class NELADataset(Dataset):
    """Recreates processeing I did for AutoNLP on nela gt 2018 data."""

    def __init__(self, articles_df, tokenizer, root_dir=BASE_DATA_DIR):
        self.articles_df = articles_df
        self.root_dir = root_dir
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.articles_df)

    def __getitem__(self, index):
        article = self.articles_df.iloc[index]
        if "text" in article.keys():
            text = article["text"]
        else:
            with open(self.root_dir / article.path, "r") as f:
                text = f.read()
        inputs = self.tokenizer(
            article.title,
            text,
            add_special_tokens=True,
            return_tensors="pt",
            padding="max_length",
            truncation="longest_first",
        )
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = torch.tensor([article.source_score < 0])
        return inputs


class NelaDataModule(pl.LightningDataModule):
    def __init__(
        self,
        articles_df,
        tokenizer,
        root_dir=BASE_DATA_DIR,
        batch_size=16,
    ):
        super().__init__()
        self.articles_df = articles_df
        self.root_dir = root_dir
        self.tokenizer = tokenizer
        self.batch_size = batch_size

    def prepare_data(self):
        self.train_dataset = NELADataset(
            self.articles_df[self.articles_df.split == "train"],
            self.tokenizer,
            self.root_dir,
        )
        self.val_dataset = NELADataset(
            self.articles_df[self.articles_df.split == "val"],
            self.tokenizer,
            self.root_dir,
        )
        self.test_dataset = NELADataset(
            self.articles_df[self.articles_df.split == "test"],
            self.tokenizer,
            self.root_dir,
        )
        self.pred_dataset = NELADataset(
            self.articles_df[self.articles_df.split == "pred"],
            self.tokenizer,
            self.root_dir,
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4
        )

    def pred_dataloader(self):
        return DataLoader(
            self.pred_dataset, batch_size=self.batch_size, shuffle=False, num_workers=4
        )

In [10]:
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)
model = transformers.DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased"
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [11]:
nela_gt_2018_data_module = NelaDataModule(nela_gt_2018_articles_df, tokenizer)
nela_gt_2018_data_module.prepare_data()

In [12]:
class NelaModel(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, **model_inputs):
        return self.model(**model_inputs)

    def _shared_step(self, batch, batch_idx):
        loss, inference = self.model(**batch)
        return loss, inference

    def training_step(self, batch, batch_idx):
        loss, inference = self._shared_step(batch, batch_idx)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=3e-5)


nela_model = NelaModel(model)

In [13]:
trainer = pl.Trainer(max_epochs=3, gpus=1)
trainer.fit(nela_model, nela_gt_2018_data_module)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn("You passed in a `val_dataloader` but have no `validation_step`. Skipping val loop.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                                | Params
--------------------------------------------------------------
0 | model | DistilBertForSequenceClassification | 67.0 M
--------------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.820   Total estimated model params size (MB)


Epoch 0:   0%|          | 0/11777 [00:00<?, ?it/s] 

  inputs["labels"] = torch.tensor([article.source_score < 0])
  inputs["labels"] = torch.tensor([article.source_score < 0])
  inputs["labels"] = torch.tensor([article.source_score < 0])
  inputs["labels"] = torch.tensor([article.source_score < 0])


RuntimeError: CUDA out of memory. Tried to allocate 192.00 MiB (GPU 0; 5.80 GiB total capacity; 4.38 GiB already allocated; 192.81 MiB free; 4.41 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF