In [3]:
# Mount Google Drive so the data and model folders under /content/drive/MyDrive
# (used by the path overrides further down) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install git+https://github.com/mutusfa/Fake-News-Analysis scikit-learn-intelex huggingface_hub pytorch_lightning transformers

Collecting git+https://github.com/mutusfa/Fake-News-Analysis
  Cloning https://github.com/mutusfa/Fake-News-Analysis to /tmp/pip-req-build-bqea779p
  Running command git clone -q https://github.com/mutusfa/Fake-News-Analysis /tmp/pip-req-build-bqea779p
Collecting scikit-learn-intelex
  Downloading scikit_learn_intelex-2021.5.3-py37-none-manylinux1_x86_64.whl (69 kB)
[K     |████████████████████████████████| 69 kB 5.2 MB/s 
[?25hCollecting huggingface_hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.1 MB/s 
[?25hCollecting pytorch_lightning
  Downloading pytorch_lightning-1.5.10-py3-none-any.whl (527 kB)
[K     |████████████████████████████████| 527 kB 71.4 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 57.6 MB/s 
Collecting daal4py==2021.5.3
  Downloading daal4py-2021.5.3-py37-none-manylinux1_x86_64.whl (22.5 MB)
[K  

In [None]:
# Monkey patch for runtime in colab

from pathlib import Path

import jjuoda_dl4.utils

# Point the package's base directories at the mounted Drive folders. This has to run
# before any later `from jjuoda_dl4.utils import BASE_DATA_DIR, BASE_MODEL_DIR`, so
# that those imports bind the patched values.
# The function that reads articles will still default to the old path (presumably
# because its default was bound when the module was first imported), but that is
# acceptable here.
jjuoda_dl4.utils.BASE_DATA_DIR = Path("/content/drive/MyDrive/Fake News Analysis/Data")
jjuoda_dl4.utils.BASE_MODEL_DIR = Path("/content/drive/MyDrive/Fake News Analysis/Models")

In [None]:
from collections import Counter
import os
from pathlib import Path

from sklearnex import patch_sklearn

patch_sklearn()

from huggingface_hub import notebook_login
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import seaborn as sns
from tqdm.notebook import tqdm
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchmetrics.functional import auroc
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification


from jjuoda_dl4 import utils
from jjuoda_dl4.utils import BASE_DATA_DIR, BASE_MODEL_DIR

In [None]:
# Load the interim NELA-GT-2018 tables; in both files the first CSV column becomes
# the DataFrame index.
articles_csv = BASE_DATA_DIR / "interim/nela-gt-2018-articles.csv"
scores_csv = BASE_DATA_DIR / "interim/nela-gt-2018-scores.csv"

nela_gt_2018_articles_df = pd.read_csv(articles_csv, index_col=0)
nela_gt_2018_scores_df = pd.read_csv(scores_csv, index_col=0)

In [None]:
# Pass the articles and scores frames to the project's split helper and keep its
# result as the new articles frame.
# NOTE(review): presumably this is what adds the `split` column (train/val/test/pred)
# that the data modules below filter on — confirm in jjuoda_dl4.utils.
nela_gt_2018_articles_df = utils.split_dataframe(
    nela_gt_2018_articles_df, nela_gt_2018_scores_df
)

In [None]:
# NOTE(review): `_make_dataframe` is a private helper of jjuoda_dl4.utils; what it adds
# or changes on the frame is not visible from this notebook — confirm before relying on it.
nela_gt_2018_articles_df = utils._make_dataframe(nela_gt_2018_articles_df)

### AutoNLP from HuggingFace

#### DataModules

In [None]:
# I tried using the Hugging Face API, but my machine ran out of RAM, so we'll use PyTorch Lightning instead.


class AutoNLPNELADataset(Dataset):
    """Recreates the preprocessing I did for AutoNLP on NELA-GT-2018 data.

    Each item reads one article's text from disk, replaces newlines with spaces,
    prepends the title wrapped in ``<TITLE> ... </TITLE>`` and tokenizes the result.

    Args:
        articles_df: DataFrame with one row per article. Rows must expose ``path``
            (file location relative to ``root_dir``), ``title`` and ``source_score``.
        tokenizer: Callable tokenizer applied to the assembled text.
        root_dir: Directory that ``path`` values are resolved against.
    """

    def __init__(self, articles_df, tokenizer, root_dir=BASE_DATA_DIR):
        self.articles_df = articles_df
        # Coerce to Path so a plain string root also works with the `/` join below.
        self.root_dir = Path(root_dir)
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of articles in the frame."""
        return len(self.articles_df)

    def __getitem__(self, index):
        """Return tokenized inputs plus the source score and fake flag for one article.

        Returns:
            dict with keys ``model_inputs`` (tokenizer output), ``source_score`` and
            ``is_fake`` (``source_score < 0``).
        """
        article = self.articles_df.iloc[index]
        # Explicit encoding: the default depends on the platform's locale.
        with open(self.root_dir / article.path, "r", encoding="utf-8") as f:
            text = f.read().replace("\n", " ")
        text = "<TITLE> " + article.title + " </TITLE> " + text
        # return_tensors="pt" is requested per sample, so each tensor presumably keeps
        # a leading dimension of size 1; the consumer squeezes it after batching.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            return_tensors="pt",
            padding="max_length",
            truncation="longest_first",
        )
        return {
            "model_inputs": inputs,
            "source_score": article.source_score,
            "is_fake": article.source_score < 0,
        }


class AutoNLPNelaDataModule(pl.LightningDataModule):
    """Lightning data module serving AutoNLP-style datasets for each split.

    Args:
        articles_df: DataFrame of articles with a ``split`` column whose values
            select the ``train``/``val``/``test``/``pred`` subsets.
        tokenizer: Tokenizer handed to every dataset.
        root_dir: Directory that article paths are resolved against.
        batch_size: Batch size used by every dataloader.
        num_workers: Number of worker processes used by every dataloader.
    """

    def __init__(
        self,
        articles_df,
        tokenizer,
        root_dir=BASE_DATA_DIR,
        batch_size=32,
        num_workers=4,
    ):
        super().__init__()
        self.articles_df = articles_df
        self.root_dir = root_dir
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _split_dataset(self, split):
        """Build the dataset holding only the rows whose ``split`` equals ``split``."""
        return AutoNLPNELADataset(
            self.articles_df[self.articles_df["split"] == split],
            self.tokenizer,
            self.root_dir,
        )

    def _build_datasets(self):
        """(Re)create the four per-split datasets; cheap, only filters the frame."""
        self.train_dataset = self._split_dataset("train")
        self.val_dataset = self._split_dataset("val")
        self.test_dataset = self._split_dataset("test")
        self.pred_dataset = self._split_dataset("pred")

    def prepare_data(self):
        """Build the datasets; kept so existing manual ``prepare_data()`` calls still work."""
        self._build_datasets()

    def setup(self, stage=None):
        """Build the datasets on every process.

        Lightning only runs ``prepare_data`` on a single process, so state that the
        dataloaders need must also be created here.
        """
        self._build_datasets()

    def _loader(self, dataset, shuffle):
        """Wrap ``dataset`` in a DataLoader using the module's batch size and workers."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        return self._loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        return self._loader(self.test_dataset, shuffle=False)

    def predict_dataloader(self):
        """Lightning's hook name for prediction data (used by ``Trainer.predict``)."""
        return self._loader(self.pred_dataset, shuffle=False)

    def pred_dataloader(self):
        """Backward-compatible alias of :meth:`predict_dataloader`."""
        return self.predict_dataloader()

I just hope to get quick results that I can look at and refine.

In [None]:
# Authenticate with the Hugging Face Hub; the model and tokenizer downloads below
# pass use_auth_token=True.
notebook_login()

In [None]:
# Hub repository ids of the four AutoNLP checkpoints to evaluate.
_AUTONLP_CHECKPOINTS = (
    "mutusfa/autonlp-Fake_News_Analysis-528914957",
    "mutusfa/autonlp-Fake_News_Analysis-528914958",
    "mutusfa/autonlp-Fake_News_Analysis-528914959",
    "mutusfa/autonlp-Fake_News_Analysis-528914960",
)
model_names = list(_AUTONLP_CHECKPOINTS)

In [None]:
# Scratch data: 1000 standard-normal values and 1000 integers drawn from [0, 10).
# NOTE(review): neither `x` nor `y` is referenced anywhere else in the visible notebook,
# and no seed is set, so the values differ on every run — confirm whether this cell
# is still needed.
y = np.random.randn(1000)
x = np.random.randint(0, 10, size=(1000,))

In [None]:
def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^-x) element-wise to ``x``."""
    denominator = np.exp(-x) + 1
    return 1 / denominator


def plot_predictions(articles_df, model_name):
    """Score the ``pred`` split with a Hub model and plot predictions against source scores.

    Downloads the tokenizer and sequence-classification model ``model_name``, runs it
    over the ``pred`` rows of ``articles_df``, draws a regression plot of predicted
    probability versus source score, and saves it under ``BASE_DATA_DIR / "processed"``.

    Args:
        articles_df: Articles frame accepted by ``AutoNLPNelaDataModule``.
        model_name: Hugging Face Hub repository id of the model to evaluate.

    Returns:
        Tuple ``(preds, source_scores)`` of equally long lists: the predicted class-1
        probability and the source score of every scored article.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, use_auth_token=True
    )
    model.to(device)
    model.eval()

    data_module = AutoNLPNelaDataModule(articles_df, tokenizer, batch_size=64)
    data_module.prepare_data()

    preds = []
    source_scores = []
    with torch.no_grad():
        for batch in data_module.pred_dataloader():
            # Tensors arrive as (batch, 1, seq_len): the dataset tokenizes every sample
            # with return_tensors="pt", and batching stacks those per-sample tensors.
            model_inputs = {
                key: tensor.squeeze(1).to(device)
                for key, tensor in batch["model_inputs"].items()
            }
            logits = model(**model_inputs).logits
            # The classifier emits one logit per class, so the probability of class 1
            # is the softmax over classes, not the sigmoid of a single logit.
            # NOTE(review): assumes class index 1 is the "fake" label — confirm
            # against the model's id2label mapping.
            fake_probability = torch.softmax(logits, dim=-1)[:, 1]
            preds.extend(fake_probability.cpu().numpy())
            source_scores.extend(batch["source_score"].numpy())

    fig, ax = plt.subplots()
    sns.regplot(
        x=source_scores,
        y=preds,
        x_jitter=0.1,
        line_kws={"color": "#859900"},
        scatter_kws={"alpha": 0.5},
        ax=ax,
    )
    fig.suptitle(model_name)
    ax.set_ylabel("Predicted probability of being fake")
    ax.set_xlabel("Source score")

    output_path = (
        BASE_DATA_DIR / f"processed/{model_name.replace('/', '_')}_reg_predictions.png"
    )
    # Make sure the target folder exists so savefig cannot fail on a fresh data dir.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(output_path)

    return preds, source_scores


# for model_name in model_names:
#     preds, scores = plot_predictions(
#         nela_gt_2018_articles_df[nela_gt_2018_articles_df.split == "pred"].sample(
#             n=1000, random_state=42
#         ),
#         model_name,
#     )

These models achieved a much better AUC than the KNN models (all above .95 versus the .86 I had with KNN) on their validation set. The only problem: the training set was quite imbalanced, and I have since changed both the training set and the validation set, so the results are not directly comparable.

Looking at data that neither kind of model saw during training, we see interesting differences.
1. KNNs predict most news as having a high probability of being fake news, while AutoNLP models predict most news as having a low probability of being fake news.
1. It also seems that the AutoNLP models generalize pretty badly - some can't distinguish between sources with a score of -1 and sources with a score of 1. I guess I have to check whether I can distinguish them myself, but the KNNs seem to be able to.

Let's try tuning a few models and see if I can get better results.

### My work

In [None]:
class NELADataset(Dataset):
    """NELA-GT-2018 articles tokenized as a (title, text) pair with a binary label.

    The article body is taken from the row's ``text`` field when present, otherwise
    it is read from the file at ``root_dir / path``.

    Args:
        articles_df: DataFrame with one row per article. Rows must expose ``title``
            and ``source_score``, plus either ``text`` or ``path``.
        tokenizer: Callable tokenizer applied to the title/text pair.
        root_dir: Directory that ``path`` values are resolved against.
    """

    def __init__(self, articles_df, tokenizer, root_dir=BASE_DATA_DIR):
        self.articles_df = articles_df
        # Coerce to Path so a plain string root also works with the `/` join below.
        self.root_dir = Path(root_dir)
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of articles in the frame."""
        return len(self.articles_df)

    def __getitem__(self, index):
        """Return the tokenizer outputs for one article plus a ``labels`` tensor.

        Returns:
            dict of tensors with the tokenizer's leading batch dimension removed, and
            ``labels``: an integer tensor of shape ``(1,)`` that is 1 when
            ``source_score < 0`` and 0 otherwise.
        """
        article = self.articles_df.iloc[index]
        if "text" in article.index:
            text = article["text"]
        else:
            # Explicit encoding: the default depends on the platform's locale.
            with open(self.root_dir / article.path, "r", encoding="utf-8") as f:
                text = f.read()
        inputs = self.tokenizer(
            article.title,
            text,
            add_special_tokens=True,
            return_tensors="pt",
            padding="max_length",
            truncation="longest_first",
        )
        # Drop the leading dimension added by return_tensors="pt" so that batching
        # does not produce an extra singleton axis.
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}
        inputs["labels"] = torch.tensor([article.source_score < 0], dtype=torch.long)
        return inputs


class NelaDataModule(pl.LightningDataModule):
    """Lightning data module serving ``NELADataset`` instances for each split.

    Args:
        articles_df: DataFrame of articles with a ``split`` column whose values
            select the ``train``/``val``/``test``/``pred`` subsets.
        tokenizer: Tokenizer handed to every dataset.
        root_dir: Directory that article paths are resolved against.
        batch_size: Batch size used by every dataloader.
        num_workers: Number of worker processes used by every dataloader.
    """

    def __init__(
        self,
        articles_df,
        tokenizer,
        root_dir=BASE_DATA_DIR,
        batch_size=16,
        num_workers=4,
    ):
        super().__init__()
        self.articles_df = articles_df
        self.root_dir = root_dir
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _split_dataset(self, split):
        """Build the dataset holding only the rows whose ``split`` equals ``split``."""
        return NELADataset(
            self.articles_df[self.articles_df["split"] == split],
            self.tokenizer,
            self.root_dir,
        )

    def _build_datasets(self):
        """(Re)create the four per-split datasets; cheap, only filters the frame."""
        self.train_dataset = self._split_dataset("train")
        self.val_dataset = self._split_dataset("val")
        self.test_dataset = self._split_dataset("test")
        self.pred_dataset = self._split_dataset("pred")

    def prepare_data(self):
        """Build the datasets; kept so existing manual ``prepare_data()`` calls still work."""
        self._build_datasets()

    def setup(self, stage=None):
        """Build the datasets on every process.

        Lightning only runs ``prepare_data`` on a single process, so state that the
        dataloaders need must also be created here.
        """
        self._build_datasets()

    def _loader(self, dataset, shuffle):
        """Wrap ``dataset`` in a DataLoader using the module's batch size and workers."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=self.num_workers,
        )

    def train_dataloader(self):
        return self._loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        return self._loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        return self._loader(self.test_dataset, shuffle=False)

    def predict_dataloader(self):
        """Lightning's hook name for prediction data (used by ``Trainer.predict``)."""
        return self._loader(self.pred_dataset, shuffle=False)

    def pred_dataloader(self):
        """Backward-compatible alias of :meth:`predict_dataloader`."""
        return self.predict_dataloader()

In [None]:
# The tokenizer and the classifier are both loaded from the same pretrained checkpoint.
checkpoint_name = "distilbert-base-uncased"
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained(checkpoint_name)
model = transformers.DistilBertForSequenceClassification.from_pretrained(
    checkpoint_name
)

In [None]:
# Build the data module over the full articles frame with the DistilBERT tokenizer.
# prepare_data() is called by hand so the per-split datasets exist right away.
nela_gt_2018_data_module = NelaDataModule(nela_gt_2018_articles_df, tokenizer)
nela_gt_2018_data_module.prepare_data()

In [None]:
class NelaModel(pl.LightningModule):
    """Lightning wrapper that fine-tunes a sequence-classification model on NELA data.

    Args:
        model: Classification model called as ``model(**batch)``. Its output must
            expose ``loss`` and ``logits``, and the model must have ``pre_classifier``
            and ``classifier`` submodules (used by :meth:`backbone_grad`).
        learning_rate: Learning rate handed to AdamW. It can still be overridden
            later by assigning ``instance.learning_rate``.
    """

    def __init__(self, model, learning_rate=1e-3):
        super().__init__()
        self.model = model
        # Defined here so configure_optimizers never hits a missing attribute.
        self.learning_rate = learning_rate

    def backbone_grad(self, value):
        """Enable or disable gradients for the backbone; the head always stays trainable.

        Args:
            value: ``requires_grad`` flag applied to every parameter outside the
                ``pre_classifier`` and ``classifier`` layers.
        """
        for param in self.model.parameters():
            param.requires_grad = value
        # The classification head is trained in both the frozen and unfrozen phase.
        for head in (self.model.pre_classifier, self.model.classifier):
            for param in head.parameters():
                param.requires_grad = True

    def forward(self, **model_inputs):
        """Delegate to the wrapped model."""
        return self.model(**model_inputs)

    def _shared_step(self, batch, batch_idx):
        """Run the wrapped model on a batch that already contains ``labels``."""
        return self.model(**batch)

    def training_step(self, batch, batch_idx):
        """Log and return the training loss for one batch."""
        results = self._shared_step(batch, batch_idx)
        self.log("train_loss", results.loss.detach())
        # Return only the loss tensor instead of the whole model output, so Lightning
        # does not have to carry the logits along.
        return results.loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss and hand logits/labels to ``validation_epoch_end``."""
        results = self._shared_step(batch, batch_idx)
        loss = results.loss.detach()
        self.log("val_loss", loss)
        return {
            "loss": loss,
            "logits": results.logits.detach(),
            "labels": batch["labels"].detach(),
        }

    def validation_epoch_end(self, outputs):
        """Compute ``val_auc`` once over the whole validation pass.

        AUROC computed per batch is meaningless whenever a batch contains a single
        class, so the score is computed on the concatenated epoch outputs instead.
        This hook is available in the pytorch_lightning 1.x series this notebook installs.
        """
        if not outputs:
            return
        logits = torch.cat([out["logits"] for out in outputs])
        labels = torch.cat([out["labels"].view(-1) for out in outputs])
        # Class-1 probability of a multi-logit classifier is the softmax over classes.
        fake_probability = torch.softmax(logits, dim=-1)[:, 1]
        self.log("val_auc", auroc(fake_probability, labels, num_classes=2))

    def configure_optimizers(self):
        """AdamW over trainable parameters with ReduceLROnPlateau driven by ``val_loss``."""
        optimizer = torch.optim.AdamW(
            # Only parameters left trainable by backbone_grad are optimized.
            params=[p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate,
        )

        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=3
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": scheduler,
            "monitor": "val_loss",
        }


# Wrap the pretrained DistilBERT classifier loaded above for Lightning training.
nela_model = NelaModel(model)

In [None]:
# Phase 1: freeze the backbone and train only the classification head at a
# relatively high learning rate.
nela_model.backbone_grad(False)
nela_model.learning_rate = 1e-3
trainer = pl.Trainer(
    min_epochs=3,
    gpus=1,
    callbacks=[
        # With a single validation dataloader the metric is logged as plain "val_loss";
        # the "/dataloader_idx_N" suffix only exists when several validation loaders
        # are used, so monitoring it would abort with a missing-metric error.
        pl.callbacks.EarlyStopping(
            patience=5, min_delta=0.0001, monitor="val_loss"
        ),
        pl.callbacks.LearningRateMonitor(logging_interval="step", log_momentum=False),
        # NOTE(review): checkpoints are written under BASE_DATA_DIR / "best" while
        # default_root_dir is BASE_MODEL_DIR — confirm this location is intended.
        pl.callbacks.model_checkpoint.ModelCheckpoint(
            dirpath=BASE_DATA_DIR / "best",
            filename='{epoch}-{val_loss:.2f}-{val_auc:.2f}',
            monitor="val_loss",
        ),
    ],
    default_root_dir=BASE_MODEL_DIR,
)
trainer.fit(nela_model, nela_gt_2018_data_module)

In [None]:
# Phase 2: unfreeze the whole network and fine-tune it at a much lower learning rate.
nela_model.backbone_grad(True)
nela_model.learning_rate = 1e-5
trainer = pl.Trainer(
    min_epochs=3,
    gpus=1,
    callbacks=[
        # With a single validation dataloader the metric is logged as plain "val_loss";
        # the "/dataloader_idx_N" suffix only exists when several validation loaders
        # are used, so monitoring it would abort with a missing-metric error.
        pl.callbacks.EarlyStopping(
            patience=5, min_delta=0.0001, monitor="val_loss"
        ),
        pl.callbacks.LearningRateMonitor(logging_interval="step", log_momentum=False),
        # NOTE(review): checkpoints are written under BASE_DATA_DIR / "best" while
        # default_root_dir is BASE_MODEL_DIR — confirm this location is intended.
        pl.callbacks.model_checkpoint.ModelCheckpoint(
            dirpath=BASE_DATA_DIR / "best",
            filename='{epoch}-{val_loss:.2f}-{val_auc:.2f}',
            monitor="val_loss",
        ),
    ],
    default_root_dir=BASE_MODEL_DIR,
)
trainer.fit(nela_model, nela_gt_2018_data_module)