<a href="https://colab.research.google.com/github/optimopium/is-this-political/blob/main/Reports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Active Learning Reports

## Preliminaries

In [None]:
# Mount Google Drive; the model checkpoints evaluated later in this notebook
# are read from a path under ./drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Code is adapted from the Hugging Face Transformers [`run_glue.py` example](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py) and the adapter-transformers [`run_glue.py` example](https://github.com/adapter-hub/adapter-transformers/blob/cffdf3974ea19f49e1febe6e3f5b74be4e2d496a/examples/pytorch/text-classification/run_glue.py).

In [None]:
! mkdir data

In [None]:
!pip install --quiet --upgrade gdown
!pip install --quiet -U transformers
!pip install --quiet datasets
!pip install --quiet scikit-learn
!pip install --quiet evaluate
!pip install --quiet sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m86.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m954.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [None]:
import logging
import random
import sys
import os
from tqdm import tqdm

from scipy.stats import entropy
import torch
from datasets import load_dataset

# Seed Python's `random` module for reproducibility.
# NOTE(review): the seed is the string "42", not the integer 42. Both are accepted,
# but they produce different random streams -- confirm which one is intended.
# Only `random` is seeded here; torch and numpy are not.
random.seed("42")

In [None]:
# Fetch the root logger (getLogger() without a name returns the root).
logger = logging.getLogger()

# Set the logger's threshold to INFO: INFO and more severe records are handled,
# DEBUG records are dropped.
logger.setLevel(logging.INFO)

In [None]:
# Maps a task name to its (first text column, second text column) pair;
# the "politics" task has a single text column, hence the None.
# NOTE(review): the column is named "sentence" here, while
# transform_to_embedding_dataset reads item["sentence1"] -- confirm which
# name the CSV files actually use.
task_to_keys = {
    "politics": ("sentence", None),
}

# Directory holding the annotated CSV splits (train / validation / test).
base_dir = './annotated/'

In [None]:
import gdown

# Download the shared Google Drive folder into the current working directory.
# Per the saved cell output, the files land under ./annotated/.
url = "https://drive.google.com/drive/folders/1_0qVo_iLOtjVcnybhBCOXMpguxCeUD1t"
gdown.download_folder(url, output="./", quiet=True, use_cookies=False)

['./annotated/annotator1.csv',
 './annotated/annotator2.csv',
 './annotated/dataset.csv',
 './annotated/full_dataset.csv',
 './annotated/test.csv',
 './annotated/train.csv',
 './annotated/validation.csv']

In [None]:
# Split name -> CSV path; every split lives in `base_dir` as "<split>.csv".
data_files = {
    split: base_dir + split + ".csv"
    for split in ("train", "validation", "test")
}

In [None]:
# ---- Active-learning schedule ----
BUDGET = 750
INITIAL_DATASET_SIZE = 150
ACQUISITION_SIZE = 100
# Number of whole acquisition rounds that fit between the initial set and the budget.
ITERATIONS = (BUDGET - INITIAL_DATASET_SIZE) // ACQUISITION_SIZE

# ---- General training / evaluation settings ----
BASE_MODEL = 'xlm-roberta-base'
MAX_SEQ_LEN = 128
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 2e-5
LOGGING_STEPS = 50
EVAL_STEPS = 50
# Original author's note on this value: epoch * (budget / batch size)
MAX_STEPS = 500
CANDIDATE_TO_SAMPLE_RATIO = 5

# ---- Echo the settings so they are recorded in the cell output ----
print(f"Budget: {BUDGET}")
print(f"Initial Dataset Size: {INITIAL_DATASET_SIZE}")
print(f"Acquisition size: {ACQUISITION_SIZE}")
print(f"Iterations: {ITERATIONS}")
print(MAX_STEPS)

Budget: 750
Initial Dataset Size: 150
Acquisition size: 100
Iterations: 6
500


## Model

In [None]:
import numpy as np
from torch.utils.data import SequentialSampler, DataLoader

import datasets
from datasets import concatenate_datasets, load_dataset, load_metric
from datasets import load_dataset

import transformers
from transformers.trainer_utils import get_last_checkpoint
from transformers import (
    AutoModelForSequenceClassification,
    AutoConfig,
    AutoTokenizer,
)

import evaluate

In [None]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Report

### Load Model from Checkpoints

In [None]:
import plotly.graph_objects as go

def PlotRocAuc(y_test, y_pred, color, model_name):
    """Build a ROC plot for one model and compute its AUC.

    Args:
        y_test: Ground-truth labels, passed to ``roc_curve`` / ``roc_auc_score``.
        y_pred: Predictions or scores for the same examples.
        color: Marker colour of the model's curve.
        model_name: Name shown in the legend, followed by the AUC value.

    Returns:
        A ``(auc_score, fig)`` tuple: the AUC and a plotly figure holding the
        diagonal "TPR = FPR" reference line plus the model's ROC curve.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred)

    # Dashed diagonal: the curve along which TPR equals FPR.
    diagonal = go.Scatter(
        x=[0, 1],
        y=[0, 1],
        name="TPR = FPR",
        line={"color": "black", "dash": "dash"},
    )
    model_curve = go.Scatter(
        x=false_positive_rate,
        y=true_positive_rate,
        name=f"{model_name}(AUC={auc_score})",
        marker={"color": color},
    )

    fig = go.Figure()
    for trace in (diagonal, model_curve):
        fig.add_trace(trace)

    fig.update_layout(
        title="ROC curve",
        xaxis_title="False Positive Rate",
        yaxis_title="True Positive Rate",
    )

    return auc_score, fig

In [None]:
from datasets import ClassLabel

# Label-name <-> integer-id converter for the two classes. The position in
# `names` is the id: 'Nonpolitical' -> 0, 'Political' -> 1.
# transform_to_embedding_dataset uses it (via str2int) to encode the "label" column.
c2l = ClassLabel(num_classes=2, names=['Nonpolitical', 'Political'])

In [None]:
from torch.utils.data import TensorDataset

def transform_to_embedding_dataset(tokenizer, dataset, is_bert=True, max_length=256):
    """Tokenize every example of ``dataset`` and pack the result into a ``TensorDataset``.

    Args:
        tokenizer: Hugging Face tokenizer; it is called once per example.
        dataset: Iterable of mappings with a ``"sentence1"`` text field and a
            ``"label"`` field holding a class name known to the module-level ``c2l``.
        is_bert: When True, ``token_type_ids`` are collected as well and placed
            in the returned dataset (the tokenizer must return them).
        max_length: Length every sequence is padded or truncated to. Defaults to
            256, the value that used to be hard-coded.

    Returns:
        ``TensorDataset(input_ids, attention_masks, token_type_ids, labels)`` when
        ``is_bert`` is True, otherwise
        ``TensorDataset(input_ids, attention_masks, labels)``.

    Raises:
        ValueError: If ``dataset`` yields no examples.
    """
    input_ids = []
    attention_masks = []
    token_type_ids = []
    labels = []

    for item in dataset:
        # Calling the tokenizer will:
        #   (1) Tokenize the sentence.
        #   (2) Add the model's special start / end tokens.
        #   (3) Map tokens to their IDs.
        #   (4) Pad or truncate the sentence to `max_length`.
        #   (5) Create the attention mask that hides padding tokens.
        # `padding="max_length"` and `truncation=True` replace the deprecated
        # `pad_to_max_length=True` and the implicit 'longest_first' truncation,
        # so the tensors are unchanged but the warnings disappear.
        encoded_dict = tokenizer(
            item["sentence1"],             # Sentence to encode.
            add_special_tokens=True,       # Add the special start / end tokens.
            max_length=max_length,         # Pad & truncate all sentences.
            padding="max_length",
            truncation=True,
            return_attention_mask=True,    # Construct attn. masks.
            return_tensors="pt",           # Return pytorch tensors.
        )

        input_ids.append(encoded_dict["input_ids"])
        attention_masks.append(encoded_dict["attention_mask"])
        if is_bert:
            token_type_ids.append(encoded_dict["token_type_ids"])

        # Convert the class name to its integer id.
        labels.append(c2l.str2int(item["label"]))

    if not input_ids:
        # torch.cat would fail on an empty list with a less helpful message.
        raise ValueError("Cannot build a TensorDataset from an empty dataset.")

    # Each encoding has a leading batch dimension of 1; stack them along it.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Combine the inputs into a TensorDataset.
    if is_bert:
        token_type_ids = torch.cat(token_type_ids, dim=0)
        return TensorDataset(input_ids, attention_masks, token_type_ids, labels)

    return TensorDataset(input_ids, attention_masks, labels)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score, plot_roc_curve

def report_model_metrics(dataset, model_path, batch_size=8):
    """Evaluate a saved sequence-classification checkpoint and print its metrics.

    Loads the tokenizer and model from ``model_path``, runs the model over
    ``dataset`` in order, then prints accuracy, the confusion matrix, precision,
    recall, F1 and the ROC AUC, and shows the ROC curve.

    Accuracy, confusion matrix, precision, recall and F1 use the argmax class.
    The ROC curve and AUC use the softmax probability of class 1 ('Political'
    in ``c2l``): scoring them on hard 0/1 predictions collapses the curve to a
    single operating point.

    Args:
        dataset: Examples accepted by ``transform_to_embedding_dataset``.
        model_path: Directory (or hub id) of the fine-tuned checkpoint.
        batch_size: Evaluation batch size. Defaults to 8, the value that used to
            be hard-coded.

    Returns:
        None. Results are printed and the figure is displayed.
    """
    # NOTE(review): `do_lower_case=True` is kept from the original call; confirm
    # the checkpoint's tokenizer actually honours it.
    tokenizer = AutoTokenizer.from_pretrained(model_path, do_lower_case=True)
    model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
    test_dataset = transform_to_embedding_dataset(tokenizer, dataset, is_bert=False)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

    # Per-batch results are gathered in lists and concatenated once at the end,
    # which keeps integer dtypes intact.
    predicted_batches = []
    positive_score_batches = []
    truth_batches = []

    model.eval()

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            # With is_bert=False the batch is (input_ids, attention_mask, labels).
            input_ids, attention_mask, labels = (t.to(device) for t in batch)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            predicted_batches.append(torch.argmax(logits, dim=-1).cpu())
            positive_score_batches.append(torch.softmax(logits, dim=-1)[:, 1].cpu())
            truth_batches.append(labels.cpu())

    predicted_labels = torch.cat(predicted_batches, dim=0).numpy()
    positive_scores = torch.cat(positive_score_batches, dim=0).numpy()
    truth_labels = torch.cat(truth_batches, dim=0).numpy()

    print("Metrics Report:\n")
    print(f"Accuracy: {accuracy_score(truth_labels, predicted_labels)}")
    print(f"Confusion Matrix:\n{confusion_matrix(truth_labels, predicted_labels)}")
    print(f"Precision: {precision_score(truth_labels, predicted_labels)}")
    print(f"Recall: {recall_score(truth_labels, predicted_labels)}")
    print(f"F1-score: {f1_score(truth_labels, predicted_labels)}")
    # ROC / AUC need continuous scores, not thresholded labels.
    auc, fig = PlotRocAuc(truth_labels, positive_scores, "red", "base_clf")
    print(f"AUC Score: {auc}")
    fig.show()

In [None]:
# Load the CSV splits listed in `data_files` and keep only the test split,
# which every report below is evaluated on.
test_dataset = load_dataset("csv", data_files=data_files)["test"]



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-5f29d8a25f345cb8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-5f29d8a25f345cb8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

### Full Dataset Report

In [None]:
# Root folder (on the mounted Drive) with one sub-directory per experiment;
# each report below loads its checkpoint from "<base_path>/<experiment>/".
base_path = "./drive/MyDrive/Thesis/Data/experiments"

In [None]:
# Evaluate the checkpoint stored under "<base_path>/full_dataset/" on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/full_dataset/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).

Evaluating: 100%|██████████| 188/188 [00:20<00:00,  9.31it/s]

Metrics Report:

Accuracy: 0.924
Confusion Matrix:
[[1045   46]
 [  68  341]]
Precision: 0.8811369509043928
Recall: 0.8337408312958435
F1-score: 0.8567839195979899
AUC Score: 0.895788839112633





### Random Sampling Report

In [None]:
# Evaluate the checkpoint stored under "<base_path>/random_sampling/" on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/random_sampling/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).

Evaluating: 100%|██████████| 188/188 [00:20<00:00,  9.35it/s]

Metrics Report:

Accuracy: 0.908
Confusion Matrix:
[[1005   86]
 [  52  357]]
Precision: 0.8058690744920993
Recall: 0.8728606356968215
F1-score: 0.8380281690140845
AUC Score: 0.8970169356302623





### Breaking Ties Report

In [None]:
# Evaluate the checkpoint stored under "<base_path>/breaking_ties/" on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/breaking_ties/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).

Evaluating: 100%|██████████| 188/188 [00:20<00:00,  9.18it/s]

Metrics Report:

Accuracy: 0.892
Confusion Matrix:
[[994  97]
 [ 65 344]]
Precision: 0.780045351473923
Recall: 0.8410757946210269
F1-score: 0.8094117647058824
AUC Score: 0.8760832685295785





### Max Entropy Report

In [None]:
# Evaluate the checkpoint stored under "<base_path>/max_entropy/" on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/max_entropy/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).

Evaluating: 100%|██████████| 188/188 [00:20<00:00,  9.22it/s]

Metrics Report:

Accuracy: 0.9
Confusion Matrix:
[[1009   82]
 [  68  341]]
Precision: 0.806146572104019
Recall: 0.8337408312958435
F1-score: 0.8197115384615384
AUC Score: 0.8792902139980592





### Contrastive Report

In [None]:
# Evaluate the checkpoint stored under "<base_path>/contrastive/" on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/contrastive/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).

Evaluating: 100%|██████████| 188/188 [00:20<00:00,  9.23it/s]

Metrics Report:

Accuracy: 0.9
Confusion Matrix:
[[996  95]
 [ 55 354]]
Precision: 0.7884187082405345
Recall: 0.8655256723716381
F1-score: 0.8251748251748252
AUC Score: 0.8892247976890272





### Least Confidence Report

In [None]:
# Evaluate the checkpoint stored under "<base_path>/least_confidence/" on the test split.
# NOTE(review): the saved output of this cell is identical to the Max Entropy
# report. With two classes, least-confidence and max-entropy rank samples the
# same way, so this may be expected -- confirm the checkpoint is not a copy.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/least_confidence/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.

The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).

Evaluating: 100%|██████████| 188/188 [00:20<00:00,  9.23it/s]

Metrics Report:

Accuracy: 0.9
Confusion Matrix:
[[1009   82]
 [  68  341]]
Precision: 0.806146572104019
Recall: 0.8337408312958435
F1-score: 0.8197115384615384
AUC Score: 0.8792902139980592



