<a href="https://colab.research.google.com/github/optimopium/is-this-political/blob/main/Reports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Active Learning Reports

## Preliminaries

In [1]:
# Mount Google Drive at /content/drive (Colab-only API). The experiment
# checkpoints evaluated later in this notebook are read from a path under
# this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Code is adapted from the Hugging Face Transformers [`run_glue.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py) example and the AdapterHub [`run_glue.py`](https://github.com/adapter-hub/adapter-transformers/blob/cffdf3974ea19f49e1febe6e3f5b74be4e2d496a/examples/pytorch/text-classification/run_glue.py) example.

In [2]:
# Create the local `data` directory. `-p` makes the cell idempotent, so
# re-running it (or Restart & Run All on a warm runtime) does not fail with
# "File exists".
# NOTE(review): the dataset download below writes to ./dataset, not ./data;
# confirm this directory is still needed.
!mkdir -p data

In [3]:
# Install the notebook's third-party dependencies. `%pip` is used instead of
# `!pip` so the packages are installed into the environment of the running
# kernel rather than whatever `pip` happens to be first on the shell PATH.
# NOTE(review): versions are unpinned, so a fresh run may pull releases that
# differ from the ones used to produce the recorded outputs.
%pip install --quiet --upgrade gdown
%pip install --quiet -U transformers
%pip install --quiet datasets
%pip install --quiet scikit-learn
%pip install --quiet evaluate
%pip install --quiet sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m77.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m104.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import logging
import random
import sys
import os
from tqdm import tqdm

from scipy.stats import entropy
import torch
from datasets import load_dataset

# Seed Python's built-in `random` module so that any use of it is repeatable.
# NOTE(review): the seed is the string "42", not the integer 42. Both are
# accepted, but they produce different random streams; confirm the string form
# is intentional. NumPy and PyTorch generators are not seeded in this cell.
random.seed("42")

In [5]:
# Fetch the root logger (getLogger is called without a name).
logger = logging.getLogger()

# Set the root logger's threshold to INFO: INFO and above are emitted,
# DEBUG messages are suppressed.
logger.setLevel(logging.INFO)

In [19]:
# Per-task column names. Presumably (first text column, second text column)
# in the style of run_glue.py, with None meaning "no second sentence" -
# TODO confirm; the mapping is not read by any cell visible in this notebook.
task_to_keys = dict(politics=("sentence", None))

# Directory (relative to the working directory) that holds the CSV splits.
base_dir = "./dataset/"

In [7]:
import gdown

# Google Drive folder that contains the dataset CSV files.
url = "https://drive.google.com/drive/folders/1B4s1JgxRKWJB4IrRVjE5wWNjM-sNSQT3"
# Download the whole folder into the current directory, silently and without
# using stored cookies. Being the last expression of the cell, the returned
# list of downloaded file paths is displayed as the cell output.
gdown.download_folder(url, output="./", quiet=True, use_cookies=False)

['./dataset/test.csv', './dataset/train.csv', './dataset/validation.csv']

In [21]:
# Split name -> CSV path, built by appending "<split>.csv" to `base_dir`.
data_files = {
    split: base_dir + split + ".csv"
    for split in ("train", "validation", "test")
}

In [9]:
# ---------------------------------------------------------------------------
# Active learning variables
# ---------------------------------------------------------------------------
BUDGET = 750
INITIAL_DATASET_SIZE = 150
ACQUISITION_SIZE = 100
# Number of acquisition rounds: whatever is left of the budget after the
# initial dataset, divided into chunks of ACQUISITION_SIZE.
ITERATIONS = (BUDGET - INITIAL_DATASET_SIZE) // ACQUISITION_SIZE
print(
    f"Budget: {BUDGET}",
    f"Initial Dataset Size: {INITIAL_DATASET_SIZE}",
    f"Acquisition size: {ACQUISITION_SIZE}",
    f"Iterations: {ITERATIONS}",
    sep="\n",
)

# ---------------------------------------------------------------------------
# General variables
# ---------------------------------------------------------------------------
MAX_SEQ_LEN = 128
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 2e-5
LOGGING_STEPS = 50
EVAL_STEPS = 50
# Rule of thumb noted by the author: epoch * (budget / batch size).
MAX_STEPS = 500
CANDIDATE_TO_SAMPLE_RATIO = 5
BASE_MODEL = 'xlm-roberta-base'

print(MAX_STEPS)

Budget: 750
Initial Dataset Size: 150
Acquisition size: 100
Iterations: 6
500


## Model

In [10]:
import numpy as np
from torch.utils.data import SequentialSampler, DataLoader

import datasets
from datasets import concatenate_datasets, load_dataset, load_metric
from datasets import load_dataset

import transformers
from transformers.trainer_utils import get_last_checkpoint
from transformers import (
    AutoModelForSequenceClassification,
    AutoConfig,
    AutoTokenizer,
)

import evaluate

In [11]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Report

### Load Model from Checkpoints

In [12]:
import plotly.graph_objects as go

def PlotRocAuc(y_test, y_pred, color, model_name):
    """Compute the ROC AUC and build a Plotly figure of the ROC curve.

    The figure contains two traces: a dashed black diagonal ("TPR = FPR")
    as the chance-level reference, and the model's ROC curve whose legend
    entry includes the AUC value.

    Args:
        y_test: Ground-truth binary labels (forwarded to scikit-learn as the
            true labels).
        y_pred: Predicted scores for the positive class (forwarded to
            scikit-learn as the scores).
        color: Marker colour used for the model's trace.
        model_name: Name shown in the legend, followed by "(AUC=...)".

    Returns:
        tuple: ``(auc_score, fig)`` - the ROC AUC and the Plotly figure.
    """
    # Imported here because the notebook only imports these names from
    # sklearn.metrics in a later cell; without the local import, calling this
    # function before that cell has run raises NameError.
    from sklearn.metrics import roc_auc_score, roc_curve

    # Thresholds are returned by roc_curve but not needed for the plot.
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_pred)

    fig = go.Figure()

    # Chance-level diagonal.
    fig.add_trace(
        go.Scatter(
            x=[0, 1],
            y=[0, 1],
            name="TPR = FPR",
            line=dict(color="black", dash="dash"),
        )
    )

    # The model's ROC curve.
    fig.add_trace(
        go.Scatter(
            x=fpr,
            y=tpr,
            name=f"{model_name}(AUC={auc_score})",
            marker=dict(color=color),
        )
    )

    fig.update_layout(
        title="ROC curve",
        xaxis_title="False Positive Rate",
        yaxis_title="True Positive Rate",
    )

    return auc_score, fig

In [13]:
from datasets import ClassLabel

# Label codec shared by the helpers below: class names are mapped to integer
# ids by their position in `names`, i.e. 'Nonpolitical' -> 0, 'Political' -> 1.
c2l = ClassLabel(num_classes=2, names=['Nonpolitical', 'Political'])

In [14]:
from torch.utils.data import TensorDataset

def transform_to_embedding_dataset(tokenizer, dataset, is_bert=True, max_length=256):
    """Tokenize a labelled text dataset into a ``TensorDataset``.

    Every item's ``"sentence1"`` text is tokenized with the tokenizer's own
    special tokens added, then padded or truncated to exactly ``max_length``
    tokens. Every item's ``"label"`` string is converted to its integer id
    through the module-level ``c2l`` ClassLabel.

    Args:
        tokenizer: A Hugging Face tokenizer (called directly on the list of
            sentences).
        dataset: Iterable of mapping-like items exposing ``"sentence1"`` and
            ``"label"`` keys.
        is_bert: When True, ``token_type_ids`` are included in the result; the
            tokenizer must then return them, otherwise a ``KeyError`` is
            raised.
        max_length: Fixed sequence length every example is padded/truncated
            to. Defaults to 256, the value previously hard-coded here.

    Returns:
        TensorDataset: ``(input_ids, attention_mask, token_type_ids, labels)``
        when ``is_bert`` is True, else ``(input_ids, attention_mask, labels)``.
    """
    sentences = []
    labels = []
    for item in dataset:
        sentences.append(item["sentence1"])
        labels.append(c2l.str2int(item["label"]))

    # One batched call replaces the per-sentence `encode_plus` + `torch.cat`.
    # `padding="max_length"` is the replacement for the deprecated
    # `pad_to_max_length=True`, and `truncation=True` makes explicit the
    # 'longest_first' truncation the library previously fell back to with a
    # warning, so the produced tensors are unchanged.
    encoded = tokenizer(
        sentences,
        add_special_tokens=True,       # Add the model's special tokens.
        max_length=max_length,         # Pad & truncate all sentences.
        padding="max_length",
        truncation=True,
        return_attention_mask=True,    # Mask out the padding positions.
        return_tensors="pt",           # Return PyTorch tensors.
    )

    tensors = [encoded["input_ids"], encoded["attention_mask"]]
    if is_bert:
        tensors.append(encoded["token_type_ids"])
    tensors.append(torch.tensor(labels))

    # Combine the inputs (and labels, always last) into a TensorDataset.
    return TensorDataset(*tensors)

In [39]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

def report_model_metrics(dataset, model_path, head_path=None):
    """Evaluate a saved classifier on ``dataset`` and print its metrics.

    Two checkpoint layouts are supported:

    * ``head_path`` not given: ``model_path`` is loaded with
      ``AutoModelForSequenceClassification.from_pretrained`` and its
      ``.logits`` are used.
    * ``head_path`` given: ``model_path`` and ``head_path`` are both loaded
      with ``torch.load`` as whole modules; the head is applied to the first
      token's vector of the encoder's first output.

    Accuracy, confusion matrix, precision, recall and F1 are printed with
    scikit-learn's defaults (label 1 is the positive class).

    Args:
        dataset: Iterable of items with ``"sentence1"`` and ``"label"`` keys,
            as expected by ``transform_to_embedding_dataset``.
        model_path: Checkpoint directory (no head) or pickled encoder file
            (with head).
        head_path: Optional path to a pickled classification head.

    Returns:
        None. The metrics are only printed.
    """
    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base', do_lower_case=True)

    use_head = bool(head_path)
    if use_head:
        # SECURITY: torch.load unpickles arbitrary Python objects - only load
        # checkpoints from a trusted source. `weights_only=False` is explicit
        # because these files hold whole pickled modules, which newer PyTorch
        # releases refuse to load by default. `map_location` plus `.to(device)`
        # keeps the modules on the same device as the inputs, whatever device
        # they were saved from.
        model = torch.load(model_path, map_location=device, weights_only=False).to(device)
        head = torch.load(head_path, map_location=device, weights_only=False).to(device)
        head.eval()
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
        head = None
    model.eval()

    test_dataset = transform_to_embedding_dataset(tokenizer, dataset, is_bert=False)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=8)

    # Per-batch results are collected in lists and concatenated once at the
    # end instead of growing a tensor with torch.cat on every iteration.
    batch_predictions = []
    batch_truths = []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            # With is_bert=False each batch is (input_ids, attention_mask, labels).
            input_ids, attention_mask, labels = batch
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            if use_head:
                # First-token vector of the encoder's first output -> head.
                logits = head(outputs[0][:, 0, :])
            else:
                logits = outputs.logits

            batch_predictions.append(torch.argmax(logits, dim=-1).cpu())
            batch_truths.append(labels.cpu())

    predicted_labels = torch.cat(batch_predictions, dim=0).numpy()
    truth_labels = torch.cat(batch_truths, dim=0).numpy()

    print("Metrics Report:\n")
    print(f"Accuracy: {accuracy_score(truth_labels, predicted_labels)}")
    print(f"Confusion Matrix:\n{confusion_matrix(truth_labels, predicted_labels)}")
    print(f"Precision: {precision_score(truth_labels, predicted_labels)}")
    print(f"Recall: {recall_score(truth_labels, predicted_labels)}")
    print(f"F1-score: {f1_score(truth_labels, predicted_labels)}")

In [33]:
# Load the CSV files listed in `data_files` and keep only the "test" split,
# which every report below is evaluated on.
test_dataset = load_dataset("csv", data_files=data_files)["test"]



  0%|          | 0/3 [00:00<?, ?it/s]

### Full Dataset Report

In [23]:
# Root folder of the saved experiments; each report below reads a
# sub-directory of it.
# NOTE(review): this is a relative path, so it only resolves to the Drive
# mounted at /content/drive when the working directory is /content - confirm.
base_path = "./drive/MyDrive/Thesis/Data/experiments"

In [24]:
# Evaluate the checkpoint saved under full_dataset/ on the test split
# (presumably the model trained on the full training set - confirm).
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/full_dataset/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating: 100%|██████████| 188/188 [00:20<00:00,  9.06it/s]


Metrics Report:

Accuracy: 0.924
Confusion Matrix:
[[1045   46]
 [  68  341]]
Precision: 0.8811369509043928
Recall: 0.8337408312958435
F1-score: 0.8567839195979899


### Random Sampling Report

In [25]:
# Evaluate the checkpoint saved under random_sampling/ on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/random_sampling/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating: 100%|██████████| 188/188 [00:18<00:00, 10.20it/s]


Metrics Report:

Accuracy: 0.91
Confusion Matrix:
[[997  94]
 [ 41 368]]
Precision: 0.7965367965367965
Recall: 0.8997555012224939
F1-score: 0.8450057405281287


### Breaking Ties Report

In [26]:
# Evaluate the checkpoint saved under breaking_ties/ on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/breaking_ties/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating: 100%|██████████| 188/188 [00:18<00:00, 10.14it/s]


Metrics Report:

Accuracy: 0.8926666666666667
Confusion Matrix:
[[985 106]
 [ 55 354]]
Precision: 0.7695652173913043
Recall: 0.8655256723716381
F1-score: 0.8147295742232452


### Max Entropy Report

In [27]:
# Evaluate the checkpoint saved under max_entropy/ on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/max_entropy/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating: 100%|██████████| 188/188 [00:18<00:00, 10.04it/s]


Metrics Report:

Accuracy: 0.9066666666666666
Confusion Matrix:
[[1000   91]
 [  49  360]]
Precision: 0.7982261640798226
Recall: 0.8801955990220048
F1-score: 0.8372093023255813


### Contrastive Report

In [28]:
# Evaluate the checkpoint saved under contrastive/ on the test split.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/contrastive/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating: 100%|██████████| 188/188 [00:18<00:00,  9.98it/s]


Metrics Report:

Accuracy: 0.9013333333333333
Confusion Matrix:
[[987 104]
 [ 44 365]]
Precision: 0.7782515991471215
Recall: 0.8924205378973105
F1-score: 0.8314350797266514


### Least Confidence Report

In [29]:
# Evaluate the checkpoint saved under least_confidence/ on the test split.
# NOTE(review): the recorded output is identical to the max_entropy/ report.
# For a two-class problem both uncertainty criteria rank samples the same way,
# so this may be expected - confirm the two checkpoints are not duplicates.
report_model_metrics(dataset=test_dataset, model_path=f"{base_path}/least_confidence/")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating: 100%|██████████| 188/188 [00:18<00:00,  9.95it/s]


Metrics Report:

Accuracy: 0.9066666666666666
Confusion Matrix:
[[1000   91]
 [  49  360]]
Precision: 0.7982261640798226
Recall: 0.8801955990220048
F1-score: 0.8372093023255813


### Discrepancy Report

In [40]:
# Evaluate the discrepancy experiment on the test split. Unlike the other
# reports, this one passes `head_path`, so the encoder (model.pt) and the
# classification head (head.pt) are loaded from separate files.
report_model_metrics(dataset=test_dataset, 
                     model_path=f"{base_path}/discrepancy/model.pt", 
                     head_path=f"{base_path}/discrepancy/head.pt")

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Evaluating: 100%|██████████| 188/188 [00:18<00:00, 10.13it/s]


Metrics Report:

Accuracy: 0.888
Confusion Matrix:
[[1046   45]
 [ 123  286]]
Precision: 0.8640483383685801
Recall: 0.6992665036674817
F1-score: 0.772972972972973
