In [1]:
from functools import lru_cache

import numpy as np
import pandas as pd
import torch
from modAL.models import ActiveLearner
from sklearn.pipeline import Pipeline
from skorch import NeuralNetClassifier
from skorch.callbacks import LRScheduler
from skorch.callbacks import ProgressBar
from skorch.hf import HuggingfacePretrainedTokenizer
from sqlalchemy.orm import Query

from sqlalchemy.orm import Session
from sqlalchemy.orm import joinedload
from torch import nn
from torch.optim.lr_scheduler import LambdaLR
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import src
import src.db.models.bert_data as bm
from src.db.connect import make_engine

In [5]:
# Display options: show long sentences in full and cap printed frames at 50 rows
pd.set_option("display.max_colwidth", 2048)
pd.set_option("display.max_rows", 50)

# Choose a tokenizer and BERT model that work together
# uncased version would be: "dbmdz/bert-base-german-uncased"
TOKENIZER = "bert-base-german-cased"
PRETRAINED_MODEL = "bert-base-german-cased"

# model hyper-parameters
OPTIMIZER = torch.optim.AdamW
# Backward-compatible alias: other cells still refer to the original, misspelled name.
OPTMIZER = OPTIMIZER
LR = 5e-5
MAX_EPOCHS = 10
CRITERION = nn.CrossEntropyLoss
BATCH_SIZE = 2

# device: prefer the GPU when one is available
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# database access; the loader functions further down open their own sessions on `engine`
engine = make_engine("DB")
session = Session(engine)

print(f"Using {DEVICE=}")

Using DEVICE='cuda'


# Load & prepare dataset


In [6]:
@lru_cache(maxsize=1)
def load_labeled_data(engine):
    """Load all samples that were used in a batch and coded by at least one coder.

    Args:
        engine: SQLAlchemy engine used to open a short-lived session.

    Returns:
        pd.DataFrame with the columns ``id``, ``text`` and ``labels`` (the
        latter holds whatever ``Sample.labels()`` returns for the sample).

    Note:
        The result is memoized by ``lru_cache``, so repeated calls with the same
        engine hand out the *same* DataFrame object; copy it before mutating.
    """
    with Session(engine) as session:
        # Execute the query and read every attribute while the session is still
        # open. Running `.all()` after the `with` block would silently re-open
        # the closed session and leave its connection unmanaged.
        samples = (
            session.query(bm.Sample)
            .options(joinedload(bm.Sample.raw_labels))
            # `!= None` is intentional: SQLAlchemy renders it as IS NOT NULL
            .filter(bm.Sample.used_in_batch != None)  # noqa: E711
            .all()
        )

        rows = [
            (sample.id, sample.text, sample.labels())
            for sample in samples
            if sample.n_coders > 0
        ]

    return pd.DataFrame(rows, columns=["id", "text", "labels"])


@lru_cache(maxsize=1)
def load_unlabeled_data(engine):
    """Load id and text of all samples that have not been used in any batch yet.

    Args:
        engine: SQLAlchemy engine used to open a short-lived session.

    Returns:
        pd.DataFrame with the columns ``id`` and ``text``.

    Note:
        The result is memoized by ``lru_cache``, so repeated calls with the same
        engine hand out the *same* DataFrame object; copy it before mutating.
    """
    with Session(engine) as session:
        # Run the query and extract plain values inside the session scope, so
        # the connection is released when the `with` block exits.
        samples = (
            session.query(bm.Sample)
            # `== None` is intentional: SQLAlchemy renders it as IS NULL
            .filter(bm.Sample.used_in_batch == None)  # noqa: E711
            .all()
        )

        rows = [(sample.id, sample.text) for sample in samples]

    return pd.DataFrame(rows, columns=["id", "text"])


def define_labels(labels):
    """Collapse a sample's labels into a single integer class.

    Returns 1 if ``"antielite"`` is contained in ``labels``, otherwise 2 if
    ``"pplcentr"`` is contained, otherwise 0. ``"antielite"`` takes precedence
    when both are present.
    """
    # Ordered by precedence: the first matching key decides the class.
    class_by_key = (("antielite", 1), ("pplcentr", 2))
    for key, class_id in class_by_key:
        if key in labels:
            return class_id
    return 0

In [7]:
# Work on a copy: load_labeled_data is lru_cached, so adding a column to its
# return value would mutate the cached frame for every later caller.
df = load_labeled_data(engine).copy()

df["label"] = df.labels.apply(define_labels)

n_non_zero = len(df[df.label != 0])

# Undersample class 0 to the combined size of the other classes.
# A fixed random_state keeps the training set reproducible; the global RNG
# seeds are only set further down in the notebook, after this cell has run.
zero_sample = df[df.label == 0].sample(n_non_zero, random_state=0)

df = pd.concat([df[df.label != 0], zero_sample])

In [8]:
# Number of samples per class label in the training frame
df.groupby("label")["id"].count()

label
0    201
1    172
2     29
Name: id, dtype: int64

# Configure BERT model pipeline

- Most parts of this section are adapted from [this tutorial](https://nbviewer.org/github/skorch-dev/skorch/blob/master/notebooks/Hugging_Face_Finetuning.ipynb).


In [9]:
# Estimated total number of training batches: epochs x batches per epoch
# (integer division plus one extra batch for a possible remainder).
# NOTE(review): the training log reports validation metrics, so part of `df`
# is presumably held out and fewer batches are run per epoch than estimated
# here. If so, the learning rate never decays all the way down; confirm.
num_training_steps = MAX_EPOCHS * (len(df) // BATCH_SIZE + 1)


def lr_schedule(current_step):
    """Return the learning-rate multiplier for ``current_step``.

    The multiplier is the fraction of ``num_training_steps`` that is still
    remaining, i.e. it decreases linearly from 1 as ``current_step`` grows.
    An ``AssertionError`` is raised once the multiplier would no longer be
    strictly positive.
    """
    steps_left = float(num_training_steps - current_step)
    # max(1, ...) guards against a division by zero
    total_steps = float(max(1, num_training_steps))
    factor = steps_left / total_steps
    assert factor > 0
    return factor

In [10]:
class BertModule(nn.Module):
    """Wrap a pretrained sequence-classification model so that ``forward`` returns logits only.

    Args:
        name: Identifier of the pretrained checkpoint passed to ``from_pretrained``.
        num_labels: Number of output classes of the classification head.
    """

    def __init__(self, name, num_labels):
        super().__init__()
        self.name, self.num_labels = name, num_labels

        # Loads the pretrained checkpoint into ``self.bert``
        self.reset_weights()

    def reset_weights(self):
        """(Re-)load the pretrained checkpoint, replacing the current ``self.bert``."""
        pretrained = AutoModelForSequenceClassification.from_pretrained(
            self.name,
            num_labels=self.num_labels,
        )
        self.bert = pretrained

    def forward(self, **kwargs):
        """Forward all keyword arguments to the wrapped model and return its ``logits``."""
        return self.bert(**kwargs).logits

In [11]:
# Step 1: turn raw text into model inputs with the pretrained tokenizer
tokenizer_step = ("tokenizer", HuggingfacePretrainedTokenizer(TOKENIZER))

# Callbacks: learning-rate schedule stepped after every batch, plus a progress bar
training_callbacks = [
    LRScheduler(LambdaLR, lr_lambda=lr_schedule, step_every="batch"),
    ProgressBar(),
]

# Step 2: skorch classifier around the BERT module, configured from the constants above
bert_classifier = NeuralNetClassifier(
    BertModule,
    module__name=PRETRAINED_MODEL,
    module__num_labels=df.label.nunique(),
    optimizer=OPTMIZER,
    lr=LR,
    max_epochs=MAX_EPOCHS,
    criterion=CRITERION,
    batch_size=BATCH_SIZE,
    iterator_train__shuffle=True,
    device=DEVICE,
    callbacks=training_callbacks,
)

pipeline = Pipeline([tokenizer_step, ("net", bert_classifier)])

# Fit learner


In [12]:
# Seed the torch (CPU and CUDA) and numpy global RNGs with 0 before fitting
for seed_fn in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
    np.random.seed,
):
    seed_fn(0)

In [13]:
%%time
# Wrap the pipeline in an ActiveLearner and hand it the labeled data.
# The estimator is trained as part of this call (see the epoch log printed below).
learner = ActiveLearner(
    estimator=pipeline,
    X_training=df.text,
    y_training=df.label,
)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

  0%|          | 0/202 [00:00<?, ?it/s]

  epoch    train_loss    valid_acc    valid_loss      dur
-------  ------------  -----------  ------------  -------
      1        [36m0.8045[0m       [32m0.6790[0m        [35m0.7445[0m  16.7383


  0%|          | 0/202 [00:00<?, ?it/s]

      2        [36m0.3761[0m       [32m0.7160[0m        0.8394  14.8120


  0%|          | 0/202 [00:00<?, ?it/s]

      3        [36m0.1688[0m       0.7037        1.2257  14.2320


  0%|          | 0/202 [00:00<?, ?it/s]

      4        [36m0.0504[0m       0.7160        1.1249  14.4848


  0%|          | 0/202 [00:00<?, ?it/s]

      5        [36m0.0300[0m       0.7037        1.2307  14.6253


  0%|          | 0/202 [00:00<?, ?it/s]

      6        [36m0.0032[0m       [32m0.7407[0m        1.2105  14.7449


  0%|          | 0/202 [00:00<?, ?it/s]

      7        [36m0.0018[0m       0.7407        1.2673  14.3601


  0%|          | 0/202 [00:00<?, ?it/s]

      8        [36m0.0013[0m       0.7407        1.2981  14.4867


  0%|          | 0/202 [00:00<?, ?it/s]

      9        [36m0.0011[0m       0.7407        1.3303  13.8690


  0%|          | 0/202 [00:00<?, ?it/s]

     10        [36m0.0010[0m       0.7407        1.3457  14.7662
CPU times: user 2min 12s, sys: 11.1 s, total: 2min 23s
Wall time: 2min 32s


# Get new samples


In [14]:
# Using ALL unlabeled samples takes an insane amount of RAM in the next cell
# (well over 45GB) and takes forever, so only a random subset is drawn here.
# For the real runs, the sample size can be turned up a notch (like 100_000 or so).
X_pool = load_unlabeled_data(engine).sample(5_000)

In [15]:
%%time
# Ask the learner for 500 instances from the pool texts.
# `query_idx` is used in the next cell as row positions into X_pool.
query_idx, query_sample = learner.query(X_pool.text.tolist(), n_instances=500)

CPU times: user 41.8 s, sys: 165 ms, total: 42 s
Wall time: 42.3 s


In [16]:
# Positional lookup of the queried rows (all columns) in the sampled pool
X_sample = X_pool.iloc[query_idx]

## Load the Gründl `pop_dict_score` for the queried samples (just for fun)


In [17]:
# Fetch id and pop_dict_score for exactly the samples selected by the learner
sample_ids = X_sample.id.tolist()
query = (
    Query(bm.Sample)
    .filter(bm.Sample.id.in_(sample_ids))
    .with_entities(bm.Sample.id, bm.Sample.pop_dict_score)
)
pop_scores = pd.read_sql(query.statement, engine)

# Attach the scores to the queried texts via the shared `id` column
gruendl = X_sample.merge(pop_scores, on="id")

In [18]:
# Number of queried samples per pop_dict_score value
gruendl.groupby("pop_dict_score")["id"].count()

pop_dict_score
False    494
True       6
Name: id, dtype: int64

In [19]:
# Show 5 randomly drawn example rows for each pop_dict_score value
gruendl.groupby("pop_dict_score").sample(5)

Unnamed: 0,id,text,pop_dict_score
31,703145,"Schon die letzte Große Koalition - ihre Regierungszeit ist schon länger her - hatte sich vorgenommen, einen solchen Gesetzentwurf zu verabschieden.",False
352,422705,"Ich hoffe, dass all diese Vorarbeiten im Innenausschuss nicht Makulatur werden, dass wir sie nicht noch einmal leisten müssen.",False
159,904168,"Wir können aber, so wie es die Bundeskanzlerin und die Regierung regelmäßig tun, unsere Verbündeten und Freunde an deren Verantwortung erinnern und gleichzeitig darauf hinwirken, dieses Ziel in der Zukunft zu erreichen.",False
218,385193,"Aber die Wiedervereinigung war gut für die Menschen in Ost und West, in Gesamtdeutschland, in Europa, sie war gut für eine friedliche Entwicklung in der Welt.",False
299,20793,Die Bundesumweltministerin hat wahrscheinlich zu viel im „Dschungelbuch“ gelesen.,False
344,117846,"In dieser Aktuellen Stunde stehen bisher nicht die belegbaren Fakten im Mittelpunkt, sondern gefühlte Wahrheiten, Spekulationen, teils auch unverschämte Unterstellungen.",True
158,605148,"Sanktionen, Dialogverbote und primitive antirussische Propaganda sind keine Argumente, sondern zerstören die Grundlagen von Zusammenarbeit.",True
222,1115709,"Der sogenannte Bürgerdialog Stromnetz soll den Anwohnern also schmackhaft machen, dass sie vom Wertverlust ihres Grundstückes am Ende noch profitieren.",True
122,1226711,"Wenn ich die Veräußerungsverluste ohne Veräußerung anerkenne, ist das eine Einladung an Spekulanten, zu spekulieren, weil klar ist: Den Verlust tragen alle anderen. – Wir müssen also hochsensibel sein und gemeinsam überlegen, was zu tun ist.",True
398,503573,"Unsere Soldatinnen und Soldaten haben einen außerordentlichen Beitrag für das afghanische Volk geleistet und sich für die Stabilisierung dieses Landes eingesetzt, damit eben Unterdrückung aufhört, damit das Sterben aufhört.",True


# Export new batch

In [30]:
# Write the queried samples to <src.PATH>/tmp as the next batch to be labeled
tmpdir = src.PATH / "tmp"
tmpdir.mkdir(exist_ok=True)

batch_path = tmpdir / "active_learning_batch.parquet"
X_sample.to_parquet(batch_path)