<a href="https://colab.research.google.com/github/nicolaiberk/bild/blob/main/code/content_analysis/bert/mig/_mig_clsfr/mig_classifier_BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Model to classify German news articles

This is part of a project on attention to migration in German news and its impact on issue attitudes and definitions, as well as on political violence. The classifier indicates whether an article's content is about migration.

In [1]:
# Install the Hugging Face transformers package into the runtime.
# NOTE(review): the version is unpinned, so a fresh run may install a newer
# release than the one that produced the outputs saved in this notebook.
!pip install transformers



In [2]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader

import os
os.environ["WANDB_DISABLED"] = "true"

## Prepare data

In [3]:
# Load the classified migration data from the shared CSV file
handcoding_url = 'https://www.dropbox.com/s/t47m1bppr3f1na5/handcoding_finished.csv?dl=1'

print('Loading data...')
dta = pd.read_csv(handcoding_url, sep=',')
print('\tDone!')

Loading data...
	Done!


In [4]:
# Binary outcome: True where the `mig` column equals 'Ja', False otherwise
dta['mig_bin'] = dta['mig'].eq('Ja')

In [5]:
# Shuffle all rows with a fixed seed, then renumber the index from zero
dta = (
    dta
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [6]:
# Determine the train/test/val split: rows are assigned to the three sets
# in contiguous runs of the sizes given here.
n_train, n_test, n_val = 1400, 200, 200
splits = ['train'] * n_train + ['test'] * n_test + ['val'] * n_val

# Fail with an explicit message (instead of pandas' generic length-mismatch
# error) when the data does not have exactly the expected number of rows.
if len(dta) != len(splits):
    raise ValueError(
        f'Split sizes sum to {len(splits)} rows but the data has {len(dta)} rows'
    )
dta['split'] = splits

# Split dataset into pre-specified training, validation, and test sets.
# One .loc call selects rows and columns together (no chained indexing).
train = dta.loc[dta['split'] == 'train', ['text', 'mig_bin']].reset_index(drop=True)
test = dta.loc[dta['split'] == 'test', ['text', 'mig_bin']].reset_index(drop=True)
val = dta.loc[dta['split'] == 'val', ['text', 'mig_bin']].reset_index(drop=True)

print(train.shape)
print(test.shape)
print(val.shape)


(1400, 2)
(200, 2)
(200, 2)


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [8]:
# Name of the pretrained checkpoint; reused later when the model is loaded
model_name = 'bert-base-german-cased'

# Matching tokenizer, with the maximum sequence length set to 512 tokens
BERT_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=512,
)

Access to the secret `HF_TOKEN` has not been granted on this notebook.
You will not be requested again.
Please restart the session if you want to be prompted again.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

In [9]:
def _encode_texts(frame):
    """Tokenize the ``text`` column of ``frame`` into PyTorch tensors.

    Texts are padded to the tokenizer's maximum length and truncated when
    longer, so every example gets the same sequence length.
    """
    return BERT_tokenizer(
        list(frame['text']),
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


train_encodings = _encode_texts(train)
test_encodings = _encode_texts(test)
val_encodings = _encode_texts(val)

In [10]:
# Convert each split's boolean outcome into a list of 0/1 integer labels
train_labels, test_labels, val_labels = (
    [int(bool(flag)) for flag in frame['mig_bin']]
    for frame in (train, test, val)
)

In [11]:
import torch

class MigDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Parameters
    ----------
    encodings : mapping
        Maps each field name to a container indexable by example position
        (a tensor or a nested list).
    labels : sequence
        One label per example, aligned with ``encodings``; its length defines
        the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _copy_as_tensor(value):
        """Return ``value`` as a new tensor holding a copy of the data."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every call;
            # clone().detach() is the copy the warning itself recommends.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding field plus ``'labels'``."""
        item = {
            key: self._copy_as_tensor(values[idx])
            for key, values in self.encodings.items()
        }
        item['labels'] = self._copy_as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

# Wrap each split's encodings and labels in a MigDataset
train_dataset, val_dataset, test_dataset = (
    MigDataset(encodings, labels)
    for encodings, labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

## Setup training, train

In [12]:
# Load the pretrained checkpoint named by `model_name` as a sequence
# classifier. The classifier weights are newly initialized (see the message
# printed by this cell), so the model must be fine-tuned before use.
BERT_model = AutoModelForSequenceClassification.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Use the first CUDA device when one is available, otherwise the CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
BERT_model = BERT_model.to(device)

# Move every split's encodings to the chosen device
train_encodings, val_encodings, test_encodings = (
    encodings.to(device)
    for encodings in (train_encodings, val_encodings, test_encodings)
)

# Turn each split's label list into a tensor on the same device
# NOTE(review): the datasets were built before this cell from the list-typed
# labels; presumably they are unaffected by this rebinding -- confirm.
train_labels, val_labels, test_labels = (
    torch.tensor(labels).to(device)
    for labels in (train_labels, val_labels, test_labels)
)

In [14]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute binary classification metrics for one evaluation pass.

    Parameters
    ----------
    pred : object
        Must expose ``label_ids`` (the true labels) and ``predictions``
        (per-class scores; the argmax over the last axis is taken as the
        predicted class).

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned by scikit-learn is not used here.
    prec, rec, f_score, _unused = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )

    metrics = dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=f_score,
        precision=prec,
        recall=rec,
    )
    return metrics

In [15]:
from transformers import Trainer, TrainingArguments

# Hyperparameters and bookkeeping options for fine-tuning.
# NOTE(review): the per-epoch log printed by the training cell shows the
# validation loss rising after the second epoch; consider fewer epochs or
# keeping the best checkpoint -- left unchanged here.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # log the training loss every 100 steps
    # NOTE(review): pinning is disabled presumably because the encodings were
    # moved to the device before training -- confirm.
    dataloader_pin_memory=False,
    # Evaluate on eval_dataset after every epoch.
    # NOTE(review): recent transformers releases spell this `eval_strategy`;
    # switch if the installed version rejects `evaluation_strategy`.
    evaluation_strategy="epoch",
    # Disable logging integrations explicitly instead of relying on the
    # deprecated WANDB_DISABLED environment variable.
    report_to="none"
)

# Trainer bundles the model, the arguments, both datasets and the metric function.
trainer = Trainer(
    model=BERT_model,                    # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # metrics computed at each evaluation
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
# Run fine-tuning with the trainer configured above; the cell output shows
# the per-epoch validation metrics and the final training summary.
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.31322,0.895,0.864516,0.797619,0.943662
2,0.546000,0.266751,0.915,0.875912,0.909091,0.84507
3,0.182000,0.279793,0.925,0.895105,0.888889,0.901408
4,0.120700,0.33901,0.93,0.90411,0.88,0.929577
5,0.096300,0.488288,0.9,0.846154,0.932203,0.774648
6,0.055600,0.510433,0.905,0.857143,0.919355,0.802817
7,0.064800,0.615122,0.92,0.882353,0.923077,0.84507
8,0.016700,0.60189,0.915,0.879433,0.885714,0.873239
9,0.016700,0.637712,0.905,0.865248,0.871429,0.859155
10,0.003700,0.645762,0.905,0.865248,0.871429,0.859155


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=880, training_loss=0.12339781340084631, metrics={'train_runtime': 1348.1636, 'train_samples_per_second': 10.384, 'train_steps_per_second': 0.653, 'total_flos': 3683554775040000.0, 'train_loss': 0.12339781340084631, 'epoch': 10.0})

In [17]:
# Save the fine-tuned model to the given directory.
# NOTE(review): the path looks like a mounted Google Drive folder, but no
# mount step is visible in this notebook -- confirm Drive is mounted first.
trainer.save_model("drive/MyDrive/Bild/mig_clsfr_BERT_torch")

## Assess performance

In [18]:
# Evaluate the fine-tuned model on the test split, which is used for neither
# training nor per-epoch validation above.
trainer.evaluate(test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.5278571844100952,
 'eval_accuracy': 0.93,
 'eval_f1': 0.9090909090909091,
 'eval_precision': 0.8974358974358975,
 'eval_recall': 0.9210526315789473,
 'eval_runtime': 5.589,
 'eval_samples_per_second': 35.785,
 'eval_steps_per_second': 0.716,
 'epoch': 10.0}