<a href="https://colab.research.google.com/github/nicolaiberk/bild/blob/main/code/content_analysis/bert/crime/bert/crime_classifier_BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Model to classify German news articles

This is part of a project on migration attention in German news and their impact on issue attitudes. The classifier indicated whether the content is about crime.

In [18]:
!pip install transformers[torch]



In [19]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader

In [20]:
# set random seed for reproducibility
SEED_GLOBAL = 42
np.random.seed(SEED_GLOBAL)

## Prepare data

In [21]:
# load classified migration data
print('Loading data...')
dta = pd.read_csv('https://www.dropbox.com/s/6y71ulr3axxf5iq/training_crime.csv?dl=1', sep = ',')
print('\tDone!')

Loading data...
	Done!


In [22]:
# define outcome
dta['crime_bin'] = dta.crime == 'Ja'

In [23]:
# shuffle data
dta = dta.sample(frac = 1, random_state=42).reset_index(drop = True)

In [24]:
#determine train-test-val split
splits = ['train']*1400
splits.extend(['test']*200)
splits.extend(['val']*200)
dta['split'] = splits

# Split dataset into pre-specified training, validation, and test sets
train = dta[['text', 'crime_bin']][dta['split']=='train'].reset_index(drop = True)
test = dta[['text', 'crime_bin']][dta['split']=='test'].reset_index(drop = True)
val = dta[['text', 'crime_bin']][dta['split']=='val'].reset_index(drop = True)

print(train.shape)
print(test.shape)
print(val.shape)


(1400, 2)
(200, 2)
(200, 2)


In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [26]:
model_name = 'distilbert-base-german-cased'
BERT_tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               model_max_length = 512)

In [27]:
train_encodings = BERT_tokenizer(list(train['text']), padding='max_length', truncation=True, return_tensors='pt')
test_encodings = BERT_tokenizer(list(test['text']), padding='max_length', truncation=True, return_tensors='pt')
val_encodings = BERT_tokenizer(list(val['text']), padding='max_length', truncation=True, return_tensors='pt')

In [28]:
train_labels = [1 if label else 0 for label in train['crime_bin']]
test_labels = [1 if label else 0 for label in test['crime_bin']]
val_labels = [1 if label else 0 for label in val['crime_bin']]

In [29]:
import torch

class MigDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MigDataset(train_encodings, train_labels)
val_dataset = MigDataset(val_encodings, val_labels)
test_dataset = MigDataset(test_encodings, test_labels)

## Setup training, train

In [30]:
BERT_model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-german-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'c

In [31]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [32]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    evaluation_strategy="epoch",
    seed=SEED_GLOBAL
)

trainer = Trainer(
    model=BERT_model,                    # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics
)

In [33]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.315341,0.91,0.785714,0.868421,0.717391
2,0.546500,0.23664,0.92,0.836735,0.788462,0.891304
3,0.219200,0.278095,0.91,0.790698,0.85,0.73913
4,0.183800,0.283185,0.905,0.8,0.77551,0.826087
5,0.135800,0.404498,0.92,0.818182,0.857143,0.782609
6,0.088500,0.462971,0.9,0.787234,0.770833,0.804348
7,0.061400,0.428059,0.92,0.813953,0.875,0.76087
8,0.013500,0.522103,0.91,0.804348,0.804348,0.804348
9,0.013500,0.607576,0.905,0.791209,0.8,0.782609
10,0.004400,0.606261,0.905,0.791209,0.8,0.782609


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=880, training_loss=0.14279266792264853, metrics={'train_runtime': 661.5229, 'train_samples_per_second': 21.163, 'train_steps_per_second': 1.33, 'total_flos': 1854543581184000.0, 'train_loss': 0.14279266792264853, 'epoch': 10.0})

In [34]:
# save
trainer.save_model("drive/MyDrive/Bild/crime_clsfr_BERT_torch")

## Assess performance

In [35]:
trainer.evaluate(test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.4399771988391876,
 'eval_accuracy': 0.915,
 'eval_f1': 0.8316831683168318,
 'eval_precision': 0.8076923076923077,
 'eval_recall': 0.8571428571428571,
 'eval_runtime': 3.1551,
 'eval_samples_per_second': 63.39,
 'eval_steps_per_second': 1.268,
 'epoch': 10.0}

83.2% F1, this is amazing!