<a href="https://colab.research.google.com/github/nicolaiberk/bild/blob/main/code/content_analysis/bert/crime/bert/crime_classifier_BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Model to classify German news articles

This is part of a project on migration attention in German news and their impact on issue attitudes. The classifier indicated whether the content is about crime.

In [1]:
!pip install transformers[torch]

Collecting accelerate>=0.20.2 (from transformers[torch])
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [2]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader

## Prepare data

In [3]:
# load classified migration data
print('Loading data...')
dta = pd.read_csv('https://www.dropbox.com/s/6y71ulr3axxf5iq/training_crime.csv?dl=1', sep = ',')
print('\tDone!')

Loading data...
	Done!


In [4]:
# define outcome
dta['crime_bin'] = dta.crime == 'Ja'

In [5]:
# shuffle data
dta = dta.sample(frac = 1, random_state=42).reset_index(drop = True)

In [6]:
#determine train-test-val split
splits = ['train']*1400
splits.extend(['test']*200)
splits.extend(['val']*200)
dta['split'] = splits

# Split dataset into pre-specified training, validation, and test sets
train = dta[['text', 'crime_bin']][dta['split']=='train'].reset_index(drop = True)
test = dta[['text', 'crime_bin']][dta['split']=='test'].reset_index(drop = True)
val = dta[['text', 'crime_bin']][dta['split']=='val'].reset_index(drop = True)

print(train.shape)
print(test.shape)
print(val.shape)


(1400, 2)
(200, 2)
(200, 2)


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [8]:
model_name = 'distilbert-base-german-cased'
BERT_tokenizer = AutoTokenizer.from_pretrained(model_name,
                                               model_max_length = 512)

In [9]:
train_encodings = BERT_tokenizer(list(train['text']), padding='max_length', truncation=True, return_tensors='pt')
test_encodings = BERT_tokenizer(list(test['text']), padding='max_length', truncation=True, return_tensors='pt')
val_encodings = BERT_tokenizer(list(val['text']), padding='max_length', truncation=True, return_tensors='pt')

In [10]:
train_labels = [1 if label else 0 for label in train['crime_bin']]
test_labels = [1 if label else 0 for label in test['crime_bin']]
val_labels = [1 if label else 0 for label in val['crime_bin']]

In [11]:
import torch

class MigDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MigDataset(train_encodings, train_labels)
val_dataset = MigDataset(val_encodings, val_labels)
test_dataset = MigDataset(test_encodings, test_labels)

## Setup training, train

In [12]:
BERT_model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-german-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_cla

In [13]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [14]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=BERT_model,                    # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.29546,0.905,0.791209,0.8,0.782609
2,0.475700,0.23654,0.92,0.833333,0.8,0.869565
3,0.226100,0.285861,0.905,0.786517,0.813953,0.76087
4,0.182500,0.290623,0.91,0.804348,0.804348,0.804348
5,0.128500,0.464056,0.915,0.773333,1.0,0.630435
6,0.063100,0.460065,0.925,0.805195,1.0,0.673913
7,0.054700,0.527663,0.91,0.775,0.911765,0.673913
8,0.010800,0.548353,0.91,0.780488,0.888889,0.695652
9,0.010800,0.565604,0.915,0.813187,0.822222,0.804348
10,0.000600,0.593124,0.91,0.808511,0.791667,0.826087


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=880, training_loss=0.12982596898112786, metrics={'train_runtime': 707.593, 'train_samples_per_second': 19.785, 'train_steps_per_second': 1.244, 'total_flos': 1854543581184000.0, 'train_loss': 0.12982596898112786, 'epoch': 10.0})

In [16]:
# save
trainer.save_model("drive/MyDrive/Bild/crime_clsfr_BERT_torch")

## Assess performance

In [17]:
trainer.evaluate(test_dataset)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.28473687171936035,
 'eval_accuracy': 0.94,
 'eval_f1': 0.8867924528301887,
 'eval_precision': 0.8245614035087719,
 'eval_recall': 0.9591836734693877,
 'eval_runtime': 3.4321,
 'eval_samples_per_second': 58.274,
 'eval_steps_per_second': 1.165,
 'epoch': 10.0}

88.7% F1, this is amazing!