<a href="https://colab.research.google.com/github/nicolaiberk/bild/blob/main/code/crime_classifier_BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Model to classify German news articles

This is part of a project on migration attention in German news and its impact on issue attitudes. The classifier indicates whether the content of an article is about crime.

In [1]:
!pip install transformers
!pip install bertviz



In [2]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader

## Prepare data

In [3]:
# Fetch the labelled training articles (CSV hosted on Dropbox).
DATA_URL = 'https://www.dropbox.com/s/6y71ulr3axxf5iq/training_crime.csv?dl=1'

print('Loading data...')
dta = pd.read_csv(DATA_URL, sep=',')
print('\tDone!')

Loading data...
	Done!


In [4]:
# Binary outcome: True where the `crime` column holds the label 'Ja', False otherwise.
dta['crime_bin'] = dta['crime'].eq('Ja')

In [5]:
# Randomly reorder all rows (fixed seed for reproducibility), then renumber the index.
shuffled = dta.sample(frac=1, random_state=42)
dta = shuffled.reset_index(drop=True)

In [6]:
# Determine the train-test-val split.
# The hold-out sizes are fixed; the training set takes every remaining row, so the
# column assignment below no longer fails when the CSV does not contain exactly
# 1800 rows (1400 + 200 + 200).
N_TEST = 200
N_VAL = 200
n_train = len(dta) - N_TEST - N_VAL
assert n_train > 0, 'not enough rows for the requested test/validation sizes'

# Rows are labelled by position: first the training rows, then test, then validation.
splits = ['train'] * n_train
splits.extend(['test'] * N_TEST)
splits.extend(['val'] * N_VAL)
dta['split'] = splits

# Split dataset into pre-specified training, validation, and test sets.
# `.loc` selects rows and columns in one step instead of chained indexing.
model_columns = ['text', 'crime_bin']
train = dta.loc[dta['split'] == 'train', model_columns].reset_index(drop=True)
test = dta.loc[dta['split'] == 'test', model_columns].reset_index(drop=True)
val = dta.loc[dta['split'] == 'val', model_columns].reset_index(drop=True)

print(train.shape)
print(test.shape)
print(val.shape)


(1400, 2)
(200, 2)
(200, 2)


In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [8]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
model_name = 'distilbert-base-german-cased'

# Tokenizer capped at 512 tokens per input.
BERT_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=512,
)

In [9]:
def _encode_texts(frame):
    """Tokenize the ``text`` column of ``frame`` with ``BERT_tokenizer``.

    Inputs are padded to the tokenizer's maximum length, truncated where
    necessary, and returned as PyTorch tensors.
    """
    return BERT_tokenizer(
        frame['text'].tolist(),
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )


train_encodings = _encode_texts(train)
test_encodings = _encode_texts(test)
val_encodings = _encode_texts(val)

In [10]:
# Convert the boolean outcome into plain 0/1 integer labels, one list per split.
train_labels = train['crime_bin'].astype(int).tolist()
test_labels = test['crime_bin'].astype(int).tolist()
val_labels = val['crime_bin'].astype(int).tolist()

In [11]:
import torch

class MigDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: mapping from field name to an indexable collection with one
            entry per example (tensors or nested lists).
        labels: indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value):
        """Return ``value`` as a new tensor that shares no storage with the input."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a UserWarning on every call;
            # detach().clone() is the equivalent copy that PyTorch recommends.
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding field plus ``'labels'`` for example ``idx``."""
        item = {key: self._as_tensor(values[idx]) for key, values in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

# Wrap each split's encodings and labels in a MigDataset.
train_dataset, val_dataset, test_dataset = (
    MigDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

## Set up and run training

In [12]:
# Load the pretrained checkpoint for sequence classification; per the loading
# message, the classification-head weights are newly initialised.
BERT_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
)

Some weights of the model checkpoint at distilbert-base-german-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', '

In [13]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); presumably the prediction object the Trainer
            passes in.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    true_labels = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    predicted = pred.predictions.argmax(-1)

    scores = precision_recall_fscore_support(true_labels, predicted, average='binary')
    precision, recall, f1 = scores[0], scores[1], scores[2]

    metrics = {'accuracy': accuracy_score(true_labels, predicted)}
    metrics['f1'] = f1
    metrics['precision'] = precision
    metrics['recall'] = recall
    return metrics

In [14]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration.
# NOTE(review): warmup_steps=500 is more than half of the 880 total optimisation
# steps reported by the training log - confirm this is intended.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=100,
    # evaluate on the validation set after every epoch
    evaluation_strategy="epoch",
    # training schedule
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # optimisation
    warmup_steps=500,
    weight_decay=0.01,
)

# Trainer ties together the model, the arguments above, both datasets and the metric function.
trainer = Trainer(
    model=BERT_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [15]:
# Fine-tune the model; the validation set is evaluated after every epoch
# (evaluation_strategy="epoch") and the returned TrainOutput is displayed.
trainer.train()

***** Running training *****
  Num examples = 1400
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 880
  if __name__ == '__main__':


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.349445,0.895,0.764045,0.790698,0.73913
2,0.538100,0.234619,0.9,0.777778,0.795455,0.76087
3,0.214000,0.234222,0.91,0.804348,0.804348,0.804348
4,0.196600,0.257623,0.92,0.829787,0.8125,0.847826
5,0.154900,0.398118,0.91,0.808511,0.791667,0.826087
6,0.107000,0.457512,0.91,0.790698,0.85,0.73913
7,0.069000,0.486459,0.9,0.782609,0.782609,0.782609
8,0.028700,0.506075,0.915,0.813187,0.822222,0.804348
9,0.028700,0.525736,0.925,0.83871,0.829787,0.847826
10,0.002600,0.53457,0.925,0.83871,0.829787,0.847826


***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':
*

TrainOutput(global_step=880, training_loss=0.14899570101194762, metrics={'train_runtime': 717.5021, 'train_samples_per_second': 19.512, 'train_steps_per_second': 1.226, 'total_flos': 1854543581184000.0, 'train_loss': 0.14899570101194762, 'epoch': 10.0})

In [16]:
# Save the fine-tuned model.
# NOTE(review): the path looks like a mounted Google Drive folder, but no mount
# step is visible in this notebook - confirm Drive is mounted before running.
MODEL_OUT_DIR = "drive/MyDrive/Bild/crime_clsfr_BERT_torch"
trainer.save_model(MODEL_OUT_DIR)

Saving model checkpoint to drive/MyDrive/Bild/crime_clsfr_BERT_torch
Configuration saved in drive/MyDrive/Bild/crime_clsfr_BERT_torch/config.json
Model weights saved in drive/MyDrive/Bild/crime_clsfr_BERT_torch/pytorch_model.bin


## Assess performance

In [17]:
# Score the trained model on the test split.
trainer.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':


{'epoch': 10.0,
 'eval_accuracy': 0.925,
 'eval_f1': 0.8571428571428571,
 'eval_loss': 0.30922314524650574,
 'eval_precision': 0.8035714285714286,
 'eval_recall': 0.9183673469387755,
 'eval_runtime': 3.5077,
 'eval_samples_per_second': 57.017,
 'eval_steps_per_second': 1.14}

The classifier reaches an F1 score of 85.7% on the held-out test set — this is amazing!