<a href="https://colab.research.google.com/github/nicolaiberk/_rrpviol_med/blob/master/_sc/_mig_clsfr/mig_classifier_BERT_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERT Model to classify German news articles

This is part of a project on attention to migration in German news and its impact on issue attitudes and definitions, as well as on political violence. The classifier indicates whether an article's content is about migration.

In [None]:
!pip install transformers
!pip install bertviz

Collecting bertviz
  Downloading bertviz-1.2.0-py3-none-any.whl (156 kB)
[K     |████████████████████████████████| 156 kB 14.4 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 59.7 MB/s 
Collecting boto3
  Downloading boto3-1.18.18-py3-none-any.whl (131 kB)
[K     |████████████████████████████████| 131 kB 72.4 MB/s 
Collecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 10.2 MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting botocore<1.22.0,>=1.21.18
  Downloading botocore-1.21.18-py3-none-any.whl (7.8 MB)
[K     |████████████████████████████████| 7.8 MB 61.0 MB/s 
[?25hCollecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.6-py2.py3-none-any.whl (138 kB)
[K     |███████████████████████████

In [None]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import TensorDataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.dpi'] = 200

## Prepare data

In [None]:
# Fetch the hand-coded migration articles (comma-separated CSV hosted on Dropbox)
DATA_URL = 'https://www.dropbox.com/s/t47m1bppr3f1na5/handcoding_finished.csv?dl=1'

print('Loading data...')
dta = pd.read_csv(DATA_URL, sep=',')
print('\tDone!')

Loading data...
	Done!


In [None]:
# Binary outcome: True where the hand-coded `mig` column equals 'Ja'
dta['mig_bin'] = dta['mig'].eq('Ja')

In [None]:
# Shuffle all rows with a fixed seed, then renumber the index from 0
dta = (
    dta
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Label every row with its split: the first 1400 rows are 'train',
# the next 200 'test' and the last 200 'val'
split_sizes = (('train', 1400), ('test', 200), ('val', 200))
splits = [name for name, size in split_sizes for _ in range(size)]
dta['split'] = splits

# Carve out the three subsets, keeping only the text and the binary label
model_cols = ['text', 'mig_bin']
train, test, val = (
    dta.loc[dta['split'] == name, model_cols].reset_index(drop=True)
    for name in ('train', 'test', 'val')
)

# Report the (rows, columns) shape of each subset, one per line
print(*(subset.shape for subset in (train, test, val)), sep='\n')


(1400, 2)
(200, 2)
(200, 2)


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Name of the pretrained checkpoint; the tokenizer's maximum length is set to 512
model_name = 'bert-base-german-cased'
BERT_tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

In [None]:
# Tokenize the three splits with identical settings: pad to `max_length`,
# truncate longer texts, and return PyTorch tensors
train_encodings, test_encodings, val_encodings = (
    BERT_tokenizer(
        frame['text'].tolist(),
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    for frame in (train, test, val)
)

In [None]:
# Convert the boolean outcome of each split into a list of 0/1 integers
train_labels, test_labels, val_labels = (
    [int(bool(flag)) for flag in frame['mig_bin']]
    for frame in (train, test, val)
)

In [None]:
import torch

class MigDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: Mapping from feature name to an indexable batch of values
            (in this notebook: the tensors returned by the tokenizer with
            ``return_tensors='pt'``); ``encodings[key][idx]`` must hold the
            features of example ``idx``.
        labels: Sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that does not share memory with the stored data."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every item access;
            # clone().detach() is the recommended way to copy an existing tensor.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return the features of example ``idx`` plus its label under the key 'labels'."""
        item = {key: self._to_tensor(values[idx]) for key, values in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

# Wrap each split's encodings and labels in a MigDataset
train_dataset, val_dataset, test_dataset = (
    MigDataset(encodings, labels)
    for encodings, labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

## Setup training, train

In [None]:
# Load the pretrained checkpoint named by `model_name` as a sequence-classification model
BERT_model = AutoModelForSequenceClassification.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute binary classification metrics for one evaluation pass.

    Args:
        pred: Prediction object exposing ``label_ids`` (the reference labels)
            and ``predictions`` (an array; the arg-max over its last axis is
            taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    gold = pred.label_ids
    predicted = pred.predictions.argmax(-1)
    # The fourth returned element is not reported
    prec, rec, f_score, _unused = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )
    return dict(
        accuracy=accuracy_score(gold, predicted),
        f1=f_score,
        precision=prec,
        recall=rec,
    )

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. The validation set is evaluated after every epoch and
# those metrics are used for model selection: a checkpoint is written per epoch
# and the one with the best validation F1 is restored when training ends, so
# that later saving/testing does not simply use the last (possibly overfitted) epoch.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    evaluation_strategy="epoch",     # evaluate on the validation set after each epoch
    save_strategy="epoch",           # checkpoint on the same schedule as evaluation
    load_best_model_at_end=True,     # restore the best checkpoint after training
    metric_for_best_model="f1",      # 'f1' key returned by compute_metrics; higher is better
    save_total_limit=2               # cap the number of checkpoints kept on disk
)

trainer = Trainer(
    model=BERT_model,                    # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,            # evaluation dataset
    compute_metrics=compute_metrics      # metrics reported at every evaluation
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Run fine-tuning with the configured arguments; the returned training summary is displayed as cell output
trainer.train()

***** Running training *****
  Num examples = 1400
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 880
  if __name__ == '__main__':


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.271432,0.9,0.868421,0.814815,0.929577
2,0.174700,0.247389,0.92,0.888889,0.876712,0.901408
3,0.147100,0.421865,0.89,0.838235,0.876923,0.802817
4,0.122300,0.428121,0.915,0.877698,0.897059,0.859155
5,0.048600,0.625781,0.895,0.864516,0.797619,0.943662
6,0.038900,0.494578,0.91,0.865672,0.920635,0.816901
7,0.055000,0.628562,0.91,0.867647,0.907692,0.830986
8,0.003000,0.653055,0.915,0.881119,0.875,0.887324
9,0.003000,0.751069,0.91,0.875,0.863014,0.887324
10,0.001300,0.75805,0.905,0.868966,0.851351,0.887324


***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
  if __name__ == '__main__':
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
***** Running Evaluation *****
  Num examples = 200
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=880, training_loss=0.06716629793306558, metrics={'train_runtime': 784.9995, 'train_samples_per_second': 17.834, 'train_steps_per_second': 1.121, 'total_flos': 3683554775040000.0, 'train_loss': 0.06716629793306558, 'epoch': 10.0})

In [None]:
# Save the trained model to disk.
# NOTE(review): the relative `drive/MyDrive/...` path presumably relies on Google
# Drive being mounted under the working directory; no mount step is visible in
# this notebook — confirm before running on a fresh runtime.
trainer.save_model("drive/MyDrive/Bild/mig_clsfr_BERT_torch")

Saving model checkpoint to drive/MyDrive/Bild/mig_clsfr_BERT_torch
Configuration saved in drive/MyDrive/Bild/mig_clsfr_BERT_torch/config.json
Model weights saved in drive/MyDrive/Bild/mig_clsfr_BERT_torch/pytorch_model.bin


## Assess performance

In [None]:
# Evaluate the trained model on the held-out test set; the returned metrics dict is displayed as cell output
trainer.evaluate(test_dataset)

***** Running Evaluation *****
  Num examples = 200
  Batch size = 64
  if __name__ == '__main__':


{'epoch': 10.0,
 'eval_accuracy': 0.955,
 'eval_f1': 0.9403973509933775,
 'eval_loss': 0.3630613088607788,
 'eval_precision': 0.9466666666666667,
 'eval_recall': 0.9342105263157895,
 'eval_runtime': 3.465,
 'eval_samples_per_second': 57.721,
 'eval_steps_per_second': 1.154}

94% F1 on the held-out test set — this is amazing!

In [None]:
# Open TensorBoard on the `logs` directory (the `logging_dir` passed to TrainingArguments)
%load_ext tensorboard
%tensorboard --logdir logs

## Assess features with [BERTviz](https://colab.research.google.com/drive/1YoJqS9cPGu3HL2_XExw3kCsRBtySQS2v?usp=sharing)

(Currently not working; the code would have to be rewritten in PyTorch.)

In [None]:
from bertviz import model_view, head_view
from bertviz.neuron_view import show

In [None]:
# Load model and retrieve attention weights

from bertviz import head_view, model_view
# NOTE(review): BertTokenizer is imported but not used in this cell.
from transformers import BertTokenizer, BertModel

# Same directory the trainer saved the fine-tuned model to
model_version = "drive/MyDrive/Bild/mig_clsfr_BERT_torch"
# NOTE(review): `do_lower_case` is assigned but never passed to anything visible
# in this notebook (and the checkpoint name says "cased") — confirm it can go.
do_lower_case = True
# Request attention weights as part of the model output
model = BertModel.from_pretrained(model_version, output_attentions=True)
# The tokenizer is loaded from the base checkpoint name, not from the saved directory
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = 512)
# Example sentence pair to visualise
sentence_a = "Deutschland schiebt vorerst keine Menschen mehr nach Afghanistan ab."
sentence_b = 'Der Bundesinnenminister hat aufgrund der aktuellen Entwicklungen der Sicherheitslage entschieden, Abschiebungen nach Afghanistan zunächst auszusetzen'
# Encode both sentences together as one input
inputs = tokenizer.encode_plus(sentence_a, sentence_b, return_tensors='pt', add_special_tokens=True)
input_ids = inputs['input_ids']
token_type_ids = inputs['token_type_ids']
# Take the last element of the model output — presumably the attention weights,
# given output_attentions=True; TODO confirm against the installed transformers version
attention = model(input_ids, token_type_ids=token_type_ids)[-1]
# Position of the first token with token type 1.
# NOTE(review): not passed to the model_view/head_view calls visible in this notebook.
sentence_b_start = token_type_ids[0].tolist().index(1)
input_id_list = input_ids[0].tolist() # Batch index 0
# Map the ids back to token strings for display
tokens = tokenizer.convert_ids_to_tokens(input_id_list) 

In [None]:
# Render BERTviz's model view for the attention weights and tokens prepared above
model_view(attention, tokens)

<IPython.core.display.Javascript object>

In [None]:
# Render BERTviz's head view for the same attention weights and tokens
head_view(attention, tokens)