In [1]:
! pip install bert-for-sequence-classification

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-for-sequence-classification
  Downloading bert_for_sequence_classification-0.0.4-py3-none-any.whl (14 kB)
Collecting transformers>=4.2.0
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 3.9 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 57.9 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 92.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers, bert-for-sequence-classification
Successfully installed bert-for-sequence-classification-0.0.4 huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import json

from transformers import AutoModel, AutoTokenizer

from bert_clf import BertCLF, train_evaluate, predict_metrics, prepare_data_notebook, prepare_dataset
from bert_clf.utils import set_global_seed

### Prepare UC-UNSC dataset for testing

In [3]:
df = pd.read_csv('sentence_full.csv', sep =',')

In [4]:
df

Unnamed: 0,Sentence,Label,Components
0,"Once again, since the last briefing to the Cou...",claim,{'the situation regarding Ukraine has seriousl...
1,This is now the tenth time that the Council ha...,none,{}
2,The General Assembly also took up the matter o...,none,{}
3,"Following close to two weeks of relative calm,...",premise,{'Following close to two weeks of relative cal...
4,The individuals involved called for secession ...,premise,{'The individuals involved called for secessio...
...,...,...,...
4746,We welcome Italy's decision to designate dialo...,claim,"{""We welcome Italy's decision to designate dia..."
4747,China supports practical and effective coopera...,claim,{'China supports practical and effective coope...
4748,We welcome all the positive efforts being made...,claim,{'We welcome all the positive efforts being ma...
4749,We hope that all the parties concerned will wo...,claim,{'We hope that all the parties concerned will ...


In [5]:
df['Label'] = df['Label'].str.replace('claim','Arg')
df['Label'] = df['Label'].str.replace('premise','Arg')
df['Label'] = df['Label'].str.replace('none','O')

In [6]:
df

Unnamed: 0,Sentence,Label,Components
0,"Once again, since the last briefing to the Cou...",Arg,{'the situation regarding Ukraine has seriousl...
1,This is now the tenth time that the Council ha...,O,{}
2,The General Assembly also took up the matter o...,O,{}
3,"Following close to two weeks of relative calm,...",Arg,{'Following close to two weeks of relative cal...
4,The individuals involved called for secession ...,Arg,{'The individuals involved called for secessio...
...,...,...,...
4746,We welcome Italy's decision to designate dialo...,Arg,"{""We welcome Italy's decision to designate dia..."
4747,China supports practical and effective coopera...,Arg,{'China supports practical and effective coope...
4748,We welcome all the positive efforts being made...,Arg,{'We welcome all the positive efforts being ma...
4749,We hope that all the parties concerned will wo...,Arg,{'We hope that all the parties concerned will ...


In [7]:
utest = df

### USElecDeb dataset

In [8]:
df = pd.read_csv('sentence_db_candidate.csv')

In [9]:
df.shape

(29621, 18)

In [10]:
df.head()

Unnamed: 0,Text,Part,Document,Order,Sentence,Start,End,Annotator,Tag,Component,Speech,Speaker,SpeakerType,Set,Date,Year,Name,MainTag
0,"CHENEY: Gwen, I want to thank you, and I want ...",1,30_2004,0,0,2101,2221,,"{""O"": 27}",O,"Gwen, I want to thank you, and I want to than...",CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O
1,"It's a very important event, and they've done ...",1,30_2004,1,1,2221,2304,,"{""O"": 19}",O,"It's a very important event, and they've done ...",CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O
2,It's important to look at all of our developme...,1,30_2004,2,2,2304,2418,,"{""O"": 23}",O,It's important to look at all of our developme...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,O
3,"And, after 9/11, it became clear that we had t...",1,30_2004,3,3,2418,2744,,"{""O"": 16, ""Claim"": 50}",Claim,"And, after 9/11, it became clear that we had t...",CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Claim
4,And we also then finally had to stand up democ...,1,30_2004,4,4,2744,2974,,"{""O"": 4, ""Claim"": 13, ""Premise"": 25}",Premise,And we also then finally had to stand up democ...,CHENEY,Candidate,TRAIN,05 Oct 2004,2004,Richard(Dick) B. Cheney,Mixed


In [11]:
df['Component'] = df['Component'].str.replace('Claim','Arg')
df['Component'] = df['Component'].str.replace('Premise','Arg')

In [12]:
df['Component'].value_counts()

Arg    22280
O       7252
Name: Component, dtype: int64

In [13]:
df = df.dropna(subset=['Component'])

In [14]:
df.shape

(29532, 18)

In [15]:
#splitting as the authors did 
df_train = df[df['Set'] == 'TRAIN']
df_val = df[df['Set'] == 'VALIDATION']
df_test = df[df['Set'] == 'TEST']

df_train = df_train[['Speech', 'Component']]
df_val = df_val[['Speech', 'Component']]
df_test = df_test[['Speech', 'Component']]

In [16]:
print(df_train.shape, df_val.shape, df_test.shape)

(14044, 2) (7033, 2) (8455, 2)


In [17]:
df_test['Component'].value_counts()

Arg    6575
O      1880
Name: Component, dtype: int64

### Transformer Language Model

In [18]:
config = dict(
    transformer_model = dict(
        model = "chkla/roberta-argument",
        path_to_state_dict = False,
        device = 'cuda',
        dropout = 0.2,
        learning_rate = 2e-5,
        batch_size = 16,
        shuffle = True,
        maxlen = 128,
    ),
    data = dict(
        train_data_path = df_train,
        test_data_path = df_val,
        text_column = "Speech",
        target_column = "Component",
        random_state = 52,
        test_size = 0.3,
        stratify=True
    ),
    training = dict (
    save_state_dict = False, # if False the model will be saved using torch.save(<model_class>)
        # and should be loaded like this: model = torch.load()
        # you will have to install the library to do so
    early_stopping = True,
    delta = 0.001,
    patience = 7,
    num_epochs = 2,
    average_f1 = 'macro',
    other_metrics = ['micro', 'weighted'],
    output_dir = "../results/",
    class_weight = True
    )
)

In [19]:
set_global_seed(seed=config['data']['random_state'])
os.makedirs(config['training']['output_dir'], exist_ok=True)

In [20]:
device = torch.device(config['transformer_model']['device'])
tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=config['transformer_model']["model"]
    )
model_bert = AutoModel.from_pretrained(
    pretrained_model_name_or_path=config['transformer_model']["model"]
).to(device)

#for param in model_bert.parameters():
    #param.requires_grad = False

Downloading:   0%|          | 0.00/255 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/790 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at chkla/roberta-argument were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at chkla/roberta-argument and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
id2label, train_texts, valid_texts, train_targets, valid_targets = prepare_data_notebook(
    config=config, train_df = df_train, test_df = df_val
)

In [22]:
id2label 

{0: 'O', 1: 'Arg'}

In [42]:
model = BertCLF(
    pretrained_model=model_bert,
    tokenizer=tokenizer,
    id2label=id2label,
    dropout=config['transformer_model']['dropout'],
    device=device     
    )

In [43]:
model = model.to(device)

In [44]:
optimizer = optim.Adam(model.parameters(), lr=float(config['transformer_model']['learning_rate']))
criterion = nn.NLLLoss()

training_generator, valid_generator = prepare_dataset(
    tokenizer=tokenizer,
    train_texts=train_texts,
    train_targets=train_targets,
    valid_texts=valid_texts,
    valid_targets=valid_targets,
    config=config
)

In [45]:
model = train_evaluate(
    model=model,
    training_generator=training_generator,
    valid_generator=valid_generator,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=config['training']['num_epochs'],
    average=config['training']['average_f1'],
    config=config
)

==== Epoch 1 out of 2 ====


Training loop: 100%|██████████| 878/878 [05:03<00:00,  2.90it/s]
Evaluating loop: 100%|██████████| 440/440 [00:48<00:00,  9.14it/s]


Train F1: 0.8105865676901391
Eval F1: 0.7316440679328298

Train F1 micro: 0.8799117312072893
Eval F1 micro: 0.8244633838383839

Train F1 weighted: 0.8714284053801619
Eval F1 weighted: 0.8188784089282282

==== Epoch 2 out of 2 ====


Training loop: 100%|██████████| 878/878 [05:12<00:00,  2.81it/s]
Evaluating loop: 100%|██████████| 440/440 [00:47<00:00,  9.19it/s]


Train F1: 0.8687871875106898
Eval F1: 0.7183150165518366

Train F1 micro: 0.9133447228549735
Eval F1 micro: 0.8290088383838384

Train F1 weighted: 0.9101267361928317
Eval F1 weighted: 0.8116480275356139




Computing final metrics...: 100%|██████████| 440/440 [00:47<00:00,  9.19it/s]


              precision    recall  f1-score   support

         Arg       0.84      0.95      0.89      5241
           O       0.75      0.49      0.59      1792

    accuracy                           0.83      7033
   macro avg       0.80      0.72      0.74      7033
weighted avg       0.82      0.83      0.82      7033



In [47]:
model.to('cpu')

BertCLF(
  (pretrained_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm(

In [28]:
preds = []
for i,j in zip(df_test['Speech'], df_test['Component']):
    preds.append([model.predict(i), j, i])

In [29]:
pred = []
for i in preds:
    pred.append(i[0])

true = []
for m in preds:
    true.append(m[1])

In [30]:
from sklearn.metrics import classification_report
# roberta- arg on sentence level, USElecDeb corpus, task 1

target_names = ['class 0', 'class 1']
print(classification_report(true, pred, target_names=target_names, digits=3))

              precision    recall  f1-score   support

     class 0      0.872     0.956     0.912      6575
     class 1      0.769     0.510     0.613      1880

    accuracy                          0.857      8455
   macro avg      0.820     0.733     0.763      8455
weighted avg      0.849     0.857     0.846      8455



### UC-UNSC Testing

In [48]:
preds = []
for i,j in zip(utest['Sentence'], utest['Label']):
    preds.append([model.predict(i), j, i])

In [49]:
pred = []
for i in preds:
    pred.append(i[0])

true = []
for m in preds:
    true.append(m[1])

In [50]:
from sklearn.metrics import classification_report
# roberta-arg sentence level, task 1, our corpus 

target_names = ['class 0', 'class 1']
print(classification_report(true, pred, target_names=target_names, digits=3))

              precision    recall  f1-score   support

     class 0      0.865     0.964     0.911      3814
     class 1      0.723     0.386     0.503       937

    accuracy                          0.850      4751
   macro avg      0.794     0.675     0.707      4751
weighted avg      0.837     0.850     0.831      4751



In [None]:
dummy = np.ones(4751)

In [None]:
# baseline for task 1 on UN-UNSC
target_names = ['class 0', 'class 1']
print(classification_report(true, dummy, target_names=target_names, digits=3))

              precision    recall  f1-score   support

     class 0      0.000     0.000     0.000       937
     class 1      0.803     1.000     0.891      3814

    accuracy                          0.803      4751
   macro avg      0.401     0.500     0.445      4751
weighted avg      0.644     0.803     0.715      4751



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
