### BERTweet

I would like to thank Maria Antoniak for her tutorial in [NLP+CSS 201](https://nlp-css-201-tutorials.github.io/nlp-css-201-tutorials/), *BERT for Computational Social Scientists*.

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
%cd /content/gdrive/My\ Drive/vaccine_tweets

Mounted at /content/gdrive
/content/gdrive/My Drive/vaccine_tweets


In [None]:
import pandas as pd
# Load the labelled tweets; later cells read the 'text' and 'mandate_m' columns.
df = pd.read_csv('training.csv')

In [None]:
# Shuffle the rows and compute the sizes of the train / eval / test splits:
# 60% train, 20% eval, and whatever remains (about 20%) as test.
import math
train_size = math.floor(df.shape[0] * 0.6)
eval_size = math.floor(df.shape[0] * 0.2)
print(train_size)
print(eval_size)
# A fixed random_state makes the shuffle -- and therefore which rows land in
# which split -- reproducible across notebook runs.
# reset_index() (without drop=True) keeps the pre-shuffle row labels in a new
# 'index' column, so shuffled rows can still be traced back to training.csv.
df = df.sample(frac=1, random_state=42).reset_index()

In [None]:
# Raw tweet text, one entry per row of the (already shuffled) df.
corpus = df['text']

In [None]:
!pip install nltk
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.1.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 5.1 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.1.0-py3-none-any.whl size=212392 sha256=9f8d035dd06d974a79971f3eed6a51ddd78aadfca4671a2a2bf2bba2b189162d
  Stored in directory: /root/.cache/pip/wheels/77/75/99/51c2a119f4cfd3af7b49cc57e4f737bed7e40b348a85d82804
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.1.0


Use the text preprocessor consistent with BERTweet's pre-training.

In [None]:
from nltk.tokenize import TweetTokenizer
from emoji import demojize
import re

tokenizer = TweetTokenizer()

def normalizeToken(token):
    """Map one tweet token to its normalized form.

    Args:
        token: A single token string.

    Returns:
        ``"@USER"`` for tokens starting with ``@``; ``"HTTPURL"`` for tokens
        starting (case-insensitively) with ``http`` or ``www``; ``"'"`` for
        the typographic apostrophe and ``"..."`` for the ellipsis character;
        the result of ``demojize`` for any other one-character token; the
        token unchanged otherwise.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "HTTPURL"
    # Both of these are single characters, so they have to be tested before
    # the length check below. They used to sit in the final ``else`` branch,
    # which a one-character token can never reach.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token

def normalizeTweet(tweet):
    """Return the normalized, single-spaced form of a tweet.

    The tweet is tokenized with the module-level ``tokenizer``, every token
    is passed through ``normalizeToken``, and a fixed sequence of literal and
    regex rewrites is applied to the re-joined text. Each rewrite runs on the
    output of the previous one, so the order of the tables below matters.

    Args:
        tweet: The raw tweet text.

    Returns:
        The normalized text with runs of whitespace collapsed to one space.
    """
    # Literal (old, new) substitutions, applied top to bottom.
    literal_rewrites = (
        ("cannot ", "can not "),
        ("n't ", " n't "),
        ("n 't ", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m ", " 'm "),
        ("'re ", " 're "),
        ("'s ", " 's "),
        ("'ll ", " 'll "),
        ("'d ", " 'd "),
        ("'ve ", " 've "),
        (" p . m .", "  p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    # Regex (pattern, replacement) substitutions that re-join digit groups
    # the tokenizer separated around ',', '/' and '-'.
    pattern_rewrites = (
        (r",([0-9]{2,4}) , ([0-9]{2,4})", r",\1,\2"),
        (r"([0-9]{1,3}) / ([0-9]{2,4})", r"\1/\2"),
        (r"([0-9]{1,3})- ([0-9]{2,4})", r"\1-\2"),
    )

    # Replace the typographic apostrophe / ellipsis before tokenizing.
    cleaned = tweet.replace("’", "'").replace("…", "...")
    text = " ".join(normalizeToken(tok) for tok in tokenizer.tokenize(cleaned))

    for old, new in literal_rewrites:
        text = text.replace(old, new)
    for pattern, replacement in pattern_rewrites:
        text = re.sub(pattern, replacement, text)

    return " ".join(text.split())

In [None]:
import concurrent
from concurrent.futures import ThreadPoolExecutor

# Normalize every tweet with a pool of 10 worker threads.
with ThreadPoolExecutor(max_workers=10) as executor:
    # Executor.map yields results in input order, so `corpus` keeps the same
    # row order as before.
    corpus = list(executor.map(normalizeTweet, corpus))

In [None]:
# Cut the normalized corpus into consecutive train / eval / test segments.
X_train = corpus[:train_size]
X_eval = corpus[train_size:train_size + eval_size]
X_test = corpus[train_size + eval_size:]

In [None]:
! pip install torch
import torch
!pip install transformers
import transformers

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 4.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 75.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 94.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
# Name of the pre-trained BERTweet checkpoint to fine-tune. The base-size
# checkpoint is used; the large one is kept below as an alternative.
# bert = 'vinai/bertweet-large'
bert = 'vinai/bertweet-base'
# Device the model is moved to: the GPU ('cuda') when PyTorch can see one,
# otherwise the CPU, so the notebook does not fail on a CPU-only runtime.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
# NOTE(review): normalization=True presumably turns on the tokenizer's own
# tweet normalization, on top of normalizeTweet above -- confirm against the
# BERTweet tokenizer documentation.
bert_tokenizer = AutoTokenizer.from_pretrained(bert, normalization=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one label per example. Indexing returns a
    dict holding every encoding field for that example plus a ``'labels'``
    entry, all converted to tensors.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Tokenize each split with identical settings: sequences longer than
# max_length tokens are truncated and shorter ones are padded.
max_length = 100
train_encodings, eval_encodings, test_encodings = (
    bert_tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    for texts in (X_train, X_eval, X_test)
)

In [None]:
# Hyper-parameters and bookkeeping settings handed to the Trainer.
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir='./results',
    logging_dir='./logs',
    # optimisation schedule
    num_train_epochs=5,
    learning_rate=5e-5,               # initial learning rate
    warmup_steps=50,                  # kept low because the dataset is small
    weight_decay=0.01,
    # batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=20,
    # log every 50 steps (low because the dataset is small) and evaluate
    # during fine-tuning so progress is visible
    logging_steps=50,
    evaluation_strategy='steps',
)

In [None]:
# Inspect which fields the tokenizer produced for each example.
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [None]:
import sklearn
def compute_metrics(pred):
    """Return the F1 score for one round of evaluation.

    Args:
        pred: Object exposing ``label_ids`` (the gold labels) and
            ``predictions`` (per-class scores, class axis last).

    Returns:
        A dict with the single key ``'f1_score'``.

    NOTE(review): ``sklearn.metrics`` is reached through the bare
    ``import sklearn``; this presumably works only because the submodule has
    been loaded by the time the function runs -- confirm.
    """
    gold = pred.label_ids
    # The predicted class is the index of the largest score in each row.
    predicted = pred.predictions.argmax(-1)
    return {'f1_score': sklearn.metrics.f1_score(gold, predicted)}

In [None]:
from sklearn import preprocessing, utils
# Encode the values of the 'mandate_m' column as integer class ids.
le_Y = preprocessing.LabelEncoder()
Y_strings = df['mandate_m']
Y = le_Y.fit_transform(Y_strings)

# Lookup table from integer class id back to the original label value.
encode_dic = dict(enumerate(le_Y.classes_))

# Same consecutive train / eval / test boundaries as used for the texts.
Y_train = Y[:train_size]
Y_eval = Y[train_size:train_size + eval_size]
Y_test = Y[train_size + eval_size:]

In [None]:
# Wrap each split's encodings and labels in a MyDataset.
train_Y_dataset, eval_Y_dataset, test_Y_dataset = (
    MyDataset(encodings, list(labels))
    for encodings, labels in (
        (train_encodings, Y_train),
        (eval_encodings, Y_eval),
        (test_encodings, Y_test),
    )
)

In [None]:
# Load the checkpoint with a sequence-classification head that has one
# output per label class, then move the model to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    bert,
    num_labels=len(le_Y.classes_),
)
model = model.to(device_name)

Downloading:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [None]:
# Bundle the model, training settings, datasets and metric into a Trainer.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_Y_dataset,         # training split
    eval_dataset=eval_Y_dataset,           # held-out eval (validation) split; the test split is not passed here
    compute_metrics=compute_metrics      # our custom evaluation function (F1 score)
)

In [None]:
# Fine-tune the model; training loss and eval metrics are reported every 50 steps.
trainer.train()

***** Running training *****
  Num examples = 2449
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 770


Step,Training Loss,Validation Loss,F1 Score
50,0.5624,0.359127,0.679426
100,0.3521,0.233348,0.822134
150,0.2898,0.246177,0.834586
200,0.2515,0.370059,0.773333
250,0.1938,0.256954,0.835165
300,0.2022,0.232637,0.84
350,0.1036,0.300577,0.85124
400,0.121,0.339517,0.810219
450,0.1157,0.262199,0.848739
500,0.0955,0.352407,0.802974


***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 524
  Batch size = 20
***** Running Evaluation *****
  Num examples = 524


TrainOutput(global_step=770, training_loss=0.16194064771974243, metrics={'train_runtime': 295.2364, 'train_samples_per_second': 41.475, 'train_steps_per_second': 2.608, 'total_flos': 629256811110000.0, 'train_loss': 0.16194064771974243, 'epoch': 5.0})

In [None]:
# Run the fine-tuned model over the train and eval splits. The returned
# objects expose the raw per-class scores as `.predictions`.
Y_training_score = trainer.predict(train_Y_dataset)
Y_eval_score = trainer.predict(eval_Y_dataset)

***** Running Prediction *****
  Num examples = 2449
  Batch size = 20


***** Running Prediction *****
  Num examples = 524
  Batch size = 20


In [None]:
# Preview the raw per-class scores of the first 10 eval examples.
# This must read from Y_eval_score (the trainer.predict result): Y_eval_star
# is only created in a later cell and is a plain list of class ids without a
# `.predictions` attribute.
pd.DataFrame(Y_eval_score.predictions[:10], columns=['score0', 'score1'])

Unnamed: 0,score0,score1
0,2.96993,-3.106414
1,3.078438,-3.279341
2,3.083519,-3.241588
3,3.075743,-3.250585
4,3.035839,-3.193835
5,3.080025,-3.284057
6,-2.412523,2.685285
7,3.062514,-3.230369
8,2.908291,-3.044233
9,3.056684,-3.235869


In [None]:
# Predicted class id for each example: the index of the highest score in its
# row, returned as a flat Python list.
Y_training_star = Y_training_score.predictions.argmax(axis=-1).flatten().tolist()
Y_eval_star = Y_eval_score.predictions.argmax(axis=-1).flatten().tolist()

In [None]:
# Run the fine-tuned model over the held-out test split.
Y_test_score = trainer.predict(test_Y_dataset)

***** Running Prediction *****
  Num examples = 526
  Batch size = 20


In [None]:
# Predicted class id for each test example: the index of the highest score
# in its row, returned as a flat Python list.
Y_test_star = Y_test_score.predictions.argmax(axis=-1).flatten().tolist()

In [None]:
# Per-class precision / recall / F1 on the test split (gold labels first, predictions second).
print(sklearn.metrics.classification_report(Y_test,Y_test_star))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       385
           1       0.86      0.89      0.88       141

    accuracy                           0.93       526
   macro avg       0.91      0.92      0.91       526
weighted avg       0.93      0.93      0.93       526



In [None]:
# Confusion matrix on the test split (rows: gold class, columns: predicted).
# The predictions for the test split are stored in Y_test_star; the name
# Y_star is not defined anywhere in this notebook.
sklearn.metrics.confusion_matrix(Y_test, Y_test_star)

array([[364,  21],
       [ 15, 126]])

In [None]:
# Recall for class 1 on the test split: 126 correctly predicted out of the
# 141 gold class-1 rows (bottom row of the confusion matrix above).
126/141

0.8936170212765957

In [None]:
# Persist the fine-tuned model (config and weights) to disk.
trainer.save_model('bertweet20221020/trained')

Saving model checkpoint to bertweet20221020/trained
Configuration saved in bertweet20221020/trained/config.json
Model weights saved in bertweet20221020/trained/pytorch_model.bin


In [None]:
# Persist the tokenizer files next to the saved model so both can be reloaded together.
bert_tokenizer.save_pretrained('bertweet20221020/tokenizer')

tokenizer config file saved in bertweet20221020/tokenizer/tokenizer_config.json
Special tokens file saved in bertweet20221020/tokenizer/special_tokens_map.json
added tokens file saved in bertweet20221020/tokenizer/added_tokens.json


('bertweet20221020/tokenizer/tokenizer_config.json',
 'bertweet20221020/tokenizer/special_tokens_map.json',
 'bertweet20221020/tokenizer/vocab.txt',
 'bertweet20221020/tokenizer/bpe.codes',
 'bertweet20221020/tokenizer/added_tokens.json')

In [None]:
# Stack the raw per-class scores of the three splits in train -> eval -> test
# order, i.e. the same order in which the rows of df were split.
# pd.concat replaces the chained DataFrame.append calls: append is deprecated
# and no longer exists in pandas 2.0.
df_score = pd.concat(
    [
        pd.DataFrame(split_score.predictions, columns=['score0', 'score1'])
        for split_score in (Y_training_score, Y_eval_score, Y_test_score)
    ]
)

In [None]:
# Attach the score columns to df side by side. Both frames get a fresh
# 0..n-1 index first so the rows are aligned by position.
df = pd.concat([df.reset_index(drop=True), df_score.reset_index(drop=True)], axis=1)

In [None]:
# Create the column that records which split each row was used in.
df['usage']=''

In [None]:
# Label every row with the split it belongs to.
# The write goes through df.iloc in a single indexing step; the chained form
# df['usage'].iloc[...] = ... assigns to an intermediate object, raises
# SettingWithCopyWarning and is not guaranteed to update df.
df.iloc[:train_size, df.columns.get_loc('usage')] = 'train'
df.iloc[train_size:(train_size + eval_size), df.columns.get_loc('usage')] = 'eval'
df.iloc[(train_size + eval_size):, df.columns.get_loc('usage')] = 'test'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
import numpy as np
# Store the predicted class id of every row, filled split by split.
# The column starts as NaN so any row left unfilled would be visible.
df['pred'] = np.nan
# Write through df.iloc in a single indexing step; the chained form
# df['pred'].iloc[...] = ... assigns to an intermediate object and is not
# guaranteed to update df.
df.iloc[:train_size, df.columns.get_loc('pred')] = Y_training_star
df.iloc[train_size:(train_size + eval_size), df.columns.get_loc('pred')] = Y_eval_star
df.iloc[(train_size + eval_size):, df.columns.get_loc('pred')] = Y_test_star