In [None]:
# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from google.colab import drive

In [None]:
from datasets import (Dataset,
                      DatasetDict)

from transformers import (DataCollatorWithPadding,
                          # AutoModelForSequenceClassification
                          # Trainer,
                          # TrainingArguments,
                          AutoTokenizer,
                          AutoModel,
                          AutoConfig)
from transformers.modeling_outputs import TokenClassifierOutput

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [None]:
# Read the posts CSV from the Colab filesystem and preview the first five rows.
raw_df = pd.read_csv('/content/one_post_per_row.csv')
raw_df.head(5)

Unnamed: 0,type,post,type_index
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,1
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,1
2,INFJ,enfp and intj moments https://www.youtube.com...,1
3,INFJ,What has been the most life-changing experienc...,1
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,1


In [None]:
# Rename to the column names used downstream, keep only the text and its
# integer class id, and discard rows that have no post text.
df = (
    raw_df
    .rename(columns={'type': 'label',  'post': 'sentence', 'type_index': 'label_int',})
    .loc[:, ['sentence', 'label_int']]
    .dropna(subset=['sentence'])
)

In [None]:
# Convert the cleaned frame into a Hugging Face Dataset.
# preserve_index=False keeps the pandas index out of the dataset; without it the
# index left over from dropna() is stored as a spurious `__index_level_0__` feature.
dataset = Dataset.from_pandas(df, preserve_index=False)

# 80% train / 20% held out, with a fixed seed so the split is reproducible.
train_testvalid = dataset.train_test_split(test_size=0.2, seed=15, shuffle=True)

# Split the held-out 20% in half: one half for validation, one for test.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=15, shuffle=True)

dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__'],
        num_rows: 337401
    })
    test: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__'],
        num_rows: 42176
    })
    valid: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__'],
        num_rows: 42175
    })
})

In [None]:
# Hub checkpoint to load; use_fast=False requests the Python ("slow") tokenizer implementation.
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.91M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [None]:
# Display the tokenizer's repr (vocabulary size, special tokens, padding/truncation side).
tokenizer

BertweetTokenizer(name_or_path='vinai/bertweet-base', vocab_size=64000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	64000: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# Sanity check: show the integer labels of the first five training examples.
print(dataset["train"]["label_int"][0:5])

[13, 8, 1, 3, 1]


In [None]:
# Integer class id for each of the 16 MBTI types; ids follow the listing order below.
mbti_encoding = {
    mbti_type: class_id
    for class_id, mbti_type in enumerate((
        'INFP', 'INFJ', 'INTP', 'INTJ',
        'ISFP', 'ISFJ', 'ISTP', 'ISTJ',
        'ENFP', 'ENFJ', 'ENTP', 'ENTJ',
        'ESFP', 'ESFJ', 'ESTP', 'ESTJ',
    ))
}

In [None]:
def tokenize(batch):
    """Tokenize a batch of posts for the model.

    Args:
        batch: Mapping with a "sentence" entry holding the post strings, as
            passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer's encoding of the batch, truncated to at most 128 tokens
        per post and left unpadded.
    """
    # Padding and tensor conversion are deliberately left out here: the
    # DataCollatorWithPadding used by the DataLoaders pads each training batch
    # to its own longest sequence, so padding whole map batches up front only
    # stores and processes extra pad tokens.
    return tokenizer(batch["sentence"], truncation=True, max_length=128)

# Takes ~3 minutes
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/337401 [00:00<?, ? examples/s]

Map:   0%|          | 0/42176 [00:00<?, ? examples/s]

Map:   0%|          | 0/42175 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 337401
    })
    test: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 42176
    })
    valid: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 42175
    })
})

In [None]:

# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Expose only the model inputs and the integer label, returned as torch tensors.
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label_int"],
)

In [None]:
# Display the splits and their features after tokenization and formatting.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 337401
    })
    test: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 42176
    })
    valid: Dataset({
        features: ['sentence', 'label_int', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 42175
    })
})

In [None]:
# Display the collator's configuration (wrapped tokenizer and padding settings).
data_collator

DataCollatorWithPadding(tokenizer=BertweetTokenizer(name_or_path='vinai/bertweet-base', vocab_size=64000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	64000: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None,

In [None]:
class BERTweetClassifier(nn.Module):
    """Sequence classifier: an encoder followed by a small two-layer head.

    The encoder is frozen except for its last two transformer layers. The head
    is dropout -> linear (hidden_size -> hidden_size // 2) -> ReLU -> linear
    (-> num_classes), applied to the first token's final hidden state.

    Args:
        bertweet_model: Encoder module. It must expose `config.hidden_size` and
            `encoder.layer` (a sliceable container of layers), and, when called
            with `input_ids` / `attention_mask`, return an object with a
            `last_hidden_state` attribute (optionally also `attentions` and
            `hidden_states`).
        num_classes: Number of output classes.
    """

    def __init__(self, bertweet_model, num_classes=16):
        super().__init__()
        self.bertweet_model = bertweet_model
        self.num_classes = num_classes
        self.hidden_size = bertweet_model.config.hidden_size

        self.dropout = nn.Dropout(0.1)

        self.linear1 = nn.Linear(self.hidden_size, self.hidden_size // 2)
        self.linear2 = nn.Linear(self.hidden_size // 2, num_classes)

        # Freeze the whole encoder first ...
        for param in self.bertweet_model.parameters():
            param.requires_grad = False

        # ... then re-enable training for the last two transformer layers only.
        for param in self.bertweet_model.encoder.layer[-2:].parameters():
            param.requires_grad = True

    def forward(self, input_ids, attention_mask=None):
        """Run the encoder and the classification head.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Optional mask forwarded unchanged to the encoder.

        Returns:
            Tuple `(logits, attentions, hidden_states)`. `logits` has one row
            per example and `num_classes` columns. `attentions` and
            `hidden_states` are whatever the encoder output carries under those
            names, or None when the encoder does not provide them.
        """
        outputs = self.bertweet_model(input_ids=input_ids, attention_mask=attention_mask)

        # Read fields by name rather than by position: for an encoder with a
        # pooling layer, position 1 of the output is `pooler_output`, so
        # `outputs[1]` would not be the attention weights.
        last_hidden_state = outputs.last_hidden_state
        attentions = getattr(outputs, "attentions", None)
        hidden_states = getattr(outputs, "hidden_states", None)

        # Use the first token's representation as the pooled sequence summary.
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.dropout(pooled_output)

        out = F.relu(self.linear1(pooled_output))
        logits = self.linear2(out)

        return logits, attentions, hidden_states

In [None]:
# Same checkpoint name as the one used to load the tokenizer.
model_name = "vinai/bertweet-base"

# Ask the encoder to return attention weights and per-layer hidden states in addition to its final output.
config = AutoConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)

# Load the pretrained encoder with that configuration.
bertweet_model = AutoModel.from_pretrained(model_name, config=config)

pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

In [None]:
# Batches of 32 for every split, padded by the shared collator.
# Only the training split is shuffled.
train_dataloader = DataLoader(
    dataset=tokenized_dataset["train"],
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_dataset["valid"],
    batch_size=32,
    collate_fn=data_collator,
)
test_dataloader = DataLoader(
    dataset=tokenized_dataset["test"],
    batch_size=32,
    collate_fn=data_collator,
)

Preprocessing steps:
1. Normalize the tweet (TODO: confirm whether the input is already normalized)
2. Load the tweet
3. Pad
4. Batch

In [None]:
# Smoke test: run one already-normalized tweet through the freshly built classifier.
# INPUT TWEET IS ALREADY NORMALIZED!
model = BERTweetClassifier(bertweet_model)

# Switch to eval mode so dropout is disabled and the check is deterministic;
# the training cell puts the model back into train mode before optimizing.
model.eval()

line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"

# Wrap the encoded ids in a list to form a batch containing one sequence.
input_ids = torch.tensor([tokenizer.encode(line)])

with torch.no_grad():
    features = model(input_ids)  # tuple: (logits, attentions, hidden_states)


In [None]:
from torch.optim import AdamW

# AdamW over every parameter of the classifier (encoder and head), learning rate 5e-5.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

num_epochs = 3
# One scheduler step is taken per training batch, so the total is epochs x batches per epoch.
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule with no warmup steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
import torch

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BERTweetClassifier(
  (bertweet_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Layer

In [None]:
# Mount Google Drive at the conventional mount point. The Drive contents then
# appear under /content/drive/MyDrive, so /content/drive/MyDrive/fruits is a
# real folder inside Drive rather than the mount root itself.
drive.mount('/content/drive')

Mounted at /content/drive/MyDrive/fruits


In [None]:
# Display which device was selected for training.
device

device(type='cuda')

In [None]:
# Directory where checkpoints are written; created if it does not exist yet.
checkpoint_dir = "/content/drive/MyDrive/fruits"
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
# checkpoint_dir = "checkpoints"

# checkpoint_path = os.path.join(checkpoint_dir, f"gabagool.hi")
# torch.save(model.state_dict(), checkpoint_path)
# torch.save(optimizer.state_dict(), checkpoint_path + "optim")
# print(f"Checkpoint saved at: {checkpoint_path}")

#os.makedirs(checkpoint_dir, exist_ok=True)
#checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_{1}.pt")
#torch.save({'apple': 1, 'banana':2}, checkpoint_path)

In [None]:
from tqdm.auto import tqdm

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Build the loss module once, outside the loop, instead of on every step.
loss_fn = nn.CrossEntropyLoss()

model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # The classifier returns (logits, attentions, hidden_states). Only the
        # logits are needed for the loss, so the other outputs are not kept.
        logits = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
        )[0]
        loss = loss_fn(logits, batch['label_int'])

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Save model and optimizer state at the end of every epoch.
    checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_extra_layer_{epoch}.pt")
    torch.save(model.state_dict(), checkpoint_path)
    torch.save(optimizer.state_dict(), checkpoint_path + "optim")
    print(f"Checkpoint saved at: {checkpoint_path}")

progress_bar.close()


  0%|          | 0/31632 [00:00<?, ?it/s]

Checkpoint saved at: /content/drive/MyDrive/fruits/checkpoint_extra_layer_0.pt
Checkpoint saved at: /content/drive/MyDrive/fruits/checkpoint_extra_layer_1.pt


In [None]:
# Change the notebook's working directory to the checkpoint folder.
%cd /content/drive/MyDrive/fruits

/content/drive/MyDrive/fruits


In [None]:
# Restore the best validation checkpoint
checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_2.pt")
model.load_state_dict(torch.load(checkpoint_path))
optimizer.load_state_dict(torch.load(checkpoint_path + "optim"))

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/fruits/checkpoint_2.pt'

In [None]:
import evaluate

# Accuracy on the validation split.
metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        # forward() accepts only input_ids / attention_mask, so the label column
        # must not be passed through **batch. It returns a tuple whose first
        # element is the logits, not an object with a `.logits` attribute.
        logits = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )[0]

    predictions = torch.argmax(logits, dim=-1)
    # The label column is named "label_int" in this dataset; there is no "labels" key.
    metric.add_batch(predictions=predictions, references=batch["label_int"])

metric.compute()

<a id='additional-resources'></a>