In [None]:
!pip install transformers datasets -q

[K     |████████████████████████████████| 3.8 MB 5.1 MB/s 
[K     |████████████████████████████████| 325 kB 16.3 MB/s 
[K     |████████████████████████████████| 67 kB 3.1 MB/s 
[K     |████████████████████████████████| 895 kB 42.3 MB/s 
[K     |████████████████████████████████| 596 kB 18.5 MB/s 
[K     |████████████████████████████████| 6.5 MB 38.5 MB/s 
[K     |████████████████████████████████| 134 kB 47.1 MB/s 
[K     |████████████████████████████████| 212 kB 58.7 MB/s 
[K     |████████████████████████████████| 1.1 MB 39.3 MB/s 
[K     |████████████████████████████████| 127 kB 67.4 MB/s 
[K     |████████████████████████████████| 144 kB 47.4 MB/s 
[K     |████████████████████████████████| 271 kB 50.7 MB/s 
[K     |████████████████████████████████| 94 kB 3.4 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires foli

## This uses roberta-base as an example, but any model can be used. When changing models, replace all instances of Roberta with the new model name (Electra, Bert, Albert, etc.). If you run into errors, refer to the source code for that specific model (XXXForSequenceClassification) and compare it.

For example, if you use Deberta, check how the `__init__` method ([link](https://github.com/huggingface/transformers/blob/93d3fd86459eb27bc584da29a3d542817a395bca/src/transformers/models/deberta/modeling_deberta.py#L1131)) and `forward` method ([link](https://github.com/huggingface/transformers/blob/93d3fd86459eb27bc584da29a3d542817a395bca/src/transformers/models/deberta/modeling_deberta.py#L1162)) are different. There might be 1 or 2 parameters that need to be changed to have it work properly. 


In [None]:
import torch
from torch import nn
from transformers import AutoConfig, RobertaModel, RobertaForSequenceClassification
from transformers.modeling_outputs import SequenceClassifierOutput
from typing import Optional, Union, Tuple

class ClassificationHead(nn.Module):
    """Head for sentence-level classification tasks.

    Applies dropout -> dense -> tanh -> dropout -> output projection to a
    feature vector of size ``config.hidden_size + num_extra_dims`` and returns
    ``config.num_labels`` scores per example.

    Args:
        config: Model configuration. Reads ``hidden_size``, ``num_labels``,
            ``hidden_dropout_prob`` and, if present, ``classifier_dropout``.
        num_extra_dims: Number of extra feature dimensions that are
            concatenated to the text embedding before it reaches this head.
    """

    def __init__(self, config, num_extra_dims):
        super().__init__()
        total_dims = config.hidden_size + num_extra_dims
        self.dense = nn.Linear(total_dims, total_dims)
        # Configs of other model families may not define `classifier_dropout`
        # at all, so look it up defensively and fall back to the hidden dropout.
        classifier_dropout = getattr(config, "classifier_dropout", None)
        if classifier_dropout is None:
            classifier_dropout = config.hidden_dropout_prob
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(total_dims, config.num_labels)

    def forward(self, features, **kwargs):
        """Return scores of shape ``(..., config.num_labels)`` for ``features``.

        ``features`` must have ``config.hidden_size + num_extra_dims`` entries
        in its last dimension. Extra keyword arguments are accepted and ignored.
        """
        x = self.dropout(features)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class CustomSequenceClassification(RobertaForSequenceClassification):
    """Sequence classifier that combines the text embedding with extra features.

    The hidden state of the first token is concatenated with ``extra_data``
    and the result is passed through a ``ClassificationHead``.

    Args:
        config: Model configuration (reads ``num_labels``, ``problem_type``
            and ``use_return_dict``, plus whatever the encoder and head need).
        num_extra_dims: Number of extra feature values supplied per example.
    """

    def __init__(self, config, num_extra_dims):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        # might need to rename this depending on the model
        # NOTE(review): the parent constructor presumably already creates an
        # encoder and a head; both are replaced here -- confirm for other models.
        self.roberta = RobertaModel(config)
        self.classifier = ClassificationHead(config, num_extra_dims)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        extra_data: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, SequenceClassifierOutput]:
        r"""
        extra_data (`torch.Tensor` of shape `(batch_size, num_extra_dims)`):
            Extra per-example features that are concatenated to the embedding of the first token. Required, even
            though the parameter has a `None` default; it is cast to the dtype and device of the text embedding.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns a `SequenceClassifierOutput` (or a plain tuple when `return_dict` is false) holding the optional
        loss, the logits and the encoder's hidden states / attentions.

        Raises `ValueError` if `extra_data` is not provided.
        """
        if extra_data is None:
            # Fail early with a readable message instead of an opaque error from torch.cat below.
            raise ValueError(
                "`extra_data` is required: pass a tensor of shape (batch_size, num_extra_dims)."
            )

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # sequence_output will be (batch_size, seq_length, hidden_size)
        sequence_output = outputs[0]

        # Embedding of the first token of every sequence: (batch_size, hidden_size)
        cls_embedding = sequence_output[:, 0, :]

        # additional data should be (batch_size, num_extra_dims); integer or
        # differently-typed features are cast so they can be concatenated with
        # the embedding and fed to the linear layers of the head.
        extra_data = extra_data.to(device=cls_embedding.device, dtype=cls_embedding.dtype)

        output = torch.cat((cls_embedding, extra_data), dim=-1)

        logits = self.classifier(output)

        loss = None
        if labels is not None:
            # Infer the problem type once from the number of labels and the label dtype.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Tuple output: (loss?, logits, *remaining encoder outputs)
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

### This example does 3-class classification and adds 5 extra features to each example. Please adapt it to your own use case.



For simple text classification, the last hidden state for the CLS token is run through a classifier, producing a score for each label.  A simple way to combine text and numerical/categorical data is to concatenate the CLS embedding with the extra data. If the CLS embedding is [1.0, 2.0, 3.0] and the extra data is 5.0, then the concatenated version is [1.0, 2.0, 3.0, 5.0]. Likewise for categorical data, the variable can be turned into a one-hot encoding and concatenated. The concatenated version is then put through a classifier to produce scores for each label. 

In [None]:
# Load the pretrained roberta-base weights into the custom model: 3 output labels, 5 extra features per example.
# `num_extra_dims` is the extra constructor argument of `CustomSequenceClassification`.
new_model = CustomSequenceClassification.from_pretrained("roberta-base", num_labels=3, num_extra_dims=5)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing CustomSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing CustomSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CustomSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CustomSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You

## This is just dummy data, but you should make a dataset where each example has text, the extra features, and a label

After the text gets tokenized and run through the model, the output embeddings will be concatenated with the extra data. That concatenation will then go through the classifier and produce scores for each label.


NOTE: Your `extra_data` should be one-hot encoded (categorical features) or scaled to mean 0 and variance 1 (numerical features) if possible.

In [None]:
from datasets import Dataset
import numpy as np

# Seeded generator so the dummy data (and therefore the training run) is reproducible.
rng = np.random.default_rng(42)

# 100 identical sentences, 5 integer features in [0, 10) and a label in {0, 1, 2} per example.
ds = Dataset.from_dict(
    {
        "text": ["This is a sentence"] * 100,
        "extra_data": rng.integers(0, 10, size=(100, 5)),
        "labels": rng.integers(0, 3, size=(100,)),
    }
)
print(ds)
ds[0]

Dataset({
    features: ['text', 'extra_data', 'labels'],
    num_rows: 100
})


{'extra_data': [7, 9, 5, 9, 0], 'labels': 1, 'text': 'This is a sentence'}

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# `truncation=True` keeps long texts within the model's maximum input length;
# `batched=True` tokenizes many examples per call instead of one at a time.
# Padding is left to the data collator so each batch is padded only as far as needed.
tokenized_ds = ds.map(lambda batch: tokenizer(batch["text"], truncation=True), batched=True)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/100 [00:00<?, ?ex/s]

In [None]:
# Here is what one example looks like: the original columns plus the
# `input_ids` and `attention_mask` added by the tokenizer.
tokenized_ds[0]

{'attention_mask': [1, 1, 1, 1, 1, 1],
 'extra_data': [7, 9, 5, 9, 0],
 'input_ids': [0, 713, 16, 10, 3645, 2],
 'labels': 1,
 'text': 'This is a sentence'}

## If you set it up this way, you can train it using the Hugging Face Trainer, and the process is nearly identical to the example script here: https://github.com/huggingface/transformers/blob/master/examples/pytorch/text-classification/run_glue.py

In [None]:
from transformers import TrainingArguments, Trainer


# Only the output directory is set; every other training option keeps its default.
args = TrainingArguments(output_dir="./")

# `extra_data` and `labels` are columns of the dataset and parameters of the
# model's `forward`, so they are fed to the model alongside the tokenized text.
trainer = Trainer(
    model=new_model,
    args=args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
)

In [None]:
# Run the training loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `CustomSequenceClassification.forward` and have been ignored: text. If text are not expected by `CustomSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=39, training_loss=1.1151008605957031, metrics={'train_runtime': 78.1763, 'train_samples_per_second': 3.837, 'train_steps_per_second': 0.499, 'total_flos': 931469932800.0, 'train_loss': 1.1151008605957031, 'epoch': 3.0})

# No trainer method

In [None]:
from torch.utils.data import DataLoader
from transformers import AdamW, get_scheduler, DataCollatorWithPadding

# Hyperparameters for the manual training loop.
BS = 4
LR = 3e-5
EPOCHS = 3
WEIGHT_DECAY = 0.01

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in new_model.named_parameters() if not any(nd in n for nd in no_decay)],
        "weight_decay": WEIGHT_DECAY,
    },
    {
        "params": [p for n, p in new_model.named_parameters() if any(nd in n for nd in no_decay)],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR)

# Pads every batch to the length of its longest sequence, rounded up to a multiple of 8.
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# The raw text column cannot be collated into tensors, so drop it.
train_ds = tokenized_ds.remove_columns(["text"])
train_dataloader = DataLoader(
    train_ds, shuffle=True, collate_fn=data_collator, batch_size=BS
)

# Size the schedule from the dataloader: it also counts the final partial batch,
# which `len(dataset) // BS` would miss when the dataset size is not a multiple of BS.
# (This assumes one optimizer step per batch; divide by the accumulation factor
# if gradients are accumulated over several batches.)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * EPOCHS,
)



In [None]:
# Number of batches whose gradients are summed before each optimizer step.
GRAD_ACCUMULATION_STEPS = 1

for epoch in range(EPOCHS):
    new_model.train()
    for step, batch in enumerate(train_dataloader):
        # The model may live on an accelerator (e.g. after a previous Trainer run),
        # while the dataloader yields CPU tensors: move the batch to the model's device.
        batch = {name: tensor.to(new_model.device) for name, tensor in batch.items()}
        outputs = new_model(**batch)
        # Scale the loss so the accumulated gradient is an average over the micro-batches.
        loss = outputs.loss / GRAD_ACCUMULATION_STEPS
        loss.backward()
        # Step after every GRAD_ACCUMULATION_STEPS-th batch (counting from 1, so the
        # very first batch does not trigger a premature step) and on the last batch.
        is_last_batch = step == len(train_dataloader) - 1
        if (step + 1) % GRAD_ACCUMULATION_STEPS == 0 or is_last_batch:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()