In [1]:
# !pip install transformers[torch]
# !pip install torch
# !pip install datasets --user
# !pip install evaluate

Collecting transformers[torch]
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m63.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting regex!=2019.12.17
  Downloading regex-2023.10.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m773.3/773.3 kB[0m [31m90.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14
  Downloading tokenizers-0.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m120.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.16.4
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m66.1 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.3.1
  Downlo

# The most basic HuggingFace pipeline

In [33]:
from transformers import pipeline

In [3]:
classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [7]:
classifier("This is a wonderful test run for the mighty huggingface transformers!")

[{'label': 'POSITIVE', 'score': 0.9998185038566589}]

# Classification as Model and Tokenizer

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [12]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [15]:
# Wow it even does other languages!
classifier("No quiero a usar Tensorflow")

[{'label': '1 star', 'score': 0.3957779109477997}]

# Using the Tokenizer

In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [21]:
#let's take a look at the inner workings
inputs = tokenizer("We are so incredibly happy to show you the HuggingFace transformers library!")

In [23]:
print(inputs)
print(len(inputs['input_ids']))

{'input_ids': [101, 2057, 2024, 2061, 11757, 3407, 2000, 2265, 2017, 1996, 17662, 12172, 19081, 3075, 999, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
16


In [27]:
pt_batch = tokenizer(
    ["We are so incredibly happy to show you the HuggingFace transformers library!", "Here's a shorter sentence for tokenizing"],
    padding    = True,
    truncation = True,
    max_length = 512,
    return_tensors = "pt"
)

In [28]:
for key, value in pt_batch.items():
    print(f"{key}: {value.numpy().tolist()}")

input_ids: [[101, 2057, 2024, 2061, 11757, 3407, 2000, 2265, 2017, 1996, 17662, 12172, 19081, 3075, 999, 102], [101, 2182, 1005, 1055, 1037, 7820, 6251, 2005, 19204, 6026, 102, 0, 0, 0, 0, 0]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]


# Fine Tuning Models for Classification

For this, we're going to be using reviews data from Starbucks as a test case (small file so things should run fast!)

In [2]:
import pandas as pd

df = pd.read_csv('/domino/datasets/local/llm_example/reviews_data.csv')
df = df[['Review', 'Rating']]
df_final_test = df[df.Rating.isna()]
df = df[df.Rating.isna() == False]
df.columns = ['text', 'label']

In [3]:
train_percent = 0.8
row_cutoff = int(train_percent * df.shape[0])
df = df.sample(frac=1).reset_index(drop=True)
df_train = df[:row_cutoff]
df_test = df[row_cutoff:]

In [4]:
# Create dataset objects so that we can follow along with official HuggingFace tutorial:
# https://huggingface.co/docs/transformers/training

from datasets import Dataset, DatasetDict

dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)

dataset_star = DatasetDict({
        'train':dataset_train,
        'test':dataset_test
})

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_function(df):
    return tokenizer(df['text'], padding="max_length", truncation=True)

tokenized_datasets_star = dataset_star.map(tokenize_function)
# Debugging
#tokenized_datasets = tokenized_datasets.remove_columns(["text", "label"])

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/564 [00:00<?, ? examples/s]

Map:   0%|          | 0/141 [00:00<?, ? examples/s]

In [63]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir = './results', evaluation_strategy="epoch")

In [74]:
# Create a metric to pass the trainer
import numpy as np
import evaluate 
import torch

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [66]:
text = "This is a test"
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

In [67]:
inputs

{'input_ids': tensor([[ 101, 1188, 1110,  170, 2774,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [84]:
logits = model(input_ids = torch.tensor(tokenized_datasets['train']['input_ids'][0]), attention_mask=torch.tensor(tokenized_datasets['train']['attention_mask'][0]))

In [94]:
np.argmax(logits.logits.detach().numpy(), axis=-1)

array([3])

In [87]:
logits

SequenceClassifierOutput(loss=None, logits=tensor([[-0.2353, -0.2467,  0.3233,  0.5721,  0.0475]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [76]:
torch.tensor(tokenized_datasets['train']['input_ids'][0])

tensor([[  101,   146,  1301,  1118,  2537,  7925,  8770,  1451,  2106,  1105,
          4417,  1103,  4592, 16516, 21216,  2884,  1105,  3093,  1115,  2121,
          1103,  8171,  1138,  1151,  1533,  1105, 23029,  1174,  1149,  1104,
           119,   146,  2653,   170,  1974,  1111,  1143,  1106,  1178,  1243,
           126, 19573,  1105,   123, 22888,   119,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [56]:
model(**tokenized_datasets['train'][0])

AttributeError: 'list' object has no attribute 'size'

In [37]:
model(**tokenized_datasets['train'][1]['input_ids'])

TypeError: BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (pooler): BertPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=5, bias=True)
) argument after ** must be a mapping, not list

In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics
)

In [10]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 5]))

# HuggingFace From Scratch

In [98]:
del model
del trainer
torch.cuda.empty_cache()

In [6]:
tokenized_datasets_star = tokenized_datasets_star.remove_columns(["text"])

In [7]:
tokenized_datasets_star = tokenized_datasets_star.rename_column("label", "labels")

In [8]:
tokenized_datasets_star.set_format("torch")

In [9]:
train_dataset_star = tokenized_datasets_star["train"]
eval_dataset_star = tokenized_datasets_star["test"]

In [10]:
 tokenized_datasets_star["train"][0]

{'labels': tensor(2.),
 'input_ids': tensor([  101,   146,  1108,  1120,  1240,  2508, 14791,  7130, 15057,  2450,
          1107, 13811,  2138,  1105,   115,   115,  1105,   115,   115,  3347,
          1106,  1294,   170,  6876,  1104,  1251,  3668,   119,   119,   119,
           146,  1821,  3102,  1105,  1136,  1215,  1106,  1129,  5165,  1114,
          4267,  1116,  4894, 26426,   119,   119,   119,  1220, 10072,  1112,
           146,  2045,  1283,   119,   119,   119, 12528,  2008,  1106,  1474,
          1115,   146,  1281,   112,   189,  3143,  1240,  1282,  1251,  2039,
           119,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0, 

In [11]:
from torch.utils.data import DataLoader

train_dataloader_star = DataLoader(train_dataset_star, shuffle=True, batch_size=8)
eval_dataloader_star = DataLoader(eval_dataset_star, batch_size=8)

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', num_labels=5)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Create optimizer and learning schedule
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [15]:
from transformers import get_scheduler

num_epochs=5
num_training_steps = num_epochs * len(train_dataloader_star)
lr_scheduler = get_scheduler(
    name = "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [16]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [161]:
for batch in train_dataloader_star:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch)
    break

{'labels': tensor([2., 1., 1., 5., 2., 1., 1., 1.]), 'input_ids': tensor([[[ 101,  146, 1138,  ...,    0,    0,    0]],

        [[ 101,  146, 1460,  ...,    0,    0,    0]],

        [[ 101, 2119,  117,  ...,    0,    0,    0]],

        ...,

        [[ 101,  146, 3306,  ...,    0,    0,    0]],

        [[ 101,  146, 2097,  ...,    0,    0,    0]],

        [[ 101, 1881, 3430,  ...,    0,    0,    0]]]), 'token_type_ids': tensor([[[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0]]]), 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        ...,

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]],

        [[1, 1, 1,  ..., 0, 0, 0]]])}


In [163]:
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch)
    break

{'labels': tensor([2, 1, 0, 1, 4, 3, 2, 3]), 'input_ids': tensor([[  101,  7462, 15969,  ...,     0,     0,     0],
        [  101,   146,  1274,  ...,     0,     0,     0],
        [  101,   146,  1195,  ...,     0,     0,     0],
        ...,
        [  101,  3949,  1103,  ...,     0,     0,     0],
        [  101,  1142,  3871,  ...,     0,     0,     0],
        [  101,   146,  1541,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [167]:
small_train_dataset[0]

{'labels': tensor(4),
 'input_ids': tensor([  101,   146, 27438,  1142,  4202,   119,   146,   112,  1396,  1151,
          1106,  3924,  8412,  1187,   146,  9981,  1106,  1129,   170, 13395,
          7589,  2288,  1107,  1413,   117,  6322,  8796,  5030,  7424,   117,
          1105,  1104,  1736,  1103,  9230,   112,   188,  2319,   119,  1109,
         20400,  1132,  1177,  1177,  7284, 10455,   119,  1109,  3172,  1110,
          7688,  4931,  1105,  1119,  2228,  1296,  7329,  1118,  1289,  1114,
          1126, 10965,  2971,  1104,  8188,   119,  1192, 13224,  3940,  1303,
          3713,   106,   106,   106,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,  

In [113]:
from tqdm.auto import tqdm

progress_bar= tqdm(range(num_training_steps))

model.train()

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/355 [00:00<?, ?it/s]

ValueError: too many values to unpack (expected 2)

# Working Example

In [130]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
dataset["train"][100]

Downloading builder script:   0%|          | 0.00/4.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.55k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/196M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/650000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/50000 [00:00<?, ? examples/s]

{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [131]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [133]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

In [134]:
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [142]:
tokenized_datasets.set_format("torch")

In [143]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [144]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [145]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [146]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [147]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [148]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [149]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/375 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [150]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()


KeyboardInterrupt



In [162]:
for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    print(batch)
    break

{'labels': tensor([4, 3, 2, 3, 0, 1, 4, 0]), 'input_ids': tensor([[  101,  2825,   146,  ...,     0,     0,     0],
        [  101,  1635, 19453,  ...,     0,     0,     0],
        [  101,  1109,  2094,  ...,     0,     0,     0],
        ...,
        [  101,  1109, 16533,  ...,     0,     0,     0],
        [  101,  6466,  2213,  ...,     0,     0,     0],
        [  101,  3278, 23812,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [44]:
# TESTING
from datasets import load_dataset
dataset_yelp = load_dataset("yelp_review_full")

In [38]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [40]:
def tokenize_function_example(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function_example, batched=True)

Map:   0%|          | 0/650000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [41]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [42]:
small_train_dataset[0]

{'label': 4,
 'text': "I stalk this truck.  I've been to industrial parks where I pretend to be a tech worker standing in line, strip mall parking lots, and of course the farmer's market.  The bowls are so so absolutely divine.  The owner is super friendly and he makes each bowl by hand with an incredible amount of pride.  You gotta eat here guys!!!",
 'input_ids': [101,
  146,
  27438,
  1142,
  4202,
  119,
  146,
  112,
  1396,
  1151,
  1106,
  3924,
  8412,
  1187,
  146,
  9981,
  1106,
  1129,
  170,
  13395,
  7589,
  2288,
  1107,
  1413,
  117,
  6322,
  8796,
  5030,
  7424,
  117,
  1105,
  1104,
  1736,
  1103,
  9230,
  112,
  188,
  2319,
  119,
  1109,
  20400,
  1132,
  1177,
  1177,
  7284,
  10455,
  119,
  1109,
  3172,
  1110,
  7688,
  4931,
  1105,
  1119,
  2228,
  1296,
  7329,
  1118,
  1289,
  1114,
  1126,
  10965,
  2971,
  1104,
  8188,
  119,
  1192,
  13224,
  3940,
  1303,
  3713,
  106,
  106,
  106,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [9]:
# import torch
# from torch.utils.data import Dataset

# class EncodingDataset(Dataset):
#     def __init__(self, encodings, labels=None):
#         self.encodings = encodings
#         self.labels = labels

#     def __len__(self):
#         return len(self.encodings['input_ids'])

#     def __getitem__(self, idx):
#         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
#         if self.labels:
#             item['labels'] = torch.tensor(self.labels[idx])
#         return item

In [None]:
from torch.utils.data import DataLoader

train_dataloader = 

In [10]:
train_dataset = EncodingDataset(train_encodings)
test_dataset = EncodingDataset(test_encodings)

In [14]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./results",
#     num_train_epochs=5,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=16,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir="./logs",
#     logging_steps=10
# )
training_args = TrainingArguments(output_dir="./results")

In [12]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)
    
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism ha

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.

In [13]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 