In [None]:
!pip install -q transformers datasets torch > /dev/null

In [None]:
from transformers import pipeline

# Sentiment-analysis pipeline; no checkpoint is named here, so the library
# picks the model for this task.
classifier = pipeline(task="sentiment-analysis")

In [None]:
# Image-to-text pipeline backed by an explicitly named checkpoint.
img2text = pipeline(
    task="image-to-text",
    model="google/pix2struct-textcaps-base",
)

In [None]:
# Run the image-to-text pipeline on a local image file; the result is a list
# of dicts with a "generated_text" entry.
image_path = "/content/evaluate_huggingface.png"
img2text(image_path)



[{'generated_text': 'A book titled Mastering Huggingface Model Evaluation.'}]

In [None]:
# Score one sentence; each result carries a predicted label and its score.
sample_sentence = "There is a lot to learn from the new models."
classifier(sample_sentence)

[{'label': 'POSITIVE', 'score': 0.9977133274078369}]

In [None]:
# Checkpoint identifier reused below to load both the model and its tokenizer.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and the sequence-classification model from the same
# checkpoint so that the vocabulary and the weights belong together.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [None]:
# Tokenize a single sentence. No return_tensors argument is given, so the
# result holds plain Python lists.
sentence = "We are very happy to show you the 🤗 Transformers library."
encoding = tokenizer(sentence)

In [None]:
# Show the tokenizer output: input ids, token type ids and attention mask.
encoding

{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Encode two sentences as one batch: pad them to a common length, truncate
# anything longer than 512 tokens, and return PyTorch tensors.
sentences = [
    "We are very happy to show you the 🤗 Transformers library.",
    "We hope you don't hate it.",
]
pt_batch = tokenizer(
    sentences,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True,
)

In [None]:
# Check which class the tokenizer returned for the batch.
type(pt_batch)

transformers.tokenization_utils_base.BatchEncoding

In [None]:
# Inspect the batch: the shorter sentence is padded with 0 ids, and the
# attention mask marks those padded positions with 0.
pt_batch

{'input_ids': tensor([[  101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103,   100,
         58263, 13299,   119,   102],
        [  101, 11312, 18763, 10855, 11530,   112,   162, 39487, 10197,   119,
           102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [None]:
# Forward pass: unpack the batch so its entries reach the model as keyword
# arguments. The displayed output contains the raw logits.
model(**pt_batch)

SequenceClassifierOutput(loss=None, logits=tensor([[-2.6222, -2.7745, -0.8967,  2.0137,  3.3064],
        [ 0.0064, -0.1258, -0.0503, -0.1655,  0.1329]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
from torch import nn

# Turn the raw logits into per-class probabilities along the last axis.
logits = model(**pt_batch).logits
pt_pred = nn.functional.softmax(logits, dim=-1)

In [None]:
# One row per input sentence; softmax over the last axis makes each row sum to 1.
pt_pred

tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)

In [None]:
# Write the tokenizer files to a local folder; the call returns the paths of
# the files it saved.
# NOTE(review): only the tokenizer is saved here, not the model — confirm that
# this is intended.
pt_save_directory = "./pt_save_pretrained"
tokenizer.save_pretrained(save_directory=pt_save_directory)

('./pt_save_pretrained/tokenizer_config.json',
 './pt_save_pretrained/special_tokens_map.json',
 './pt_save_pretrained/vocab.txt',
 './pt_save_pretrained/added_tokens.json',
 './pt_save_pretrained/tokenizer.json')

In [None]:
from transformers import AutoConfig

# Start from the "distilbert-base-uncased" configuration and pass n_heads=12
# as an explicit override.
my_config = AutoConfig.from_pretrained(
    "distilbert-base-uncased",
    n_heads=12,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# Display the resulting configuration, including the n_heads value set above.
my_config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.28.1",
  "vocab_size": 30522
}

In [None]:
from transformers import AutoModel

# Build a model from the configuration object rather than from a checkpoint name.
my_model = AutoModel.from_config(config=my_config)

In [None]:
from transformers import TrainingArguments

# Hyperparameters for the training run; output_dir is the folder the run
# writes to.
training_args = TrainingArguments(
    output_dir="path/to/save/folder/",
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
)

In [None]:
from datasets import load_dataset

# Load the "rotten_tomatoes" dataset; the result is indexed by split name
# further down (e.g. "train", "test").
dataset = load_dataset(path="rotten_tomatoes")

In [None]:
def tokenize_dataset(dataset):
    """Tokenize the ``"text"`` column of a batch of examples.

    Meant to be passed to ``map(..., batched=True)``, so ``dataset`` is a
    batch whose ``"text"`` entry holds the texts to encode.

    Args:
        dataset: Batch of examples; only ``dataset["text"]`` is read.

    Returns:
        The tokenizer output for the texts in the batch.
    """
    # Truncate over-long texts so no example exceeds the 512-token limit this
    # notebook already uses when it builds `pt_batch` with the same tokenizer.
    return tokenizer(dataset["text"], truncation=True, max_length=512)

In [None]:
# Apply the tokenization function to the dataset in batches and keep the result.
dataset = dataset.map(function=tokenize_dataset, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [None]:
# Look at the first training example after the tokenization step.
dataset['train'][0]

In [None]:
from transformers import DataCollatorWithPadding

# Collator that uses the tokenizer to pad examples when they are batched.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import Trainer

# Bundle the model, hyperparameters, data splits, tokenizer and collator into
# a Trainer.
# NOTE(review): `model` is the checkpoint loaded from `model_name`, whose head
# produced 5 logits per sentence earlier in this notebook, while `my_model`
# built from `my_config` is not used here. Presumably rotten_tomatoes has
# binary labels — confirm this model/label pairing is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)

In [None]:
# Start training with the Trainer configured above.
trainer.train()