# Imports

In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig
from transformers import Trainer, TrainingArguments, Seq2SeqTrainer, Seq2SeqTrainingArguments
from torch import nn
from datasets import load_dataset
from transformers import DataCollatorWithPadding
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Pipeline

In [2]:
classifier = pipeline("sentiment-analysis")
classifier("We are very happy to show you the ü§ó Transformers library.")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9997795224189758}]

In [3]:
vision_classifier = pipeline(model="google/vit-base-patch16-224")
preds = vision_classifier(
    images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
)
preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
print(preds)

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}]


In [0]:
# This model is a `zero-shot-classification` model.
# It will classify text, except you are free to choose any label you might imagine
classifier = pipeline(model="facebook/bart-large-mnli", batch_size=8)
res_classifier = classifier(
    ["I have a problem with my iphone that needs to be resolved asap!!",
    "I have a problem with my iphone that needs to be!!",
    "I have a problem with my iphone tha!!",
    "I have a problem with!!",
    "I have a!!",
    "I!!"],
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
)
print(res_classifier)

In [0]:
segmenter = pipeline(model="Falconsai/text_summarization", torch_dtype=torch.float16)
result = segmenter("M√©trica m√°s importante: ‚Äúpearson cosine‚Äù")


# AutoClass

## Cargar un tokenizer pre-entrenado

In [0]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoding = tokenizer("We are very happy to show you the ü§ó Transformers library.")
pt_batch = tokenizer(
    ["We are very happy to show you the ü§ó Transformers library.", "We hope you don't hate it."],
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

## Cargar un modelo pre-entrenado

In [0]:
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
pt_outputs = pt_model(**pt_batch)
print(pt_outputs)

In [0]:
pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
print(pt_predictions)

## Guardar y cargar

In [0]:
#pt_save_directory = "Volumes/uado01in/raw/uado01in/pt_save_pretrained"
#tokenizer.save_pretrained(pt_save_directory)
#pt_model.save_pretrained(pt_save_directory)

In [0]:
#pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")

# AutoConfig

Para cargar un modelo pre-entrenado pero cambiar su arquitectura.

In [0]:
my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=6)

# Dataset

In [0]:
dataset = load_dataset("rotten_tomatoes", split='test', cache_dir ="")
print(dataset["label"][0])  
def tokenize_dataset(dataset):
    #print(dataset)
    return tokenizer(dataset["text"])
dataset = dataset.map(tokenize_dataset, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Entrenamiento de un modelo

In [0]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [0]:
training_args = TrainingArguments(
    output_dir="./folder",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    report_to="none"
)

In [0]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["text"],
    eval_dataset=dataset["text"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [0]:
trainer.train()