# HUGGINGFACE + TENSORFLOW

___
### PREPARATION
___

In [None]:
from datasets import load_dataset

# Load the "imdb" dataset through `datasets.load_dataset`.
imdb = load_dataset('imdb')
# Bare expression: as the last line of the cell it makes the notebook display the object.
imdb


In [None]:
# Show the first record of the "test" split as the cell output.
imdb["test"][0]


In [None]:
# Tokenizer
from transformers import AutoTokenizer

# Load the tokenizer published with the "distilbert/distilbert-base-uncased"
# checkpoint (the same checkpoint name is used when the model is loaded).
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


In [None]:
# Preprocessing function: tokenizes the texts and truncates them so that they
# do not exceed the model's maximum input length.
def preprocess_function(examples):
    """Tokenize the "text" entry of `examples` with truncation enabled.

    Args:
        examples: Object indexable by "text"; its "text" value is passed
            straight to the module-level `tokenizer`.

    Returns:
        Whatever `tokenizer(..., truncation=True)` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded


In [None]:
# Apply the preprocessing function to the dataset. batched=True hands
# `preprocess_function` several examples per call instead of one at a time,
# which speeds up the mapping.
tokenized_imdb = imdb.map(preprocess_function, batched=True)


In [None]:
# Optional: build 1000-example subsets of the train and test splits to shorten
# fine-tuning time. Both are shuffled with the same fixed seed (42) before the
# first 1000 rows are selected.
small_train_dataset = (
    tokenized_imdb["train"]
    .shuffle(seed=42)
    .select(range(1000))
)
small_eval_dataset = (
    tokenized_imdb["test"]
    .shuffle(seed=42)
    .select(range(1000))
)


In [None]:
# Now create a batch of examples using DataCollatorWithPadding. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.
from transformers import DataCollatorWithPadding

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# TS
# return_tensors="tf" asks the collator for TensorFlow tensors, which is what
# the Keras training section of this notebook consumes.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


___
### HYPERPARAMETRES
___

In [None]:
# Lookup tables between class ids and label names. `label2id` is the exact
# inverse of `id2label`, so it is derived from it rather than written twice.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {label: class_id for class_id, label in id2label.items()}


In [None]:
# entraînement avec DistilBERT
# from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# model = AutoModelForSequenceClassification.from_pretrained(
#     "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
# )


In [None]:
# from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="my_awesome_model",
#     learning_rate=2e-5,
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     num_train_epochs=2,
#     weight_decay=0.01,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     push_to_hub=False,
#     no_cuda=True,
# )


___
### EVALUATION
___

In [None]:
# Evaluation helper: scores predictions with the "accuracy" metric.
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute the accuracy metric for a `(predictions, labels)` pair.

    Args:
        eval_pred: Two-element iterable `(predictions, labels)`. The
            predictions are reduced with `np.argmax(..., axis=1)` to get one
            predicted class index per row before scoring.

    Returns:
        The value returned by `accuracy.compute(...)` for those predictions
        and reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)


In [None]:
# métrics
# import numpy as np
# import evaluate

# metric = evaluate.load("accuracy")


In [None]:
# monitoring
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")


___
### ENTRAINEMENT
___

In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=small_eval_dataset,
#     compute_metrics=compute_metrics,
#     data_collator=data_collator,
# )


In [None]:
# Assurez-vous que le GPU est désactivé dans torch également
# import torch
# torch.cuda.is_available = lambda: False


In [None]:
# trainer.train()


___
___
# Tensorflow
___
___

In [None]:
# TS
from transformers import create_optimizer
import tensorflow as tf

# Training hyperparameters used to size the learning-rate schedule.
batch_size = 16
num_epochs = 5

# Total optimizer steps = full batches per epoch (floor division) x epochs.
# Both factors are ints, so the product needs no conversion.
batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
total_train_steps = batches_per_epoch * num_epochs

# create_optimizer returns the optimizer together with its schedule.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)


In [None]:
from transformers import TFAutoModelForSequenceClassification

# Load the DistilBERT checkpoint as a TensorFlow sequence-classification model
# with two labels, passing along the id <-> label lookup tables.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)


In [None]:
# Convert the tokenized splits into tf.data datasets for Keras.
# The batch size comes from the `batch_size` variable that was also used to
# compute `total_train_steps`, so the learning-rate schedule and the actual
# number of batches per epoch cannot drift apart.
tf_train_set = model.prepare_tf_dataset(
    tokenized_imdb["train"],
    shuffle=True,  # reshuffle the training data
    batch_size=batch_size,
    collate_fn=data_collator,  # pads each batch dynamically
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_imdb["test"],
    shuffle=False,  # keep evaluation order fixed
    batch_size=batch_size,
    collate_fn=data_collator,
)


In [None]:
import tensorflow as tf

# Only the optimizer is supplied. Per the Transformers documentation, TF models
# pick their own task-appropriate loss when compile() receives none.
model.compile(optimizer=optimizer)  # No loss argument!


In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Keras callback that evaluates `compute_metrics` on `tf_validation_set`
# during training.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)


In [None]:
from huggingface_hub import login
import os

# SECURITY: never commit an access token to the notebook. The token that used
# to be hard-coded here must be considered leaked and should be revoked.
# The token is now read from the HF_TOKEN environment variable; when it is not
# set, token=None makes login() fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))
print()
print('<> login huggingface <>')
# Print the account the CLI is currently authenticated as.
os.system('huggingface-cli whoami')


In [None]:
from transformers.keras_callbacks import PushToHubCallback

# Keras callback that pushes the model to the Hugging Face Hub during
# training, using "my_awesome_model" as its output directory. The tokenizer is
# passed so it is handled alongside the model.
push_to_hub_callback = PushToHubCallback(
    output_dir="my_awesome_model",
    tokenizer=tokenizer,
)


In [None]:
# Callbacks handed to model.fit: metric evaluation first, then the Hub push.
callbacks = [metric_callback, push_to_hub_callback]


In [None]:
# Train for `num_epochs`, the same epoch count the learning-rate schedule was
# built from (total_train_steps = batches_per_epoch * num_epochs). A different
# literal here would end training before the decay schedule completes.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)
