# HUGGINGFACE + TENSORFLOW

___
### PREPARATION
___

In [1]:
import pandas as pd

df = pd.read_csv('../dataset.csv')
df.drop("Unnamed: 0", axis=1, inplace=True)
df.head(1)


Unnamed: 0,url,title,description,cat1,cat2,cat3
0,https://www.tourisme-cambresis.fr/1-les-templi...,"Aventure-jeu : ""Les Templiers du coffre d'or""",Le jeu aventure « Les templiers du coffre d’or...,Jeu,Famille,Détente


In [2]:
def extract_domain(url):
    url = url[url.find('//')+2:]
    url = url[:url.find('/')]
    return url

df['domain'] = df['url'].apply(extract_domain)

print("nb ligne df", len(df))

df = df.dropna(subset=['description'])

print("nb ligne df1", len(df))

df1 = df.copy()
df2 = df.dropna(subset=['cat2'])
df3 = df.dropna(subset=['cat3'])

print("nb ligne df2", len(df2))
print("nb ligne df3", len(df3))

df[['domain', 'url']].head(1)


nb ligne df 391
nb ligne df1 390
nb ligne df2 317
nb ligne df3 134


Unnamed: 0,domain,url
0,www.tourisme-cambresis.fr,https://www.tourisme-cambresis.fr/1-les-templi...


In [3]:
# Concaténer les colonnes 'title' et 'description' pour former les textes
df1['text'] = df1['domain'] + " | " + df1['title'] + " " + df1['description']
df1['label'] = df1['cat1']

df_ml = df1[['text', 'label']]
df_ml.head(1)


Unnamed: 0,text,label
0,"www.tourisme-cambresis.fr | Aventure-jeu : ""Le...",Jeu


In [4]:
from datasets import Dataset, DatasetDict

df_train = df_ml.iloc[:190]
df_test = df_ml.iloc[190:380]
df_unsupervised = df_ml.iloc[380:]

# Conversion des DataFrames en Datasets
dataset_train = Dataset.from_pandas(df_train)
dataset_test = Dataset.from_pandas(df_test)
dataset_unsupervised = Dataset.from_pandas(df_unsupervised)

# Créer un DatasetDict
dataset_dict = DatasetDict({
    'train': dataset_train,
    'test': dataset_test,
    'unsupervised': dataset_unsupervised
})


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from datasets import load_dataset

# imdb = load_dataset('imdb')
imdb = dataset_dict
imdb


DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 190
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 190
    })
    unsupervised: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 10
    })
})

In [6]:
imdb["test"][0]


{'text': 'www.tourismevalenciennes.fr | Exposition "Quand la nature inspire l\'innovation" - Famars Va prendre tes leçons dans la nature, c’est là qu’est notre futur.\nLéonard de Vinci\n\n\nCe temps fort exceptionnel, préfiguration à la grande exposition Biomimétisme prévue en septembre 2024, réunira durant 3 semaines une exposition, des ateliers, une conférence…\n\nLe biomimétisme, littéralement, l’imitation du vivant, consiste à s’inspirer de la nature pour concevoir de nouvelles technologies, innover. C’est, par exemple, en s’inspirant des oiseaux et de leurs ailes que sont nés les premiers ancêtres des avions. Que de chemin parcouru depuis, dans tous les domaines !\n\nL’EXPOSITION*, du 2 au 20 octobre, accessible dès 10 ans, sera certainement une révélation pour beaucoup de visiteurs !\nÀ travers quelques exemples concrets, on découvre comment le vivant a été, est et sera une source d’inspiration exceptionnelle pour l’innovation. L’Aviation, le ferroviaire, le sport, l’automobile s

In [7]:
# générateur de tokens
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


In [8]:
# fonction de prétraitement des tokens pour les tronqués pour par qu'ils dépassent la longueur max d'entrée du modèle
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)


In [9]:
# application de la fonction avec un accélérateur de mapping
tokenized_imdb = imdb.map(preprocess_function, batched=True)


Map: 100%|██████████| 190/190 [00:00<00:00, 3732.95 examples/s]
Map: 100%|██████████| 190/190 [00:00<00:00, 4156.20 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 1426.20 examples/s]


In [10]:
# If you like, you can create a smaller subset of the full dataset to fine-tune on to reduce the time it takes
small_train_dataset = tokenized_imdb["train"].shuffle(seed=42).select(range(190))
small_eval_dataset = tokenized_imdb["test"].shuffle(seed=42).select(range(190))


In [11]:
# Now create a batch of examples using DataCollatorWithPadding. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.
from transformers import DataCollatorWithPadding

# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# TS
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")


2024-06-06 15:25:23.877606: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


___
### HYPERPARAMETRES
___

In [12]:
# création de IDs pour les labels
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}


In [13]:
# entraienement avec DistilBERT
# from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# model = AutoModelForSequenceClassification.from_pretrained(
#     "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
# )


In [14]:
# from transformers import TrainingArguments

# training_args = TrainingArguments(
#     output_dir="my_awesome_model",
#     learning_rate=2e-5,
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     num_train_epochs=2,
#     weight_decay=0.01,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     push_to_hub=False,
#     no_cuda=True,
# )


___
### EVALUATION
___

In [15]:
#  avec fonction évaluer les prédictions
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)


In [16]:
# métrics
# import numpy as np
# import evaluate

# metric = evaluate.load("accuracy")


In [17]:
# monitoring
# from transformers import TrainingArguments, Trainer

# training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch")


___
### ENTRAINEMENT
___

In [18]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=small_train_dataset,
#     eval_dataset=small_eval_dataset,
#     compute_metrics=compute_metrics,
#     data_collator=data_collator,
# )


In [19]:
# Assurez-vous que le GPU est désactivé dans torch également
# import torch
# torch.cuda.is_available = lambda: False


In [20]:
# trainer.train()


___
___
# Tensorflow
___
___

In [21]:
# TS
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)


2024-06-06 15:25:28.437311: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-06-06 15:25:28.437966: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [22]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [23]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_imdb["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_imdb["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)


RuntimeError: Unrecognized array dtype object. 
Nested types and image/audio types are not supported yet.

In [None]:
import tensorflow as tf

model.compile(optimizer=optimizer)  # No loss argument!


In [None]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)


In [None]:
from huggingface_hub import login
import os

login('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')
print()
print('<> login huggingface <>')
os.system('huggingface-cli whoami')


In [None]:
from transformers.keras_callbacks import PushToHubCallback

push_to_hub_callback = PushToHubCallback(
    output_dir="my_awesome_model",
    tokenizer=tokenizer,
)


In [None]:
callbacks = [metric_callback, push_to_hub_callback]


In [None]:
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)
