In [1]:
import os
from tqdm import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from keras.layers import Dense, Input, Dropout, Lambda
from keras.optimizers import Adam
from keras.optimizers import AdamW
from keras.optimizers import Nadam
from keras.optimizers import RMSprop
from keras.models import Model
from keras.callbacks import ModelCheckpoint

import transformers




In [2]:
# Project locations, all expressed as relative paths rooted at dir_path.
dir_path = "../"
data_path = os.path.join(dir_path, "data")        # -> "../data"
output_path = os.path.join(dir_path, "outputs")   # -> "../outputs"

In [3]:
from transformers import TFDistilBertModel, DistilBertTokenizer
import tensorflow as tf
from keras.layers import Dense, Input, Dropout, Lambda
from keras.losses import BinaryCrossentropy
from keras import regularizers
# Run-wide constants. EPOCHS and BATCH_SIZE are not referenced in the visible
# cells; MAX_LEN is the padded/truncated token length used when tokenizing.
EPOCHS, BATCH_SIZE = 15, 32  # optional scaling (disabled): BATCH_SIZE * strategy.num_replicas_in_sync
MAX_LEN = 192

def build_bert_model(max_len=192, optimizer=None):
  """
  Build and compile the DistilBERT-based binary classifier.

  The network is a Sequential stack made of:
    1. an int32 input of ``max_len`` token ids, named "input_word_ids";
    2. the first layer of the pretrained
       'distilbert-base-multilingual-cased' TFDistilBertModel;
    3. a Lambda keeping index 0 of the sequence axis (the [CLS] token);
    4. a single sigmoid output unit.

  Args:
    max_len: Number of token ids per input sequence.
    optimizer: Keras optimizer used to compile the model. When None (the
      default), a fresh ``Adam()`` is created for this call.

  Returns:
    The compiled Keras model (binary cross-entropy loss, 'accuracy' metric).
  """
  # A default of ``Adam()`` written in the signature is evaluated only once,
  # when the function is defined, so every model built without an explicit
  # optimizer would share the same stateful optimizer instance. Creating it
  # here gives each model its own optimizer.
  if optimizer is None:
    optimizer = Adam()

  # Load the pretrained multilingual DistilBERT model.
  distilbert_model = TFDistilBertModel.from_pretrained('distilbert-base-multilingual-cased')

  # NOTE: layer order and the input name are kept exactly as before so that
  # weight files saved from this architecture can still be loaded into it.
  model = tf.keras.Sequential([
    # Input layer: one row of token ids per example.
    Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids"),

    # DistilBERT transformer (layers[0] of the pretrained wrapper model).
    distilbert_model.layers[0],

    # Keep only the first position of the sequence, i.e. the [CLS] token.
    Lambda(lambda seq: seq[:, 0, :]),

    # Output layer: a single probability.
    Dense(1, activation='sigmoid')
  ])

  # Compile with a loss suited to binary classification.
  model.compile(optimizer=optimizer, loss=BinaryCrossentropy(), metrics=['accuracy'])

  return model

In [4]:
# Rebuild the same architecture, then restore the weights stored in the .h5 file.
new_model = build_bert_model()
new_model.load_weights(f"{output_path}/trained-models/bert_model-3-val.h5")




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [12]:
# French sentences built around the word "malade", from a single word up to a
# longer sentence.
examples1 = [
    "malade",
    "Je suis malade",
    "Je suis gravement malade",
    "Je suis malade, j'ai pris un medicament",
    "Je suis malade, j'ai pris un medicament, je vais bien maintenant",
]

# English sentences.
examples0 = [
    "Everythings goes bad today, but i have to smile, that's make me happy",
    "I am tired",
    "I am not healthy",
    "I am sick",
    "I am gonna kill you",
    "I am angry about you bastard",
    "Love you",
    "Fuck you",
]

# Short French and Spanish sentences.
examples2 = [
    "Je suis heureux",
    "Je t'aime",
    "Je te haie",
    "Va chier",
    "Va dormir",
    "Te quiero mucho",
    "Como estas",
]

In [5]:
from transformers import DistilBertTokenizer


# Prefer the tokenizer copy previously saved under the outputs directory.
# If it cannot be loaded, fall back to the pretrained multilingual DistilBERT
# tokenizer and save it there for later runs.
try:
    tokenizer = transformers.AutoTokenizer.from_pretrained(f"{output_path}/tokenizers")
except (OSError, ValueError):
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
    tokenizer.save_pretrained(f"{output_path}/tokenizers")

def test_toxic(phrase):
  # Prétraiter la phrase
  inputs = tokenizer(phrase, return_tensors="tf", max_length=MAX_LEN, truncation=True, padding='max_length')

  # Obtenir la prédiction
  predictions = new_model.predict(inputs['input_ids'])

  # Interpréter la prédiction
  toxic_threshold = 0.5  # Définir un seuil
  is_toxic = predictions[0, 0] > toxic_threshold

  per = predictions[0, 0] * 100

  print(f" **{phrase}** a une toxicité de {per:.2f} - [{'toxique' if is_toxic else 'non-toxique'}]")



In [14]:
# Score every sentence in examples1; test_toxic prints one verdict line per phrase.
for phrase in examples1:
  test_toxic(phrase)

 **malade** a une toxicité de 6.90 - [non-toxique]
 **Je suis malade** a une toxicité de 96.90 - [toxique]
 **Je suis gravement malade** a une toxicité de 97.26 - [toxique]
 **Je suis malade, j'ai pris un medicament** a une toxicité de 47.88 - [non-toxique]
 **Je suis malade, j'ai pris un medicament, je vais bien maintenant** a une toxicité de 19.70 - [non-toxique]
