<a href="https://colab.research.google.com/github/meriemben1/projet-big-data/blob/main/Projet_Big_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importation des bibliothèques nécessaires
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length



In [None]:
# Création d'une session Spark
spark = SparkSession.builder \
    .appName("Healthcare Data Processing") \
    .getOrCreate()

In [None]:
#importation de drive
from google.colab import drive
drive.mount("/content/mydrive")


Drive already mounted at /content/mydrive; to attempt to forcibly remount, call drive.mount("/content/mydrive", force_remount=True).


In [None]:
data_path = "/content/mydrive/MyDrive/healthcare/healthcare_-_train.csv.csv"
data = spark.read.csv(data_path, header=True, inferSchema=True)

In [None]:
# Aperçu des données
data.show(5)
data.printSchema()

+----+--------------------+--------------------+
| _c0|              prompt|            response|
+----+--------------------+--------------------+
|NULL|Who is at risk fo...|LCMV infections c...|
|NULL|What are the symp...|LCMV is most comm...|
|NULL|Who is at risk fo...|Individuals of al...|
|NULL|How to diagnose L...|During the first ...|
|NULL|What are the trea...|Aseptic meningiti...|
+----+--------------------+--------------------+
only showing top 5 rows

root
 |-- _c0: string (nullable = true)
 |-- prompt: string (nullable = true)
 |-- response: string (nullable = true)



In [None]:
# Simplification du fichier : sélection des colonnes nécessaires (par exemple, 'prompt' et 'response')
simplified_data = data.select("prompt", "response")

In [None]:
simplified_data.show(5)

In [None]:
# Suppression des doublons
simplified_data = simplified_data.dropDuplicates()

In [None]:
# Filtrage des lignes avec des longueurs de texte inférieures à 10 caractères
simplified_data = simplified_data.filter((length(col("prompt")) >= 10) & (length(col("response")) >= 10))

In [None]:
# Sauvegarde du fichier simplifié dans un nouveau fichier CSV
output_path = "/content/mydrive/MyDrive/healthcare/final_data.csv"
simplified_data.write.csv(output_path, header=True, mode="overwrite")
print("Final simplified data saved at", output_path)

Final simplified data saved at /content/mydrive/MyDrive/healthcare/final_data.csv


##Nettoyage et prétraitement des données


In [None]:
# Sélection des colonnes nécessaires
new_data = data.select("prompt", "response")

In [None]:
# Affichage des doublons globaux et sur une colonne spécifique
print("Doublons globaux :", new_data.dropDuplicates().count() - new_data.count())
print("Doublons dans 'prompt' :", new_data.dropDuplicates(["prompt"]).count() - new_data.count())

Doublons globaux : -12081
Doublons dans 'prompt' : -14173


In [None]:
# Suppression des doublons globaux
df = new_data.dropDuplicates()

In [None]:
# Suppression des doublons basés uniquement sur la colonne 'prompt'
df = df.dropDuplicates(["prompt"])

In [None]:
# Compter les valeurs manquantes
df.select([col(c).isNull().cast("int").alias(c + "_missing") for c in df.columns]).show()

+--------------+----------------+
|prompt_missing|response_missing|
+--------------+----------------+
|             1|               1|
|             0|               0|
|             0|               0|
|             0|               0|
|             0|               0|
|             0|               0|
|             0|               1|
|             0|               0|
|             0|               1|
|             0|               0|
|             0|               0|
|             0|               0|
|             0|               0|
|             0|               1|
|             0|               0|
|             0|               0|
|             0|               0|
|             0|               0|
|             0|               0|
|             0|               1|
+--------------+----------------+
only showing top 20 rows



In [None]:
# Suppression des lignes avec des longueurs de 'prompt' ou 'response' < 10
df = df.filter((length(col("prompt")) >= 10) & (length(col("response")) >= 10))

In [None]:
# Réinitialisation de l'index (non nécessaire avec PySpark, mais les données sont maintenant propres)
df.show()

+--------------------+--------------------+
|              prompt|            response|
+--------------------+--------------------+
| ""A 'rest time' ...| after it has bee...|
| 1 out of 90 Ashk...|     and 1 out of 30|
| 119 people with ...|700 treated with ...|
| 30 days) followi...| the first stage ...|
| 5% has been appr...| does not kill li...|
| Ancylostoma duod...| are worldwide in...|
| Ashkenazi Jews a...| are more likely ...|
|           Bilateral| and Cavitary Ret...|
| CDC estimates th...|000 persons with ...|
| CDC has develope...| hospital-based g...|
|    CK or CKMB tests| and serum myoglo...|
|  Causes of Diabetes| or by calling 18...|
|     Central America| and South Americ...|
|          Charleston|      South Carolina|
| DEC can cause se...| including enceph...|
| Dravet syndrome ...| febrile seizures...|
| For People of Af...|       Mediterranean|
|       HIV infection| a transplant or ...|
| High Blood Pressure|        and Diabetes|
| I will eat _____...| I will ea

##Ajout de nouvelles colonnes

In [None]:
# Importation de 'expr' depuis 'pyspark.sql.functions'
from pyspark.sql.functions import expr

# Ajout d'une colonne pour le nombre de mots dans chaque 'prompt'
df = df.withColumn("prompt_word_count", expr("size(split(prompt, ' '))"))

# Affichage du DataFrame après l'ajout de la colonne
df.show()


+--------------------+--------------------+-----------------+
|              prompt|            response|prompt_word_count|
+--------------------+--------------------+-----------------+
| ""A 'rest time' ...| after it has bee...|               16|
| 1 out of 90 Ashk...|     and 1 out of 30|               11|
| 119 people with ...|700 treated with ...|                9|
| 30 days) followi...| the first stage ...|                8|
| 5% has been appr...| does not kill li...|               32|
| Ancylostoma duod...| are worldwide in...|                6|
| Ashkenazi Jews a...| are more likely ...|                5|
|           Bilateral| and Cavitary Ret...|                2|
| CDC estimates th...|000 persons with ...|                7|
| CDC has develope...| hospital-based g...|                5|
|    CK or CKMB tests| and serum myoglo...|                5|
|  Causes of Diabetes| or by calling 18...|                4|
|     Central America| and South Americ...|                3|
|       

In [None]:
# Ajout d'une colonne pour le nombre de mots dans chaque 'response'
df = df.withColumn("response_word_count", expr("size(split(response, ' '))"))

In [None]:
df.show()

+--------------------+--------------------+-----------------+-------------------+
|              prompt|            response|prompt_word_count|response_word_count|
+--------------------+--------------------+-----------------+-------------------+
| ""A 'rest time' ...| after it has bee...|               16|                  9|
| 1 out of 90 Ashk...|     and 1 out of 30|               11|                  6|
| 119 people with ...|700 treated with ...|                9|                 25|
| 30 days) followi...| the first stage ...|                8|                 15|
| 5% has been appr...| does not kill li...|               32|                 69|
| Ancylostoma duod...| are worldwide in...|                6|                  7|
| Ashkenazi Jews a...| are more likely ...|                5|                 14|
|           Bilateral| and Cavitary Ret...|                2|                  4|
| CDC estimates th...|000 persons with ...|                7|                 35|
| CDC has develo

## Visualisation des données avec matplotlib

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Conversion en Pandas pour visualisation
df_pd = df.toPandas()

##Sauvegarde des données finales

In [None]:
# Sauvegarde du DataFrame final dans un fichier CSV
output_path = "/content/mydrive/MyDrive/healthcare/final_data1.csv"
df.write.csv(output_path, header=True, mode="overwrite")

print("Final data saved at", output_path)

Final data saved at /content/mydrive/MyDrive/healthcare/final_data1.csv


In [None]:
!pip install pyspark



In [None]:
#initialisez une session Spark pour pouvoir manipuler les données avec PySpark :
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

##Chargement et Prétraitement des Données avec PySpark

In [None]:
import os
print(os.getcwd())  # Affiche le répertoire courant

/content


In [None]:
import os
os.chdir('/content/mydrive/MyDrive/healthcare')  # Changez le répertoire de travail
print(os.getcwd())  # Vérifiez que vous êtes dans le bon répertoire

/content/mydrive/MyDrive/healthcare


In [None]:
import os
print(os.path.exists('final_data.csv'))  # Vérifie si le fichier existe

True


In [None]:
data_path = '/content/mydrive/MyDrive/healthcare/final_data.csv'
df_spark = spark.read.csv(data_path, header=True, inferSchema=True)


In [None]:
# Affichage des 10 premières lignes
df_spark.show(10)

+--------------------+--------------------+
|              prompt|            response|
+--------------------+--------------------+
|only four countri...|the Dominican Rep...|
|What are the symp...|After an incubati...|
|in part because t...|a blood cell that...|
|What is (are) Par...|Trichinellosis, a...|
|What are the trea...|There is no cure ...|
|one from each par...|each of their chi...|
|What to do for Wh...|Your health care ...|
|What is (are) Ane...|Anemia is a condi...|
|for patients on h...|many studies show...|
|What to do for Wh...|- Bladder problem...|
+--------------------+--------------------+
only showing top 10 rows



In [None]:
# Suppression des colonnes "prompt_word_count" et "response_word_count"
df_spark = df_spark.drop("prompt_word_count", "response_word_count")

In [None]:
# Affichage des 10 premières lignes après suppression
df_spark.show(10)

+--------------------+--------------------+
|              prompt|            response|
+--------------------+--------------------+
|only four countri...|the Dominican Rep...|
|What are the symp...|After an incubati...|
|in part because t...|a blood cell that...|
|What is (are) Par...|Trichinellosis, a...|
|What are the trea...|There is no cure ...|
|one from each par...|each of their chi...|
|What to do for Wh...|Your health care ...|
|What is (are) Ane...|Anemia is a condi...|
|for patients on h...|many studies show...|
|What to do for Wh...|- Bladder problem...|
+--------------------+--------------------+
only showing top 10 rows



In [None]:
# Échantillonnage aléatoire de 9000 lignes
df_sampled = df_spark.sample(False, 9000 / df_spark.count(), seed=42)

In [None]:
print("Nombre de lignes dans df_sampled :", df_sampled.count())


Nombre de lignes dans df_sampled : 9048


In [None]:
# Séparation en jeu d'entraînement et jeu de test (90/10)
train_df = df_sampled.sample(False, 0.9, seed=42)

test_df = df_sampled.exceptAll(train_df)

In [None]:
# Sauvegarde des datasets sous format JSONL pour être utilisés dans l'entraînement
train_df.write.json('train.jsonl', mode='overwrite')
test_df.write.json('test.jsonl', mode='overwrite')

In [None]:
print("Nombre de lignes dans train_df :", train_df.count())


Nombre de lignes dans train_df : 8163


##Traitement des Datasets avec PySpark

In [None]:
# Transformation des données en format texte pour l'entraînement
def process_data(row):
    system_message = "Hello! I'm here to provide concise information about general health problem, including their causes, symptoms, treatments, and recommended medications. How can I assist you today?"
    prompt = row['prompt']
    response = row['response']
    return {'text': f'[INST] <<SYS>>\n{system_message.strip()}\n<</SYS>>\n\n{prompt} [/INST] {response}'}

In [None]:
train_rdd = train_df.rdd
test_rdd = test_df.rdd

In [None]:
print("Nombre de lignes dans train_rdd :", train_rdd.count())
print("Nombre de lignes dans test_rdd :", test_rdd.count())


Nombre de lignes dans train_rdd : 8163
Nombre de lignes dans test_rdd : 885


In [None]:
# Conversion en DataFrame pour utilisation dans le fine-tuning
train_spark_df = spark.createDataFrame(train_rdd)
test_spark_df = spark.createDataFrame(test_rdd)

In [None]:
# Sauvegarde des datasets prétraités
train_spark_df.write.json('train_processed.jsonl', mode='overwrite')
test_spark_df.write.json('test_processed.jsonl', mode='overwrite')

##Chargement et Fine-Tuning du Modèle

In [None]:
!pip install datasets



In [None]:
#importation de load datasets
from datasets import load_dataset

In [None]:
#verification de la presence exacte de fichier
import os

file_path = '/content/train_processed.jsonl'
if os.path.exists(file_path):
    print(f"File found: {file_path}")
else:
    print(f"File not found: {file_path}")

File found: /content/train_processed.jsonl


In [None]:
#affichage de fichiers existant dans content
import os
print(os.listdir('/content/'))

['.config', 'test.jsonl', 'test_processed.jsonl', 'mydrive', 'train.jsonl', 'train_processed.jsonl', 'sample_data']


In [None]:
import os

file_path = '/content/train_processed.jsonl'

if os.path.isfile(file_path):
    print(f"{file_path} is a valid file.")
elif os.path.isdir(file_path):
    print(f"{file_path} is a directory!")
else:
    print(f"{file_path} does not exist or is not a file.")


/content/train_processed.jsonl is a directory!


In [None]:
import shutil

directory_path = '/content/train_processed.jsonl'

if os.path.isdir(directory_path):
    print(f"Supprimons le répertoire : {directory_path}")
    shutil.rmtree(directory_path)  # Supprime le répertoire et son contenu
else:
    print(f"{directory_path} n'est pas un répertoire.")

Supprimons le répertoire : /content/train_processed.jsonl


In [None]:
# Exemple de création d'un fichier JSONL minimal
import json
import pandas as pd

# Chemin vers le fichier CSV contenant les données
csv_file_path = '/content/mydrive/MyDrive/healthcare/final_data (1).csv'

# Lecture du fichier CSV
try:
    # Charger les données CSV dans un DataFrame
    df = pd.read_csv(csv_file_path)

    # Convertir les données du DataFrame en une liste de dictionnaires
    # Supposons que le fichier CSV a deux colonnes : 'prompt' et 'response'
    data = df.to_dict(orient='records')

    # Chemin de sortie du fichier JSONL
    jsonl_file_path = '/content/train_processed.jsonl'

    # Écriture dans le fichier JSONL
    with open(jsonl_file_path, 'w') as f:
        for record in data:
            f.write(json.dumps(record) + '\n')

    print(f"Fichier JSONL créé avec succès : {jsonl_file_path}")

except FileNotFoundError:
    print(f"Erreur : Le fichier '{csv_file_path}' est introuvable.")
except Exception as e:
    print(f"Une erreur s'est produite : {e}")


Fichier JSONL créé avec succès : /content/train_processed.jsonl


In [None]:
#recharge de donnees
from datasets import load_dataset

train_dataset = load_dataset('json', data_files='/content/train_processed.jsonl', split="train")
print(train_dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['prompt', 'response', 'prompt_word_count', 'response_word_count'],
    num_rows: 14978
})


In [None]:
train_dataset = load_dataset('json', data_files='/content/train_processed.jsonl', split="train")
valid_dataset = load_dataset('json', data_files='/content/train_processed.jsonl', split="train")

In [None]:
!pip install gradio



In [None]:
!pip install transformers accelerate



In [None]:
!pip install transformers torch gradio



In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from google.colab import drive

In [None]:
from pyspark.sql import SparkSession

# Créez la session Spark et chargez le fichier
spark = SparkSession.builder.master("local[*]").appName("Healthcare Chatbot").getOrCreate()
data_path = "/content/mydrive/MyDrive/healthcare/final_data.csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)

# Convertir le DataFrame en Pandas pour plus de flexibilité
df_pd = df.toPandas()

# Vérifier les 5 premières lignes du fichier
print(df_pd.head())


                                              prompt  \
0  only four countries are currently known to be ...   
1  What are the symptoms of Hendra Virus Disease ...   
2  in part because there are no readily available...   
3  What is (are) Parasites - Trichinellosis (also...   
4  What are the treatments for Inclusion Body Myo...   

                                            response  
0                             the Dominican Republic  
1  After an incubation of 9-16 days, infection wi...  
2  a blood cell that can be elevated in the prese...  
3  Trichinellosis, also called trichinosis, is ca...  
4  There is no cure for IBM, nor is there a stand...  


In [None]:
from transformers import AutoTokenizer

# Charger le tokenizer pour BioBERT
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Définir le pad_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})



def tokenize_function(examples):
    # Concaténer 'prompt' et 'response' pour chaque entrée (vous pouvez adapter cette partie en fonction de vos besoins spécifiques)
    # Si vous utilisez BioBERT pour des tâches comme la classification, vous n'avez pas besoin de "prompt" et "response", mais simplement du texte d'entrée.
    texts = [f"[CLS] {prompt} [SEP] {response} [SEP]" for prompt, response in zip(examples['prompt'], examples['response'])]

    # Utilisation du tokenizer pour transformer le texte
    return tokenizer(texts, truncation=True, max_length=512, padding="max_length")

# Appliquer la fonction de tokenisation sur le dataset entier
tokenized_train = train_dataset.map(tokenize_function, batched=True)

# Vérification après la tokenisation
print(tokenized_train[0])



Map:   0%|          | 0/14978 [00:00<?, ? examples/s]

{'prompt': 'Who is at risk for Lymphocytic Choriomeningitis (LCM)? ?', 'response': 'LCMV infections can occur after exposure to fresh urine, droppings, saliva, or nesting materials from infected rodents. Transmission may also occur when these materials are directly introduced into broken skin, the nose, the eyes, or the mouth, or presumably, via the bite of an infected rodent. Person-to-person transmission has not been reported, with the exception of vertical transmission from infected mother to fetus, and rarely, through organ transplantation.', 'prompt_word_count': 9, 'response_word_count': 68, 'input_ids': [101, 101, 1150, 1110, 1120, 3187, 1111, 181, 25698, 13335, 23894, 22572, 20571, 2354, 1158, 10721, 113, 181, 1665, 1306, 114, 136, 136, 102, 181, 1665, 1306, 1964, 16565, 1169, 4467, 1170, 7401, 1106, 4489, 19968, 117, 7367, 1116, 117, 21718, 24186, 117, 1137, 24056, 3881, 1121, 10594, 8335, 5240, 119, 6580, 1336, 1145, 4467, 1165, 1292, 3881, 1132, 2626, 2234, 1154, 3088, 2241, 

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer
from datasets import load_dataset
import os

# Désactiver W&B si non utilisé
os.environ["WANDB_DISABLED"] = "true"

# Charger le modèle pré-entraîné BioBERT
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=2)

# Charger le tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Charger les données depuis un fichier JSONL
dataset = load_dataset('json', data_files='/content/train_processed.jsonl')

# Diviser les données en ensembles d'entraînement et d'évaluation
train_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)['train']
eval_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)['test']

# **1. Ajuster la structure des données**
def adjust_dataset_format(dataset):
    # Vérifiez chaque entrée pour qu'elle ait les bonnes colonnes
    print("Avant ajustement :", dataset[0])  # Debugging
    return dataset.map(lambda x: {"prompt": x["prompt"], "response": x["response"]})

train_dataset = adjust_dataset_format(train_dataset)
eval_dataset = adjust_dataset_format(eval_dataset)

# **2. Vérifiez les colonnes et une entrée**
print("Colonnes train_dataset :", train_dataset.column_names)
print("Première entrée train_dataset :", train_dataset[0])

# **3. Prétraitement et tokenisation**
def preprocess_function(examples):
    print("Exemple avant tokenisation :", examples)  # Debugging
    return tokenizer(
        examples["prompt"],  # Assurez-vous que "prompt" existe
        truncation=True,
        padding="max_length",
        max_length=512
    )

# Appliquer la tokenisation
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

# Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)




Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Avant ajustement : {'prompt': 'What is (are) Mitochondrial encephalomyopathy lactic acidosis and stroke-like episodes ?', 'response': "Mitochondrial encephalomyopathy, lactic acidosis, and stroke-like episodes (MELAS) affects many parts of the body, particularly the brain and nervous system (encephalo-) and muscles (myopathy). Symptoms typically begin in childhood and may include muscle weakness and pain, recurrent headaches, loss of appetite, vomiting, and seizures. Most affected individuals experience stroke-like episodes beginning before age 40. People with MELAS can also have a buildup of lactic acid in their bodies that can lead to vomiting, abdominal pain, fatigue, muscle weakness, and difficulty breathing. The genes associated with MELAS are located in mitochondrial DNA and therefore follow a maternal inheritance pattern (also called mitochondrial inheritance). MELAS can be inherited from the mother only, because only females pass mitochondrial DNA to their children. In some cas

In [None]:
# Assurez-vous que Spark est configuré et les données sont chargées
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialiser la session Spark
spark = SparkSession.builder.appName("Healthcare Chatbot").getOrCreate()

# Charger les données de final_data.csv
data_path = "/content/mydrive/MyDrive/healthcare/final_data (1).csv"
df = spark.read.csv(data_path, header=True, inferSchema=True)
# Convertir en DataFrame Pandas pour une recherche plus facile
df_pd = df.toPandas()

In [None]:
# Vérifier l'état de votre GPU (optionnel)
!nvidia-smi

Wed Dec  4 10:09:22 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   75C    P0              32W /  70W |   2735MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Importer les bibliothèques nécessaires
import gradio as gr
import random
import time
import numpy as np
from transformers import AutoTokenizer
import transformers
import torch

In [None]:
from transformers import pipeline

# Initialisation du pipeline avec un modèle de conversation
pipe = pipeline("text-generation", model="dmis-lab/biobert-base-cased-v1.1")

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
print(train_dataset[0])  # Afficher le premier exemple
print(train_dataset[1])  # Afficher le deuxième exemple


{'prompt': 'What is (are) Mitochondrial encephalomyopathy lactic acidosis and stroke-like episodes ?', 'response': "Mitochondrial encephalomyopathy, lactic acidosis, and stroke-like episodes (MELAS) affects many parts of the body, particularly the brain and nervous system (encephalo-) and muscles (myopathy). Symptoms typically begin in childhood and may include muscle weakness and pain, recurrent headaches, loss of appetite, vomiting, and seizures. Most affected individuals experience stroke-like episodes beginning before age 40. People with MELAS can also have a buildup of lactic acid in their bodies that can lead to vomiting, abdominal pain, fatigue, muscle weakness, and difficulty breathing. The genes associated with MELAS are located in mitochondrial DNA and therefore follow a maternal inheritance pattern (also called mitochondrial inheritance). MELAS can be inherited from the mother only, because only females pass mitochondrial DNA to their children. In some cases, MELAS results f

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Charger le modèle pré-entraîné BioBERT (par exemple pour une tâche de classification ou réponse à des questions)
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=2)  # Ajustez num_labels selon votre tâche

# Préparer les arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,  # Sauvegarde après 100 steps
    evaluation_strategy="steps",  # Évaluation après 100 steps
    eval_steps=100,  # Définir les steps pour l'évaluation
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,  # Charger le meilleur modèle à la fin
)

# Exemple de données d'entraînement (ajustez en fonction de votre tâche spécifique)
train_dataset = ...  # Chargez vos données d'entraînement
eval_dataset = ...  # Chargez vos données d'évaluation

# Initialiser le Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Démarrer l'entraînement
trainer.train()

In [None]:
def chat_response(message):
    system_message = "Welcome! You're now communicating with an AI model trained to assist with information about general health disease. Feel free to ask about causes, symptoms, medications, and treatment options!"
    prompt = f"{system_message}\n\n{message}"  # Définir prompt ici à partir du message de l'utilisateur

    # Générer une réponse avec troncation activée et ajustements pour éviter la répétition
    sequences = pipe(
        prompt,
        do_sample=True,
        top_k=50,
        num_return_sequences=1,
        max_length=200,
        temperature=0.5,
        no_repeat_ngram_size=2,
        truncation=True  # Ajout explicite de la troncation
    )

    # Récupérer la réponse générée et supprimer les balises indésirables
    bot_message = sequences[0]['generated_text']

    # Nettoyer la sortie en enlevant les balises
    bot_message = bot_message.replace('[INST] ', '').replace(' [/INST]', '').replace('<SYS>', '').replace('</SYS>', '')

    # Supprimer tout texte indésirable (exemple : questions répétitives ou réponses incohérentes)
    bot_message = bot_message.split("The following")[0]  # On coupe la réponse au premier passage de "The following"

    return bot_message.strip()  # Supprimer les espaces superflus

# Fonction qui traite le message de l'utilisateur
def respond(message, chat_history):
    bot_message = chat_response(message)  # Utiliser le message de l'utilisateur pour générer une réponse

    # Ajouter les messages sous forme de dictionnaires avec 'role' et 'content'
    chat_history.append({"role": "user", "content": message})
    chat_history.append({"role": "assistant", "content": bot_message})

    return "", chat_history

In [None]:
# Interface Gradio pour le chatbot
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(label="Personal Health Assistant", type='messages')  # Spécifier 'messages' pour le type
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        bot_message = chat_response(message)

        # Ajouter les messages sous forme de dictionnaires avec 'role' et 'content'
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": bot_message})

        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

# Lancer l'application
demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://63f2697d0c175c36e2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://63f2697d0c175c36e2.gradio.live


