In [44]:
import streamlit as st
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D


In [45]:
# Chargement du dataset
data = pd.read_csv('Diseases_Symptoms.csv')

In [46]:
data.head(10)

Unnamed: 0,Code,Name,Symptoms,Treatments
0,1,Panic disorder,"Palpitations, Sweating, Trembling, Shortness o...","Antidepressant medications, Cognitive Behavior..."
1,2,Vocal cord polyp,"Hoarseness, Vocal Changes, Vocal Fatigue","Voice Rest, Speech Therapy, Surgical Removal"
2,3,Turner syndrome,"Short stature, Gonadal dysgenesis, Webbed neck...","Growth hormone therapy, Estrogen replacement t..."
3,4,Cryptorchidism,"Absence or undescended testicle(s), empty scro...",Observation and monitoring (in cases of mild o...
4,5,Ethylene glycol poisoning-1,"Nausea, vomiting, abdominal pain, General mala...","Supportive Measures, Gastric Decontamination, ..."
5,6,Ethylene glycol poisoning-2,"Metabolic acidosis, apid breathing, rapid hear...","Blood tests, Supportive Measures, Gastric Deco..."
6,7,Ethylene glycol poisoning-3,"Decreased urine output, swelling in the legs a...","Supportive Measures, Gastric Decontamination, ..."
7,8,Atrophic vaginitis,"Vaginal dryness, Vaginal burning, frequent uri...","Vaginal moisturizers, Vaginal estrogen therapy..."
8,9,Fracture,"Pain, Swelling, Bruising, Deformity, Difficult...","Immobilization, Surgery, Rehabilitation"
9,10,Cellulitis,"Redness, Pain, tenderness, Swelling, Skin chan...","Antibiotics, Warm compresses, immobilization, ..."


In [47]:
# Convert all text columns to lowercase
df = data.apply(lambda x: x.str.lower() if x.dtype == 'object' else x)

In [48]:
# Removing duplicates
df = df.drop_duplicates()
df.head(10)

Unnamed: 0,Code,Name,Symptoms,Treatments
0,1,panic disorder,"palpitations, sweating, trembling, shortness o...","antidepressant medications, cognitive behavior..."
1,2,vocal cord polyp,"hoarseness, vocal changes, vocal fatigue","voice rest, speech therapy, surgical removal"
2,3,turner syndrome,"short stature, gonadal dysgenesis, webbed neck...","growth hormone therapy, estrogen replacement t..."
3,4,cryptorchidism,"absence or undescended testicle(s), empty scro...",observation and monitoring (in cases of mild o...
4,5,ethylene glycol poisoning-1,"nausea, vomiting, abdominal pain, general mala...","supportive measures, gastric decontamination, ..."
5,6,ethylene glycol poisoning-2,"metabolic acidosis, apid breathing, rapid hear...","blood tests, supportive measures, gastric deco..."
6,7,ethylene glycol poisoning-3,"decreased urine output, swelling in the legs a...","supportive measures, gastric decontamination, ..."
7,8,atrophic vaginitis,"vaginal dryness, vaginal burning, frequent uri...","vaginal moisturizers, vaginal estrogen therapy..."
8,9,fracture,"pain, swelling, bruising, deformity, difficult...","immobilization, surgery, rehabilitation"
9,10,cellulitis,"redness, pain, tenderness, swelling, skin chan...","antibiotics, warm compresses, immobilization, ..."


In [49]:
# Basic info about the dataset
print(data.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Code        400 non-null    int64 
 1   Name        400 non-null    object
 2   Symptoms    400 non-null    object
 3   Treatments  399 non-null    object
dtypes: int64(1), object(3)
memory usage: 12.6+ KB
None


In [51]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Concatenate all symptoms into a single string
all_symptoms = ' '.join(data['Symptoms'])

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_symptoms)

# Plot the word cloud
plt.figure(figsize=(12, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Word Cloud of Symptoms')
plt.axis('off')
plt.show()


In [60]:
# Count of symptoms (assuming each symptom is separated by ', ')
symptom_counts = df['Symptoms'].str.split(', ').apply(len)

# Count of symptoms
plt.subplot(1, 3, 2)
symptom_counts.value_counts().sort_index().plot(kind='bar', color='salmon')
plt.title('Count of Symptoms')
plt.xlabel('Number of Symptoms')
plt.ylabel('Count')

Text(0, 0.5, 'Count')

In [57]:
# Convertir les valeurs float en chaînes de caractères
data['Symptoms'] = data['Symptoms'].astype(str)
data['Treatments'] = data['Treatments'].astype(str)

In [None]:
# Convertir les valeurs float en chaînes de caractères
data['Symptoms'] = data['Symptoms'].astype(str)
data['Treatments'] = data['Treatments'].astype(str)

In [58]:
# Nettoyage et prétraitement des données
def preprocess_text(text):
    # Tokenisation
    tokens = word_tokenize(text.lower())
    # Suppression des stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Lemmatisation
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)


data['Symptoms'] = data['Symptoms'].apply(preprocess_text)
data['Treatments'] = data['Treatments'].apply(preprocess_text)

In [59]:
# Tokenisation des textes
max_words = 500
tokenizer = Tokenizer(num_words=max_words, split=' ')
tokenizer.fit_on_texts(data['Symptoms'] + ' ' + data['Treatments'] + ' ' + data['Name'])
X = tokenizer.texts_to_sequences(data['Symptoms'] + ' ' + data['Treatments'] + ' ' + data['Name'])
X = pad_sequences(X)

In [None]:
# Téléchargement des ressources NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Création des labels
y = pd.get_dummies(data['Code']).values

# Séparation des données d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Définition du modèle RNN
embedding_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(400, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\patel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\patel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\patel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!






In [None]:
# Entraînement du modèle
batch_size = 32
epochs = 10
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), 
                    epochs=epochs, batch_size=batch_size, verbose=2)



Epoch 1/10


10/10 - 13s - loss: 5.9947 - accuracy: 0.0000e+00 - val_loss: 6.0004 - val_accuracy: 0.0000e+00 - 13s/epoch - 1s/step
Epoch 2/10
10/10 - 4s - loss: 5.9800 - accuracy: 0.0250 - val_loss: 6.0166 - val_accuracy: 0.0000e+00 - 4s/epoch - 428ms/step
Epoch 3/10
10/10 - 4s - loss: 5.9681 - accuracy: 0.0688 - val_loss: 6.4464 - val_accuracy: 0.0000e+00 - 4s/epoch - 427ms/step
Epoch 4/10
10/10 - 4s - loss: 5.9088 - accuracy: 0.0063 - val_loss: 6.1176 - val_accuracy: 0.0000e+00 - 4s/epoch - 404ms/step
Epoch 5/10
10/10 - 4s - loss: 5.8780 - accuracy: 0.0125 - val_loss: 7.0172 - val_accuracy: 0.0000e+00 - 4s/epoch - 410ms/step
Epoch 6/10
10/10 - 4s - loss: 5.8077 - accuracy: 0.0063 - val_loss: 6.8610 - val_accuracy: 0.0000e+00 - 4s/epoch - 417ms/step
Epoch 7/10
10/10 - 5s - loss: 5.7282 - accuracy: 0.0094 - val_loss: 7.7180 - val_accuracy: 0.0000e+00 - 5s/epoch - 488ms/step
Epoch 8/10
10/10 - 5s - loss: 5.5704 - accuracy: 0.0094 - val_loss: 8.6622 - val_accuracy: 0.0000e+00 - 5s/epoch 

In [None]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("Input shape of the model:", model.input_shape)


X_train shape: (320, 71)
y_train shape: (320, 400)
Input shape of the model: (None, 71)


In [None]:

print("Unique values in y_train:", np.unique(y_train))
print("Unique values in y_test:", np.unique(y_test))


Unique values in y_train: [False  True]
Unique values in y_test: [False  True]


In [None]:
# Fonction pour trouver la réponse à une question
def get_answer(question):
    # Prétraitement de la question
    question = preprocess_text(question)
    texts = data['Symptoms'] + ' ' + data['Treatments'] + ' ' + data['Name']
    # Vectorisation des textes
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)
    question_vector = vectorizer.transform([question])
    # Calcul de la similarité cosinus entre la question et les textes
    similarities = cosine_similarity(X, question_vector)
    # Récupération de l'index de la réponse la plus similaire
    idx = similarities.argmax()
    return data.iloc[idx]['Name'], data.iloc[idx]['Symptoms'], data.iloc[idx]['Treatments']
