In [16]:
# 1. Import Libraries
import pandas as pd
import numpy as np

# 2. Load the Dataset
df = pd.read_csv('/content/drive/MyDrive/symptom_checker_project/data/dataset.csv')  # or .xlsx using pd.read_excel

# 3. Explore
print(df.columns)
df.head()
df.info()

# 4. Clean Nulls & Duplicates
#df.drop_duplicates(inplace=True)
#df.dropna(inplace=True)

# 5. Preview Sample Row
print(df.iloc[0])


Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4920 entries, 0 to 4919
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Disease     4920 non-null   object
 1   Symptom_1   4920 non-null   object
 2   Symptom_2   4920 non-null   object
 3   Symptom_3   4920 non-null   object
 4   Symptom_4   4572 non-null   object
 5   Symptom_5   3714 non-null   object
 6   Symptom_6   2934 non-null   object
 7   Symptom_7   2268 non-null   object
 8   Symptom_8   1944 non-null   object
 9   Symptom_9   1692 non-null   object
 10  Symptom_10  1512 non-null   object
 11  Symptom_11  1194 non-null   object
 12  Symptom_12  744 non-null    object
 1

In [17]:
def combine_symptoms(row):
    symptoms = [str(symptom).strip() for symptom in row if pd.notna(symptom)]
    return ", ".join(symptoms)

# Extract symptom columns
symptom_data = df.iloc[:, 1:]  # Assuming first column is 'Disease'

# Combine all symptom columns into a single string
df['symptom_text'] = symptom_data.apply(combine_symptoms, axis=1)

# Assign the disease label from the first column
df['disease'] = df.iloc[:, 0]

# Keep only needed columns
df = df[['symptom_text', 'disease']]
df.head()


Unnamed: 0,symptom_text,disease
0,"itching, skin_rash, nodal_skin_eruptions, disc...",Fungal infection
1,"skin_rash, nodal_skin_eruptions, dischromic _p...",Fungal infection
2,"itching, nodal_skin_eruptions, dischromic _pat...",Fungal infection
3,"itching, skin_rash, dischromic _patches",Fungal infection
4,"itching, skin_rash, nodal_skin_eruptions",Fungal infection


In [18]:
# Split data
X = df['symptom_text']
y = df['disease']

from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = to_categorical(y_encoded)

# Update train-test split
X_train, X_test, y_train_enc, y_test_enc = train_test_split(X, y_encoded, test_size=0.2, random_state=42)
y_train_cat = to_categorical(y_train_enc)
y_test_cat = to_categorical(y_test_enc)




In [19]:
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [21]:
model.fit(X_train_tfidf.toarray(), y_train_cat,
          validation_split=0.1, epochs=10, batch_size=32)


Epoch 1/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.2597 - loss: 3.5059 - val_accuracy: 0.9695 - val_loss: 1.8782
Epoch 2/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.8944 - loss: 1.3993 - val_accuracy: 1.0000 - val_loss: 0.1416
Epoch 3/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9775 - loss: 0.2874 - val_accuracy: 1.0000 - val_loss: 0.0298
Epoch 4/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9873 - loss: 0.1302 - val_accuracy: 1.0000 - val_loss: 0.0104
Epoch 5/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9939 - loss: 0.0773 - val_accuracy: 1.0000 - val_loss: 0.0050
Epoch 6/10
[1m111/111[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9956 - loss: 0.0581 - val_accuracy: 1.0000 - val_loss: 0.0027
Epoch 7/10
[1m111/111[0m 

<keras.src.callbacks.history.History at 0x7d1c1e1337d0>

In [22]:
loss, accuracy = model.evaluate(X_test_tfidf.toarray(), y_test_cat)
print("Test Accuracy:", accuracy)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 4.4061e-04
Test Accuracy: 1.0


In [23]:
def predict_disease_neural(text):
    vec = vectorizer.transform([text])
    prediction = model.predict(vec.toarray())
    class_idx = prediction.argmax(axis=1)[0]
    return label_encoder.inverse_transform([class_idx])[0]

# Example
print(predict_disease_neural("fever, chills, joint pain"))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
Malaria


In [24]:
# Example input
input_symptom_text = "itching, skin rash, nodal skin eruptions"

# Vectorize input
input_vec = vectorizer.transform([input_symptom_text])

# Predict using the trained model
prediction = model.predict(input_vec.toarray())
predicted_class_index = prediction.argmax(axis=1)[0]

# Convert to actual disease name
predicted_disease = label_encoder.inverse_transform([predicted_class_index])[0]


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step


In [25]:
import pandas as pd

desc_df = pd.read_csv("/content/drive/MyDrive/symptom_checker_project/data/symptom_Description.csv")


In [26]:
desc_df['Disease'] = desc_df['Disease'].str.lower().str.strip()
desc_df['Description'] = desc_df['Description'].str.strip()
predicted_disease = predicted_disease.lower().strip()
desc_dict = dict(zip(desc_df['Disease'], desc_df['Description']))
description = desc_dict.get(predicted_disease, "Description not available.")
# print("Predicted Disease:", predicted_disease)
# print("Description:", description)


In [27]:
precaution_df = pd.read_csv("/content/drive/MyDrive/symptom_checker_project/data/symptom_precaution.csv")

# Standardize the 'Disease' column
precaution_df['Disease'] = precaution_df['Disease'].str.lower().str.strip()
# Combine all precaution columns into a list per row
precaution_df['Precautions'] = precaution_df.iloc[:, 1:].apply(lambda row: [prec for prec in row if pd.notna(prec)], axis=1)

# Create dictionary
precaution_dict = dict(zip(precaution_df['Disease'], precaution_df['Precautions']))
precautions = precaution_dict.get(predicted_disease, ["No precautions available."])






In [28]:
# Standardize predicted disease
predicted_disease = predicted_disease.lower().strip()

# Get description
description = desc_dict.get(predicted_disease, "Description not available.")

# Get precautions
precautions = precaution_dict.get(predicted_disease, ["No precautions available."])

# Print output
print(f"🦠 Predicted Disease: {predicted_disease.title()}")
print(f"📘 Description: {description}")
print("🛡️ Precautions:")
for i, p in enumerate(precautions, 1):
    print(f"{i}. {p}")


🦠 Predicted Disease: Chronic Cholestasis
📘 Description: Chronic cholestatic diseases, whether occurring in infancy, childhood or adulthood, are characterized by defective bile acid transport from the liver to the intestine, which is caused by primary damage to the biliary epithelium in most cases
🛡️ Precautions:
1. cold baths
2. anti itch medicine
3. consult doctor
4. eat healthy


In [29]:
import os

# Define your project path
project_dir = "/content/drive/MyDrive/symptom_checker_project/models"

# Create directory if it doesn't exist
os.makedirs(project_dir, exist_ok=True)
model_path = os.path.join(project_dir, "symptom_diagnosis_model_keras.h5")
model.save(model_path)
import pickle

# Save vectorizer
with open(os.path.join(project_dir, "vectorizer.pkl"), "wb") as f:
    pickle.dump(vectorizer, f)

# Save label encoder
with open(os.path.join(project_dir, "label_encoder.pkl"), "wb") as f:
    pickle.dump(label_encoder, f)



