### 1. Load Libraries

In [167]:
import numpy as np
import pandas as pd
import json
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

### 2. Load Data


In [168]:
df = pd.read_csv('../data/processed/Top15diseases_clean.csv')
df.head()

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Migraine,acidity,indigestion,headache,blurred_and_distorted_vision,excessive_hunger,stiff_neck,depression,irritability,visual_disturbances,none,none,none,none,none,none,none,none
1,Migraine,indigestion,headache,blurred_and_distorted_vision,excessive_hunger,stiff_neck,depression,irritability,visual_disturbances,none,none,none,none,none,none,none,none,none
2,Migraine,acidity,headache,blurred_and_distorted_vision,excessive_hunger,stiff_neck,depression,irritability,visual_disturbances,none,none,none,none,none,none,none,none,none
3,Migraine,acidity,indigestion,blurred_and_distorted_vision,excessive_hunger,stiff_neck,depression,irritability,visual_disturbances,none,none,none,none,none,none,none,none,none
4,Migraine,acidity,indigestion,headache,excessive_hunger,stiff_neck,depression,irritability,visual_disturbances,none,none,none,none,none,none,none,none,none


In [169]:
# Load symptom and disease encoding dictionaries

with open("../data/processed/name_symptom.json", "r") as f:
    symptom_dict = json.load(f)
with open("../data/processed/name_disease.json", "r") as f:
    disease_dict = json.load(f)

disease_dict_inv = {v: k for k, v in disease_dict.items()}  # Reverse mapping

### 3. Preprocessing


In [170]:
# Encode symptoms
def encode_symptoms(row):
    return [symptom_dict.get(symptom, -1) for symptom in row if symptom != "none"]

df["encoded_symptoms"] = df.iloc[:, :-1].apply(encode_symptoms, axis=1)

# Encode target (disease)
df["encoded_disease"] = df["Disease"].map(disease_dict)

# Fill missing symptom slots with -1 (unknown)
max_symptoms = 17
df["encoded_symptoms"] = df["encoded_symptoms"].apply(lambda x: x + [-1] * (max_symptoms - len(x)))

### 4. Train-Test Split


In [171]:
X = np.array(df["encoded_symptoms"].tolist())
y = df["encoded_disease"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

### 5. Train Random Forest Model


In [172]:
# rf_initial = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_initial.fit(X_train, y_train)

In [173]:
# X_train_df = pd.DataFrame(X_train, columns=["Symptom_1", "Symptom_2", "Symptom_3", "Symptom_4", "Symptom_5","Symptom_6", "Symptom_7", "Symptom_8", "Symptom_9", "Symptom_10","Symptom_11", "Symptom_12", "Symptom_13", "Symptom_14", "Symptom_15","Symptom_16", "Symptom_17"])  # Add all symptom names as per your dataset

# # Feature importance DataFrame
# feature_importance = pd.DataFrame({
#     'Symptom': X_train_df.columns,
#     'Importance': rf_initial.feature_importances_
# }).sort_values(by='Importance', ascending=False)

# # Display the important symptoms
# print(feature_importance)

### 6. Feature Importance & Selection


In [174]:
# # Assuming X is a numpy.ndarray and rf_initial is a fitted RandomForest model
# important_feature_indices = [i for i, importance in enumerate(rf_initial.feature_importances_) if importance > 0.01]

# # Select important features by their indices
# X_train_selected = X_train[:, important_feature_indices]
# X_test_selected = X_test[:, important_feature_indices]


In [179]:
param_grid = {
    'n_estimators': [100, 150],  # Start with a smaller range
    'max_depth': [5, 10],         # Keep it limited
    'min_samples_split': [5, 10], # Limit the values
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced', None],
    'max_features': ['sqrt']
}



rf_tuned = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_tuned, param_grid=param_grid, cv=5, scoring="f1_weighted", n_jobs=1, verbose=2)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_


Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END class_weight=balanced, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END class_weight=balanced, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END class_weight=balanced, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END class_weight=balanced, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END class_weight=balanced, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.0s
[CV] END class_weight=balanced, max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=150; total time=   0.1s
[CV] END class_weight=balanced, max_depth=5, max_features=sqrt, min_samples_leaf=1, mi

### 7. Evaluation


In [180]:
y_pred = best_model.predict(X_test)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Best Parameters: {'class_weight': 'balanced', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 150}
Accuracy: 0.9907407407407407
              precision    recall  f1-score   support

           0       1.00      0.89      0.94        36
           1       1.00      1.00      1.00        36
           2       1.00      1.00      1.00        36
           4       1.00      1.00      1.00        36
           6       1.00      1.00      1.00        36
           7       1.00      1.00      1.00        36
           8       1.00      1.00      1.00        36
           9       1.00      1.00      1.00        36
          10       1.00      1.00      1.00        36
          11       0.90      1.00      0.95        36
          13       1.00      1.00      1.00        36
          14       1.00      1.00      1.00        36

    accuracy                           0.99       432
   macro avg       0.99      0.99      0.99       432
weigh

### 8. Save Model & Feature Selection


In [None]:
joblib.dump(best_model, "../src/models/disease_model.pkl")
joblib.dump(important_feature_indices, "../src/models/selected_features.pkl")

['../src/models/selected_features.pkl']

### 9. Prediction Function


In [None]:
def predict_disease(symptoms):
    # Load the trained model & necessary data
    model = joblib.load("../src/models/disease_model.pkl")  # Ensure the model is trained & saved
    with open("../data/processed/name_symptom.json", "r") as f:
        symptom_dict = json.load(f)
    with open("../data/processed/name_disease.json", "r") as f:
        disease_dict = json.load(f)
    with open("../src/models/selected_features.pkl", "rb") as f:
        important_features = joblib.load(f)

    # Inverse mapping for disease labels
    disease_dict_inv = {v: k for k, v in disease_dict.items()}

    # Remove extra spaces from input symptoms
    symptoms = [symptom.strip() for symptom in symptoms]

    # Convert symptoms to encoded values, use -1 if not found
    encoded_input = [symptom_dict.get(symptom, -1) for symptom in symptoms]

    # Create a feature vector with only important features
    input_vector = np.full(len(important_features), -1)  # Default to -1
    for i, feature in enumerate(important_features):
        if feature in symptom_dict and symptom_dict[feature] in encoded_input:
            input_vector[i] = symptom_dict[feature]

    # Reshape for prediction
    input_vector = input_vector.reshape(1, -1)

    # Predict disease
    predicted_class = model.predict(input_vector)[0]

    # Return the original disease name
    return disease_dict_inv.get(predicted_class, "Unknown Disease")

# Example Test
user_symptoms = ["joint_pain", "fatigue", "excessive_hunger"]
print("Predicted Disease:", predict_disease(user_symptoms))

Predicted Disease: Migraine
