In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# 2. LOAD & CLEAN DATA
# Read the raw training table; the path is resolved relative to the working directory.
DATA_PATH = "Training.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Peek at the first five rows to sanity-check the load
df.head(n=5)

In [None]:
# Drop every auto-generated "Unnamed: N" column rather than hard-coding index 133.
# pandas assigns these names to header cells that are empty (e.g. when each CSV row
# ends with a trailing comma), so the exact number depends on the file's column count.
# Left in place, such a column would later be zero-filled and treated as a feature.
unnamed_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
df = df.drop(columns=unnamed_cols)

In [None]:
# Keep only the first occurrence of each fully identical row
is_repeat = df.duplicated(keep="first")
df = df.loc[~is_repeat]

In [None]:
# Replace missing values with 0 in the symptom columns only (symptom columns are 0/1).
# A blanket fillna(0) would also write the integer 0 into a missing "prognosis"
# label, producing a bogus class / mixed-type target for the label encoder.
# Column names may still carry stray whitespace at this point, so the target is
# matched on its stripped name.
target_cols = [col for col in df.columns if str(col).strip() == "prognosis"]
if target_cols:
    # Rows without a label cannot be used for supervised training
    df = df.dropna(subset=target_cols)
df = df.fillna({col: 0 for col in df.columns if col not in target_cols})

In [None]:
# Normalise the header: remove leading/trailing whitespace from every column name
df.columns = df.columns.map(str.strip)

In [None]:
# Report the size of the cleaned table
n_rows, n_cols = df.shape
print(f" Dataset loaded: {n_rows} rows, {n_cols} columns")

In [None]:
#  SPLIT FEATURES & TARGET
# The label column becomes y; every remaining column is a model input.
target_column = "prognosis"
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# Show the feature matrix as this cell's output for a quick visual check
X

In [None]:
# Show the target column as this cell's output for a quick visual check
y

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)

# Global plotting defaults: 120 DPI figures drawn with seaborn's "whitegrid" style
plt.rcParams.update({"figure.dpi": 120})
sns.set_style(style="whitegrid")

In [None]:
# Map each prognosis label to an integer id; the fitted encoder is kept so the
# ids can be translated back to label names later.
encoder = LabelEncoder().fit(y)
y_encoded = encoder.transform(y)

In [None]:
# Split into train/test BEFORE scaling (stratify ensures all diseases are represented).
# Splitting the raw features first means the held-out rows never influence the
# preprocessing statistics; fitting the scaler on the full dataset leaks test-set
# information into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2, random_state=42, stratify=y_encoded
)

# Scale the features (optional but good for some models).
# The scaler learns mean/variance from the training rows only and is then
# applied unchanged to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training statistics; kept under its
# original name in case other cells still reference it.
X_scaled = scaler.transform(X)

In [None]:
# Report how many rows ended up in each partition
n_train, n_test = len(X_train), len(X_test)
print(f" Train size: {n_train} | Test size: {n_test}")

In [None]:
#  TRAIN MODEL
# Hyper-parameters gathered in one place: 300 trees, fixed seed for
# reproducibility, n_jobs=-1 to use all available cores.
forest_params = {"n_estimators": 300, "random_state": 42, "n_jobs": -1}
model = RandomForestClassifier(**forest_params)
model.fit(X_train, y_train)
print(" Model training complete")

In [None]:
#  EVALUATE MODEL
# Predict on the held-out rows, then summarise with support-weighted averages.
y_pred = model.predict(X_test)

# Shared options: weight each class by its support; report 0 instead of warning
# when a class has no predicted/true samples.
weighted_avg = {"average": "weighted", "zero_division": 0}

acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, **weighted_avg)
rec = recall_score(y_test, y_pred, **weighted_avg)
f1 = f1_score(y_test, y_pred, **weighted_avg)

print("\n Model Performance Metrics")
for label, score in (
    ("Accuracy ", acc),
    ("Precision", prec),
    ("Recall   ", rec),
    ("F1-score ", f1),
):
    print(f"{label}: {score:.3f}")


In [None]:
# Detailed per-class report.
# `labels` is passed explicitly so the rows always line up with `target_names`:
# when it is omitted, scikit-learn infers the labels from y_test/y_pred and
# raises a ValueError if any class happens to be missing from both (possible
# for rare diseases in a small test split).
class_ids = np.arange(len(encoder.classes_))

print("\n=== Classification Report ===")
print(classification_report(
    y_test, y_pred,
    labels=class_ids,
    target_names=encoder.classes_,
    zero_division=0
))

In [None]:
# Confusion Matrix (Visualization)
# `labels` fixes the matrix to one row/column per known class (in encoder order),
# so its shape does not depend on which classes appear in the test split, and
# lets the axes be annotated with disease names instead of bare indices.
class_ids = np.arange(len(encoder.classes_))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

fig, ax = plt.subplots(figsize=(9, 7))
sns.heatmap(
    cm,
    cmap="Blues",
    cbar=False,
    xticklabels=encoder.classes_,
    yticklabels=encoder.classes_,
    ax=ax,
)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
# Small font so many class names stay legible on both axes
ax.tick_params(axis="both", labelsize=6)
fig.tight_layout()
plt.show()

In [None]:
# Feature Importance (Top 10 Symptoms)
# Pair each importance score from the fitted forest with its column name,
# then keep the ten largest.
importances = pd.Series(data=model.feature_importances_, index=X.columns)
ranked = importances.sort_values(ascending=False)
top10 = ranked.head(10)

print("\n🔍 Top 10 Important Symptoms:")
print(top10)

# Reverse the order for plotting so the most important symptom is drawn at the top
fig, ax = plt.subplots(figsize=(8, 5))
top10.iloc[::-1].plot.barh(ax=ax)
ax.set_title("Top 10 Most Important Symptoms")
ax.set_xlabel("Importance")
fig.tight_layout()
plt.show()

In [None]:
#  SAVE MODEL BUNDLE
# Everything needed at inference time travels together: the classifier, the
# label encoder, the feature scaler and the expected column order.
model_path = "medical_model.pkl"
bundle = dict(
    model=model,
    encoder=encoder,
    scaler=scaler,
    feature_names=list(X.columns),
)

with open(model_path, mode="wb") as out_file:
    pickle.dump(bundle, out_file)

print(f"\n📦 Model saved to: {os.path.abspath(model_path)}")
print(f"Total classes: {len(encoder.classes_)}")