STEP 1 — Open Google Colab & Install Libraries

In [None]:
!pip install streamlit pyngrok scikit-learn joblib


In [None]:
import pandas as pd

# Read the anemia dataset into a DataFrame
df = pd.read_csv("/content/MB-DATA-with-Disease-Type-C.csv")

# Preview the first rows, then count the samples per anemia disease type
first_rows = df.head()
print(first_rows)
type_counts = df['Anemia_Disease_Type'].value_counts()
print(type_counts)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Target: the disease-type column, converted from its original labels to integer codes
le = LabelEncoder()
y = le.fit_transform(df['Anemia_Disease_Type'])

# Features: every column except the identifier and the two label columns
X = df.drop(['PatientID', 'Anemia', 'Anemia_Disease_Type'], axis=1)

# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to both splits
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# The three classifiers being compared
rf = RandomForestClassifier(random_state=42)
lr = LogisticRegression(max_iter=1000)
xgb = XGBClassifier(eval_metric='mlogloss')

# Fit each model on the scaled training split and predict the held-out split
# (same order as before: Random Forest, Logistic Regression, XGBoost)
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)
y_pred_lr = lr.fit(X_train, y_train).predict(X_test)
y_pred_xgb = xgb.fit(X_train, y_train).predict(X_test)

# Report test-set accuracy for every model
for model_label, model_preds in (
    ("Random Forest", y_pred_rf),
    ("Logistic Regression", y_pred_lr),
    ("XGBoost", y_pred_xgb),
):
    print(f"{model_label} Accuracy:", accuracy_score(y_test, model_preds))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # Import numpy for unique label extraction
from sklearn.metrics import classification_report, confusion_matrix

# Test-set predictions of each model, keyed by the name used in reports and plot titles
models = {
    'Random Forest': y_pred_rf,
    'Logistic Regression': y_pred_lr,
    'XGBoost': y_pred_xgb,
}

# Integer class codes that actually occur in the test split ...
unique_test_labels = np.unique(y_test)
# ... and the original label names for those codes, in the same order
target_names_for_report = list(le.classes_[unique_test_labels])

for name, preds in models.items():
    # Restrict the report to the classes present in y_test so that `labels`
    # and `target_names` always line up
    report = classification_report(
        y_test,
        preds,
        labels=unique_test_labels,
        target_names=target_names_for_report,
        zero_division=0,
    )
    print(f"\n{name} Report:\n", report)

    # Confusion matrix over the same set of classes
    cm = confusion_matrix(y_test, preds, labels=unique_test_labels)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=target_names_for_report,
        yticklabels=target_names_for_report,
        ax=ax,
    )
    ax.set_title(f"{name} Confusion Matrix")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    plt.show()

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import joblib

# Load dataset (same absolute Colab path as the first load cell, so this cell
# does not depend on the current working directory)
df = pd.read_csv("/content/MB-DATA-with-Disease-Type-C.csv")

# Features and target: drop the identifier and both label columns from the inputs
X = df.drop(columns=['PatientID','Anemia','Anemia_Disease_Type'])
y = df['Anemia_Disease_Type']

# Encode target labels as integer codes; `le` is saved so the app can map
# predictions back to the original label names
le = LabelEncoder()
y = le.fit_transform(y)

# Train-test split (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: fit on the training split only, then apply to both splits
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Random Forest
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Save model + encoders to the exact paths that app.py loads them from
# ("/content/..."), instead of whatever the working directory happens to be
joblib.dump(rf, "/content/rf_model.pkl")
joblib.dump(scaler, "/content/scaler.pkl")
joblib.dump(le, "/content/label_encoder.pkl")

print("Model and encoders saved successfully!")


In [None]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib

# Load trained model and encoders written by the training cell.
# NOTE: joblib.load unpickles arbitrary objects -- only load files this notebook produced.
rf = joblib.load("/content/rf_model.pkl")
scaler = joblib.load("/content/scaler.pkl")
le = joblib.load("/content/label_encoder.pkl")

# Columns the training cell removes before fitting; they are never model inputs.
NON_FEATURE_COLUMNS = ['PatientID', 'Anemia', 'Anemia_Disease_Type']
# Column names the scaler was fitted on. scikit-learn only records them when the
# scaler was fitted on a DataFrame, so this can be None.
TRAINED_FEATURES = getattr(scaler, "feature_names_in_", None)

st.title("🩸 Anemia Prediction App")

# Choose input method
option = st.radio("Choose Input Method:", ("Upload CSV", "Manual Entry"))

if option == "Upload CSV":
    uploaded_file = st.file_uploader("Upload patient CSV", type="csv")
    if uploaded_file is not None:
        data = pd.read_csv(uploaded_file)

        # Drop the identifier and any label columns if present, so a file that
        # still carries them (e.g. the training CSV) can be scored directly.
        features = data.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

        # When the training column names are known, require all of them and
        # select them in training order; extra columns are ignored.
        missing = []
        if TRAINED_FEATURES is not None:
            missing = [col for col in TRAINED_FEATURES if col not in features.columns]
            if not missing:
                features = features[list(TRAINED_FEATURES)]

        if missing:
            st.error("Uploaded CSV is missing required columns: " + ", ".join(missing))
        elif features.empty:
            st.warning("Uploaded CSV contains no rows to predict.")
        else:
            scaled = scaler.transform(features)
            preds = rf.predict(scaled)
            data['Prediction'] = le.inverse_transform(preds)
            st.write(data)

else:
    st.subheader("Manual Patient Data Entry")
    age = st.number_input("Age", min_value=1, max_value=120)
    gender = st.selectbox("Gender", [0,1])  # 0=Female, 1=Male
    rbc = st.number_input("RBC (10¹²/L)")
    hgb = st.number_input("HGB (g/dL)")
    hct = st.number_input("HCT (%)")
    mcv = st.number_input("MCV (fL)")
    mch = st.number_input("MCH (pg)")
    mchc = st.number_input("MCHC (g/dL)")
    rdw = st.number_input("RDW-CV (%)")

    # Predict only on request; otherwise every rerun would score the
    # widgets' placeholder values before the user has entered anything.
    if st.button("Predict"):
        # NOTE(review): this assumes the training columns are in exactly this
        # order -- confirm against the dataset header.
        input_data = np.array([[age, gender, rbc, hgb, hct, mcv, mch, mchc, rdw]])
        expected_features = getattr(scaler, "n_features_in_", input_data.shape[1])
        if input_data.shape[1] != expected_features:
            st.error(
                f"The model expects {expected_features} features but the form "
                f"provides {input_data.shape[1]}."
            )
        else:
            scaled = scaler.transform(input_data)
            pred = rf.predict(scaled)
            st.write("Prediction:", le.inverse_transform(pred)[0])


Writing app.py
