In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [2]:
# Paths
# The defaults are the locations originally used. To run the notebook on
# another machine without editing this cell, set BLOOD_CSV_PATH and/or
# BLOOD_SAVE_DIR in the environment; an unset or empty variable falls back
# to the default.
csv_path = os.environ.get("BLOOD_CSV_PATH") or r"C:\Users\lokan\OneDrive\Desktop\DSP PROJECT\Blood-donation.csv"
save_dir = os.environ.get("BLOOD_SAVE_DIR") or r"C:\Users\lokan\OneDrive\Desktop\DSP PROJECT"
# Create the output directory up front so the save step cannot fail on a missing folder.
os.makedirs(save_dir, exist_ok=True)



In [3]:
# Load the raw table. The delimiter is sniffed by the python engine, and the
# usual blank / "NA" spellings are read as missing values.
missing_markers = ["NA","N/A",""," "]
df = pd.read_csv(csv_path, sep=None, engine="python", na_values=missing_markers)
# Normalise headers: trim surrounding whitespace, spaces -> underscores.
df = df.rename(columns=lambda label: label.strip().replace(" ", "_"))


In [4]:
# Target: first column whose name equals "category", ignoring case.
target_col = None
for column in df.columns:
    if column.lower()=="category":
        target_col = column
        break
# Fail early with a clear message rather than a KeyError further down.
if target_col is None:
    raise ValueError("Target column 'Category' not found")


In [5]:
# Numeric and categorical features
#
# Column names are matched case-insensitively against the frame's actual
# headers, so "age" / "AGE" / "Age" all resolve to whatever spelling the
# file uses. Candidates that are absent from the file are skipped.
num_candidates = ["Age","ALB","ALP","ALT","AST","BIL","CHE","CHOL","CREA","GGT","PROT"]
present_upper = {c.upper(): c for c in df.columns}
num_cols = [present_upper[n.upper()] for n in num_candidates if n.upper() in present_upper]
for c in num_cols:
    # Entries that cannot be parsed as numbers become NaN.
    df[c] = pd.to_numeric(df[c], errors="coerce")

cat_cols = []
# Look the sex column up the same case-insensitive way as the numeric ones.
sex_col = present_upper.get("SEX")
if sex_col is not None:
    # Remember which rows are missing BEFORE casting: on pandas < 3,
    # astype(str) turns NaN into the literal string "nan", which would then be
    # treated as a real category instead of a missing value to impute.
    sex_missing = df[sex_col].isna()
    sex = df[sex_col].astype(str).str.strip().str.lower()
    # Collapse the known spellings onto the two canonical codes.
    sex = sex.mask(sex.isin(["male","m","1"]), "m")
    sex = sex.mask(sex.isin(["female","f","0"]), "f")
    # Restore the missing entries so they stay NaN for downstream imputation.
    df[sex_col] = sex.mask(sex_missing, np.nan)
    cat_cols.append(sex_col)


In [6]:
# Prepare the modelling frame: rows without a label are unusable, so drop them
# and renumber the index.
has_label = df[target_col].notna()
df = df[has_label].reset_index(drop=True)

# Features are the numeric columns followed by the categorical ones.
feature_cols = num_cols + cat_cols
X = df.loc[:, feature_cols].copy()

# Labels as whitespace-trimmed strings.
y = df[target_col].astype(str).str.strip()


In [7]:
# Preprocessing
#   numeric     : median imputation, then standardisation
#   categorical : most-frequent imputation, then one-hot encoding
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Categories unseen during fit are ignored rather than raising; output is dense.
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
num_pipe = Pipeline(steps=numeric_steps)
cat_pipe = Pipeline(steps=categorical_steps)

# Route each column group through its pipeline; any other column is dropped.
pre = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)],
    remainder="drop",
)


In [8]:
# 80/20 train/test split. Stratify on the label only when that is feasible:
# more than one class, and every class has at least two rows.
class_sizes = y.value_counts()
if y.nunique()>1 and class_sizes.min()>=2:
    strat = y
else:
    strat = None

split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=strat)
X_train, X_test, y_train, y_test = split


In [9]:
# Models
#
# Each pipeline receives its own unfitted copy of the preprocessor via
# clone(pre). Pipeline.fit fits its steps in place, so passing the very same
# `pre` object to all three pipelines would make them share one fitted
# transformer: fitting any pipeline would overwrite the preprocessing state
# used by the others (including a pipeline that is later pickled).

# Multinomial logistic regression; class weights offset the label imbalance.
pipe_lr = Pipeline([("pre", clone(pre)), ("clf", LogisticRegression(
    max_iter=5000, class_weight="balanced", solver="lbfgs"
))])

# Random forest with per-bootstrap class re-weighting; uses all CPU cores.
pipe_rf = Pipeline([("pre", clone(pre)), ("clf", RandomForestClassifier(
    n_estimators=600, min_samples_leaf=2, class_weight="balanced_subsample",
    random_state=42, n_jobs=-1
))])

# Histogram-based gradient boosting with unrestricted tree depth.
pipe_hgb = Pipeline([("pre", clone(pre)), ("clf", HistGradientBoostingClassifier(
    max_depth=None, learning_rate=0.08, max_iter=500, random_state=42
))])


In [10]:
# Fit every candidate on the training split and score it on the held-out split.
models = [
    ("LogReg", pipe_lr),
    ("RandomForest", pipe_rf),
    ("HGB", pipe_hgb),
]

# One (name, fitted pipeline, test predictions, accuracy, macro-F1) tuple per model.
scores = []
for name, pipe in models:
    pred = pipe.fit(X_train, y_train).predict(X_test)
    acc, f1m = accuracy_score(y_test, pred), f1_score(y_test, pred, average="macro")
    scores.append((name, pipe, pred, acc, f1m))


In [11]:
# Pick the candidate with the highest macro-F1 (index 4 of each score tuple);
# on a tie the earliest entry in `scores` wins.
# NOTE(review): selection uses the same test split whose metrics are printed
# below, so the reported numbers are likely optimistic - consider selecting
# on a validation split or with cross-validation.
best_name, best_pipe, best_pred, best_acc, best_f1 = max(scores, key=lambda entry: entry[4])

report = classification_report(y_test, best_pred)
matrix = confusion_matrix(y_test, best_pred)

print(f"Model: {best_name}")
print(f"Accuracy: {best_acc:.4f}")
print(f"F1-macro: {best_f1:.4f}")
print("\nClassification Report:\n", report)
print("\nConfusion Matrix:\n", matrix)


Model: LogReg
Accuracy: 0.9268
F1-macro: 0.7331

Classification Report:
                         precision    recall  f1-score   support

         0=Blood Donor       0.99      0.95      0.97       107
0s=suspect Blood Donor       0.50      1.00      0.67         1
           1=Hepatitis       0.43      0.60      0.50         5
            2=Fibrosis       0.67      1.00      0.80         4
           3=Cirrhosis       0.80      0.67      0.73         6

              accuracy                           0.93       123
             macro avg       0.68      0.84      0.73       123
          weighted avg       0.94      0.93      0.93       123


Confusion Matrix:
 [[102   0   4   0   1]
 [  0   1   0   0   0]
 [  1   0   3   1   0]
 [  0   0   0   4   0]
 [  0   1   0   1   4]]


In [12]:
# Persist the selected pipeline and its held-out predictions under save_dir.
pred_path = os.path.join(save_dir, "blood_predictions.csv")
model_path = os.path.join(save_dir, f"23MID0355-blood_model_{best_name}.pkl")
joblib.dump(best_pipe, model_path)

# Test features plus the true label and the model's prediction, one row per test sample.
out = X_test.assign(**{target_col: y_test.values, "pred": best_pred})
out.to_csv(pred_path, index=False)

print("\nSaved:")
print("Model->", model_path)
print("Preds->",pred_path)


Saved:
Model-> C:\Users\lokan\OneDrive\Desktop\DSP PROJECT\23MID0355-blood_model_LogReg.pkl
Preds-> C:\Users\lokan\OneDrive\Desktop\DSP PROJECT\blood_predictions.csv
