In [1]:
# import pandas as pd
# from sklearn.model_selection import train_test_split, cross_val_score
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.feature_selection import SelectKBest, f_classif
# import pickle

# # Load the dataset
# df = pd.read_csv("blood_report_with_disease.csv")

# # Drop duplicates
# df = df.drop_duplicates()

# # Ensure we are only using numeric columns
# df = df.select_dtypes(include=[float, int]).join(df['Disease'])

# # Separate features (X) and target (y)
# X = df.drop(columns=['Disease'])
# y = df['Disease']

# # Encode target labels if they're categorical
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(y)

# # Split the data before applying SelectKBest
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Feature selection - SelectKBest to reduce to top 5 features on X_train
# selector = SelectKBest(score_func=f_classif, k=5)
# X_train_selected = selector.fit_transform(X_train, y_train)

# # Get the selected feature names and apply to X_test
# selected_feature_names = X_train.columns[selector.get_support()]
# X_test_selected = selector.transform(X_test)

# # Convert selected features back to DataFrame for both train and test sets
# X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_feature_names)
# X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_feature_names)

# # Initialize and train RandomForest
# rf = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=42)
# rf.fit(X_train_selected_df, y_train)

# # Save model and selector
# with open("model_healthcheckpoints.pkl", "wb") as file:
#     pickle.dump((rf, selector), file)

# # Model evaluation
# y_pred = rf.predict(X_test_selected_df)
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Model accuracy on test set: {accuracy * 100:.2f}%")

# # Print detailed classification metrics
# print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv("blood_report_with_disease.csv")

# Drop duplicates and irrelevant columns
df = df.drop_duplicates().drop(columns=['ID'])

# Separate features (X) and target (y)
X = df.drop(columns=['Disease'])
y = df['Disease']

# Encode target labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Save the list of all feature names
all_feature_names = X.columns.tolist()

# Feature selection - SelectKBest to reduce to top 5 features
selector = SelectKBest(score_func=f_classif, k=5)
X_selected = selector.fit_transform(X, y)

# Get selected feature names
selected_feature_names = X.columns[selector.get_support()].tolist()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply the selector to training and test data for consistency
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Initialize RandomForest with limited depth and estimators
rf = RandomForestClassifier(n_estimators=50, max_depth=3, random_state=42)

# Cross-validation for model accuracy estimation
cross_val_scores = cross_val_score(rf, X_train_selected, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy (mean): {cross_val_scores.mean() * 100:.2f}%")

# Train the model
rf.fit(X_train_selected, y_train)

# Save model, selector, selected feature names, and all feature names together
with open("model_healthcheckpoints.pkl", "wb") as file:
    pickle.dump((rf, selector, selected_feature_names, all_feature_names), file)


Cross-validation accuracy (mean): 82.00%
