In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("blood_report_with_disease.csv")

# Drop exact duplicate rows so an identical sample cannot end up in both the
# training and the test partition.
df = df.drop_duplicates()

# Separate features and target ('Disease' is the target column)
X = df.drop(columns=['Disease'])
y = df['Disease']

# Encode target labels as integers; label_encoder.classes_ keeps the names
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Train-test split FIRST, so that nothing fitted later ever sees test labels.
# Stratify to keep the (imbalanced) class proportions in both partitions;
# stratification needs at least 2 samples per class, so fall back to a plain
# split when that does not hold.
class_counts = pd.Series(y).value_counts()
stratify_labels = y if class_counts.min() >= 2 else None
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)

# Feature selection to reduce dimensionality (never ask for more features
# than the dataset has).
selector = SelectKBest(score_func=f_classif, k=min(5, X.shape[1]))

# RandomForest with limited depth and fewer estimators
rf = RandomForestClassifier(
    n_estimators=50,       # Fewer trees
    max_depth=3,           # Shallow trees
    random_state=42
)

# Chain selection and classifier so the selector is re-fit on the training
# portion of every CV fold instead of on the full dataset (avoids leakage).
model = Pipeline([
    ("select", selector),
    ("forest", rf),
])

# Cross-validation to assess model performance on multiple folds
cross_val_scores = cross_val_score(model, X_train_raw, y_train, cv=5, scoring='accuracy')
print(f"Cross-validation accuracy (mean): {cross_val_scores.mean() * 100:.2f}%")

# Train the pipeline; this fits `selector` and `rf` in place on training data
model.fit(X_train_raw, y_train)

# Selected-feature views, kept under the original names so that code using
# `rf` directly (e.g. rf.predict(X_test)) continues to work.
X_selected = selector.transform(X)
X_train = selector.transform(X_train_raw)
X_test = selector.transform(X_test_raw)

# Model evaluation - predict and calculate accuracy
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy on test set: {accuracy * 100:.2f}%")

# Print classification report for more detailed performance metrics.
# Passing `labels` explicitly keeps the report aligned with `target_names`
# even if some class is absent from both y_test and y_pred.
print(classification_report(
    y_test,
    y_pred,
    labels=list(range(len(label_encoder.classes_))),
    target_names=label_encoder.classes_,
    zero_division=0,
))


Cross-validation accuracy (mean): 82.00%
Model accuracy on test set: 85.00%
                  precision    recall  f1-score   support

          Anemia       0.60      0.19      0.29        16
    Leukocytosis       0.67      1.00      0.80         2
      Leukopenia       1.00      0.89      0.94         9
          Normal       0.84      1.00      0.91        48
Thrombocytopenia       0.89      0.96      0.92        25

        accuracy                           0.85       100
       macro avg       0.80      0.81      0.77       100
    weighted avg       0.83      0.85      0.82       100

