In [1]:
import sys, os, pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import auc,accuracy_score, confusion_matrix, classification_report

Load the data

In [36]:
current_path=os.getcwd()
loader_path=os.path.abspath(
    os.path.join(current_path, '..', 'disease_prediction','data',
        )
    )
sys.path.append(loader_path)
import datasets as ds
df=ds.load_datasets(
    subsets=['train', 'test', 'validate'],
    directory='../ddx-dataset/'
)
dp_data=pd.concat(
    [df['train'],df['test'],df['validate']],
    axis=0, 
    ignore_index=True
    )

Features and labels

In [37]:
X=dp_data.drop('PATHOLOGY', axis=1)
y=dp_data['PATHOLOGY']

Label Encoding

In [4]:
label_encoder=LabelEncoder()
y_encoded=label_encoder.fit_transform(y)
pathologies=label_encoder.classes_

Features Encoding

In [5]:
numerical_features=X.select_dtypes(include='int64').columns.tolist()
categorical_features=X.select_dtypes(include='object').columns.tolist()
features_preprocessor=ColumnTransformer(
    transformers=[
        ('num',StandardScaler(),numerical_features),
        ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_features)
    ]
)

Data Splitting

In [38]:
X_train, X_test, y_train, y_test=train_test_split(
    X, y_encoded, test_size=0.25, random_state=42
)

X_train_train, X_val, y_train_train, y_val=train_test_split(
    X_train, y_train,
    test_size=0.20,
    random_state=42
)

Define the classification models

In [8]:
models={
    'Logistic Regression': LogisticRegression(max_iter=1000,random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME',random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False,eval_metric='logloss',random_state=42)
}

Function to train models and evaluate the AUC

In [9]:
def model_selection(X_train_train,y_train_train,X_val,y_val):
    auc_scores={}
    for name,model in models.items():
        pipeline=Pipeline(
            steps=[
                ('pre-processing',features_preprocessor),
                ('classifier',model)
            ]
        )
        pipeline.fit(X_train_train,y_train_train)
        y_predicted_probability=pipeline.predict_proba(X_val)[:,1]
        auc_score=roc_auc_score(y_val,y_predicted_probability)
        auc_scores[name]=auc_score
    return auc_scores

Finding the best model for each pathology based on the AUC scores

In [10]:
best_model={}
for pathology in pathologies:
    pathology_index=label_encoder.transform([pathology])[0]
    y_train_binary = (y_train_train==pathology_index).astype(int)
    y_val_binary = (y_val == pathology_index).astype(int)
    auc_scores=model_selection(X_train_train, y_train_binary, X_val, y_val_binary)
    best_model_name=max(auc_scores, key=auc_scores.get)
    best_model[pathology]=(best_model_name,auc_scores[best_model_name])

In [11]:
print("\nBest models for each pathologies:")
for pathology, (model_name, auc_score) in best_model.items():
    print(f"{pathology}: {model_name} (AUC: {round(auc_score,2)})")


Best models for each pathologies:
Allergic sinusitis: Gradient Boosting (AUC: 0.99)
Anaphylaxis: Gradient Boosting (AUC: 0.92)
Chagas: Gradient Boosting (AUC: 0.91)
Ebola: AdaBoost (AUC: 0.93)
HIV (initial infection): Logistic Regression (AUC: 0.88)
Influenza: Gradient Boosting (AUC: 0.9)
Localized edema: Gradient Boosting (AUC: 0.96)
SLE: Gradient Boosting (AUC: 0.93)
Sarcoidosis: AdaBoost (AUC: 0.95)
Tuberculosis: Gradient Boosting (AUC: 0.96)
Whooping cough: Logistic Regression (AUC: 1.0)


Test the best models for each pathologies

In [12]:
test_metrics={}
def predict_with_best_model(model_name,X_train,y_train,X_test):
    model=models[model_name]
    pipeline=Pipeline(
        steps=[
            ('pre-processing',features_preprocessor),
            ('classifier',model)
        ]
    )
    pipeline.fit(X_train,y_train)
    y_predicted=pipeline.predict(X_test)
    return y_predicted

In [13]:
for pathology in pathologies:
    print(f"\nEvaluating best model for {pathology} on the test set")
    pathology_index=label_encoder.transform([pathology])[0]
    y_test_binary = (y_test == pathology_index).astype(int)

    best_model_name,_= best_model[pathology]
    y_predicted=predict_with_best_model(best_model_name,X_train,y_train,X_test)
    y_predicted_binary=(y_predicted==pathology_index).astype(int)

    accuracy = accuracy_score(y_test_binary, y_predicted_binary)
    conf_matrix = confusion_matrix(y_test_binary, y_predicted_binary)
    class_report = classification_report(y_test_binary, y_predicted_binary, target_names=['Other',pathology])

    test_metrics[pathology]={
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix,
        'Classification Report': classification_report
    }

    print(f"Test Accuracy for {pathology} with {best_model_name}: {accuracy}")
    print(f"Confusion Matrix: \n{conf_matrix}")
    print(f"Classification Report: \n{class_report}")


Evaluating best model for Allergic sinusitis on the test set
Test Accuracy for Allergic sinusitis with Gradient Boosting: 0.9653359360241799
Confusion Matrix: 
[[54696  1106]
 [ 1096  6626]]
Classification Report: 
                    precision    recall  f1-score   support

             Other       0.98      0.98      0.98     55802
Allergic sinusitis       0.86      0.86      0.86      7722

          accuracy                           0.97     63524
         macro avg       0.92      0.92      0.92     63524
      weighted avg       0.97      0.97      0.97     63524


Evaluating best model for Anaphylaxis on the test set
Test Accuracy for Anaphylaxis with Gradient Boosting: 0.9136546816951073
Confusion Matrix: 
[[54457   157]
 [ 5328  3582]]
Classification Report: 
              precision    recall  f1-score   support

       Other       0.91      1.00      0.95     54614
 Anaphylaxis       0.96      0.40      0.57      8910

    accuracy                           0.91     63524
 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Test Accuracy for Ebola with AdaBoost: 0.9964895157735659
Confusion Matrix: 
[[63301     0]
 [  223     0]]
Classification Report: 
              precision    recall  f1-score   support

       Other       1.00      1.00      1.00     63301
       Ebola       0.00      0.00      0.00       223

    accuracy                           1.00     63524
   macro avg       0.50      0.50      0.50     63524
weighted avg       0.99      1.00      0.99     63524


Evaluating best model for HIV (initial infection) on the test set
Test Accuracy for HIV (initial infection) with Logistic Regression: 0.8119923178641143
Confusion Matrix: 
[[46887  7418]
 [ 4525  4694]]
Classification Report: 
                         precision    recall  f1-score   support

                  Other       0.91      0.86      0.89     54305
HIV (initial infection)       0.39      0.51      0.44      9219

               accuracy                           0.81     63524
              macro avg       0.65      0.69      0

In [20]:
best_models={
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(algorithm='SAMME', random_state=42)
}

In [21]:
def train_best_models(X_train, y_train):
    trained_models = {}
    for pathology, (model_name, _) in best_model.items():
        model = best_models[model_name]
        pipeline = Pipeline(
            steps=[
                ('pre-processing', features_preprocessor),
                ('classifier', model)
            ]
        )
        pathology_index = label_encoder.transform([pathology])[0]
        y_binary = (y_train == pathology_index).astype(int)
        pipeline.fit(X_train, y_binary)
        trained_models[pathology] = pipeline
    return trained_models

In [22]:

trained_models = train_best_models(X_train, y_train)

In [31]:
def predict_disease(instances, trained_models):
    probabilities = np.zeros((instances.shape[0], len(pathologies)))
    for i, pathology in enumerate(pathologies):
        model = trained_models[pathology]
        probabilities[:, i] = model.predict_proba(instances)[:, 1]
    predictions = np.argmax(probabilities, axis=1)
    predicted_diseases = label_encoder.inverse_transform(predictions)
    return predicted_diseases, probabilities

In [40]:
# Example instances from the test set
X_test = X_test.reset_index(drop=True)
y_test = pd.Series(y_test).reset_index(drop=True)
instances = X_test.sample(5, random_state=42)  
# Predict diseases for the instances
predicted_diseases, probabilities = predict_disease(instances, trained_models)

# Display results
for i, instance_index in enumerate(instances.index):
    actual_disease_index=y_test[instance_index]
    actual_disease = label_encoder.inverse_transform([actual_disease_index])[0]
    print(f"Instance {i+1}:")
    print(f"Actual Disease: {actual_disease}")
    print(f"Predicted Disease: {predicted_diseases[i]}")
    print(f"Probabilities: {probabilities[i]}")
    print("Instance details:")
    print(X_test.loc[instance_index])
    print("-" * 30)

Instance 1:
Actual Disease: Localized edema
Predicted Disease: Ebola
Probabilities: [4.10746047e-03 4.10746047e-03 4.10746047e-03 4.40076107e-01
 7.20226386e-06 4.10746047e-03 4.10746047e-03 4.10746047e-03
 4.40076107e-01 4.10746047e-03 7.20226386e-06]
Instance details:
AGE                   47
SEX                    F
INITIAL_EVIDENCE    pain
swollen_nodes          1
std                    0
                    ... 
breastfed_9            0
confusion              0
contact                0
ebolacase              0
bruising               0
Name: 1786, Length: 93, dtype: object
------------------------------
Instance 2:
Actual Disease: Tuberculosis
Predicted Disease: Allergic sinusitis
Probabilities: [4.03045163e-01 4.03045163e-01 4.03045163e-01 3.75428986e-01
 7.68000680e-05 4.03045163e-01 4.03045163e-01 4.03045163e-01
 3.75428986e-01 4.03045163e-01 7.68000680e-05]
Instance details:
AGE                    63
SEX                     M
INITIAL_EVIDENCE    cough
swollen_nodes           0
