In [1]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import ast
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn import metrics
import seaborn as sns
import re
import os
import pickle
from constants import base_path, model_list, pathology_scope
import warnings
warnings.filterwarnings("ignore")



In [2]:
# Load the released train and test patient tables (the files carry no extension
# but are parsed as CSV).
train_patients_file = f"{base_path}\\input\\release_train_patients"
test_patients_file = f"{base_path}\\input\\release_test_patients"
diagnosis_df_train = pd.read_csv(train_patients_file)
diagnosis_df_test = pd.read_csv(test_patients_file)

In [3]:
# Column dtypes and non-null counts of the raw train table.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
diagnosis_df_train.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025602 entries, 0 to 1025601
Data columns (total 6 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   AGE                     1025602 non-null  int64 
 1   DIFFERENTIAL_DIAGNOSIS  1025602 non-null  object
 2   SEX                     1025602 non-null  object
 3   PATHOLOGY               1025602 non-null  object
 4   EVIDENCES               1025602 non-null  object
 5   INITIAL_EVIDENCE        1025602 non-null  object
dtypes: int64(1), object(5)
memory usage: 46.9+ MB


In [17]:
# Normalized frequency of each pathology in the train set, restricted to
# `pathology_scope` when a scope is configured (otherwise every pathology is shown).
# The result is the cell's last top-level expression so that Jupyter renders it;
# an expression nested inside an `if` body is evaluated but never displayed.
train_pathology_freq = diagnosis_df_train.PATHOLOGY.value_counts(normalize=True)
if pathology_scope:
    train_pathology_freq = train_pathology_freq.loc[pathology_scope]
train_pathology_freq

Tuberculosis               0.015839
GERD                       0.025330
SLE                        0.011571
HIV (initial infection)    0.028289
Pulmonary neoplasm         0.014096
Name: PATHOLOGY, dtype: float64

In [16]:
# Bar chart of normalized pathology frequencies in the train set, saved as a JPEG.
# With a scope configured only those pathologies are drawn (sorted, horizontal bars);
# otherwise every pathology is drawn as vertical bars.
train_freq = diagnosis_df_train.PATHOLOGY.value_counts(normalize=True)
if pathology_scope:
    train_freq.loc[pathology_scope].sort_values().plot.barh(figsize=(12,3))
    plt.xlabel("Normalized Frequency")
else:
    train_freq.plot.bar(figsize=(12,3))
    plt.ylabel("Normalized Frequency")
plt.title("Pathology Frequency (Train Set)")
# savefig does not create missing folders, and this cell runs before any other cell
# creates the output directory, so make sure it exists first.
os.makedirs(f'{base_path}\\output\\diseases', exist_ok=True)
plt.savefig(f'{base_path}\\output\\diseases\\train_pathology_freq.jpg', bbox_inches='tight')
plt.clf()

<Figure size 1200x300 with 0 Axes>

In [18]:
# Column dtypes and non-null counts of the raw test table.
# `show_counts` replaces `null_counts`, which was deprecated in pandas 1.2 and removed in 2.0.
diagnosis_df_test.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134529 entries, 0 to 134528
Data columns (total 6 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   AGE                     134529 non-null  int64 
 1   DIFFERENTIAL_DIAGNOSIS  134529 non-null  object
 2   SEX                     134529 non-null  object
 3   PATHOLOGY               134529 non-null  object
 4   EVIDENCES               134529 non-null  object
 5   INITIAL_EVIDENCE        134529 non-null  object
dtypes: int64(1), object(5)
memory usage: 6.2+ MB


In [19]:
# Normalized frequency of each pathology in the test set, restricted to
# `pathology_scope` when a scope is configured (otherwise every pathology is shown).
# The result is the cell's last top-level expression so that Jupyter renders it;
# an expression nested inside an `if` body is evaluated but never displayed.
test_pathology_freq = diagnosis_df_test.PATHOLOGY.value_counts(normalize=True)
if pathology_scope:
    test_pathology_freq = test_pathology_freq.loc[pathology_scope]
test_pathology_freq

Tuberculosis               0.015461
GERD                       0.026336
SLE                        0.011626
HIV (initial infection)    0.029131
Pulmonary neoplasm         0.014257
Name: PATHOLOGY, dtype: float64

In [21]:
# Bar chart of normalized pathology frequencies in the test set, saved as a JPEG.
# With a scope configured only those pathologies are drawn (sorted, horizontal bars);
# otherwise every pathology is drawn as vertical bars.
test_freq = diagnosis_df_test.PATHOLOGY.value_counts(normalize=True)
if pathology_scope:
    test_freq.loc[pathology_scope].sort_values().plot.barh(figsize=(12,3))
    plt.xlabel("Normalized Frequency")
else:
    test_freq.plot.bar(figsize=(12,3))
    plt.ylabel("Normalized Frequency")
plt.title("Pathology Frequency (Test Set)")
# savefig does not create missing folders, so make sure the output directory exists first.
os.makedirs(f'{base_path}\\output\\diseases', exist_ok=True)
plt.savefig(f'{base_path}\\output\\diseases\\test_pathology_freq.jpg', bbox_inches='tight')
plt.clf()

<Figure size 1200x300 with 0 Axes>

In [22]:
# Load the released condition metadata. The encoding is pinned to UTF-8 so the read
# does not depend on the platform's default text encoding.
with open(f"{base_path}\\input\\release_conditions.json", encoding="utf-8") as f:
  disease_dict = json.load(f)
# Work on the configured scope when one is set; otherwise on every condition key.
disease_list = pathology_scope if pathology_scope else list(disease_dict.keys())

In [23]:
# Load the released evidence metadata. The encoding is pinned to UTF-8 so the read
# does not depend on the platform's default text encoding.
with open(f"{base_path}\\input\\release_evidences.json", encoding="utf-8") as f:
  evidences = json.load(f)

evidences_list = []   # codes of the evidences kept as model features
evidences_dict = {}   # feature name -> English question text, used to label charts
for evidence_code, evidence_info in evidences.items():
  # only binary symptoms and antecedents, i.e. entries with an empty "possible-values"
  if not evidence_info["possible-values"]:
    evidences_list.append(evidence_code)
    evidences_dict[evidence_code] = evidence_info["question_en"]

# AGE and SEX are features too; they map to themselves so label lookups never miss.
evidences_dict["AGE"] = "AGE"
evidences_dict["SEX"] = "SEX"
feature_columns = ["AGE", "SEX"] + evidences_list

In [24]:
def data_proc(df, evidence_names=None, columns=None):
    """Turn a raw patients frame into a frame of binary evidence flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``EVIDENCES`` (the string form of a Python list of
        evidence codes), ``SEX`` (``'F'`` / ``'M'``), ``PATHOLOGY`` and every
        non-evidence column named in ``columns``. The frame is not modified.
    evidence_names : list of str, optional
        Evidence codes to encode as 0/1 columns. Defaults to the notebook-level
        ``evidences_list``.
    columns : list of str, optional
        Feature columns to keep, in order. Defaults to the notebook-level
        ``feature_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding ``columns`` followed by ``PATHOLOGY``. Each evidence
        column is 1 when the code appears in the row's ``EVIDENCES`` list and 0
        otherwise; ``SEX`` is mapped to 0 for ``'F'`` and 1 for ``'M'`` (any other
        value becomes NaN).
    """
    if evidence_names is None:
        evidence_names = evidences_list
    if columns is None:
        columns = feature_columns

    # Entries containing "@" are dropped; only the bare evidence codes are kept.
    # Sets make each membership test below constant-time.
    reported = df["EVIDENCES"].apply(
        lambda raw: {code for code in ast.literal_eval(raw) if "@" not in code}
    )

    # Build all flag columns first and attach them in a single concat instead of
    # inserting hundreds of columns one by one.
    flags = pd.DataFrame(
        {
            name: reported.apply(lambda codes, name=name: 1 if name in codes else 0)
            for name in evidence_names
        },
        index=df.index,
    )

    # Work on new objects only, so the caller's frame stays untouched and the
    # function can be re-run on the same raw input.
    out = pd.concat([df.drop(columns=list(evidence_names), errors="ignore"), flags], axis=1)
    out["SEX"] = out["SEX"].map({'F': 0, 'M': 1})
    return out[list(columns) + ["PATHOLOGY"]]

In [25]:
# Encode the train split; the name is rebound to the processed frame, which is then displayed.
diagnosis_df_train = diagnosis_df_train.pipe(data_proc)
diagnosis_df_train

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_199,E_121,E_120,E_142,E_195,E_183,E_224,E_223,E_5,PATHOLOGY
0,18,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,URTI
1,21,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,HIV (initial infection)
2,19,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Pneumonia
3,34,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,URTI
4,36,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,URTI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025597,18,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis
1025598,28,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis
1025599,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis
1025600,26,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis


In [26]:
# Encode the test split; the name is rebound to the processed frame, which is then displayed.
diagnosis_df_test = diagnosis_df_test.pipe(data_proc)
diagnosis_df_test

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_199,E_121,E_120,E_142,E_195,E_183,E_224,E_223,E_5,PATHOLOGY
0,49,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,GERD
1,2,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bronchitis
2,49,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Acute dystonic reactions
3,64,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Acute laryngitis
4,70,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,URTI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134524,52,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,SLE
134525,88,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Pulmonary neoplasm
134526,29,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,Boerhaave
134527,8,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Scombroid food poisoning


In [27]:
# For every disease: create its output folder and save a pie chart of the
# positive/negative class proportions for the train and the test split.
for target_disease in disease_list:
    # create path for outputs; the pattern is a raw string because "\." is an
    # invalid escape sequence in a normal string literal
    img_filename = re.sub(r'[^a-zA-Z0-9 \n\.]', '', target_disease).replace(" ", "_")
    disease_path = f'{base_path}\\output\\diseases\\{img_filename}'
    os.makedirs(disease_path, exist_ok=True)

    for split_name, split_df in (("train", diagnosis_df_train), ("test", diagnosis_df_test)):
        # create target variable (written onto the shared frame: 1 = target disease, 0 = any other)
        split_df["is_target_disease"] = np.where(split_df["PATHOLOGY"]==target_disease, 1, 0)

        # get disease proportion
        split_df["is_target_disease"].value_counts().plot.pie(autopct='%.2f%%', label='')
        plt.title(f"{target_disease} - Positive and Negative Proportions\n({split_name.capitalize()} Set)")
        plt.savefig(f'{disease_path}\\{img_filename}_class_perc_{split_name}.jpg', bbox_inches='tight')
        # clear the shared figure so the next chart starts from an empty canvas
        plt.clf()

<Figure size 640x480 with 0 Axes>

## Tree-based models

In [28]:
# Fit every tree-based model as a binary classifier (target disease vs. everything else)
# for each disease in `disease_list`. Per (model, disease) pair this saves the train and
# test confusion matrices, a top-N feature-importance chart and the pickled estimator.
# One metrics JSON is written per model.
seed = 0    # random state of the undersampler, so the balanced sample is reproducible
top_n = 10  # number of features shown in the importance chart and stored in the JSON

# The feature matrices are the same for every disease, so they are sliced only once.
X_train = diagnosis_df_train[feature_columns]
X_test = diagnosis_df_test[feature_columns]

for model_name, clf_model in model_list["tree-based"].items():
  print(f"Training {model_name}...")
  feature_dict = {}
  for target_disease in disease_list:
    # create path for outputs; the pattern is a raw string because "\." is an
    # invalid escape sequence in a normal string literal
    img_filename = re.sub(r'[^a-zA-Z0-9 \n\.]', '', target_disease).replace(" ", "_")
    disease_path = f'{base_path}\\output\\diseases\\{img_filename}\\{model_name}'
    os.makedirs(disease_path, exist_ok=True)

    # create target variable: 1 for the current disease, 0 for every other pathology
    diagnosis_df_train["is_target_disease"] = np.where(diagnosis_df_train["PATHOLOGY"]==target_disease, 1, 0)
    diagnosis_df_test["is_target_disease"] = np.where(diagnosis_df_test["PATHOLOGY"]==target_disease, 1, 0)
    Y_train = diagnosis_df_train["is_target_disease"]
    Y_test = diagnosis_df_test["is_target_disease"]

    # Undersample the train set; the test set is evaluated as-is
    rus = RandomUnderSampler(random_state=seed)
    X_resampled_train, Y_resampled_train = rus.fit_resample(X_train, Y_train)

    # train model (the same estimator object is refit for each disease; the pickle
    # written below captures its state for the current one)
    clf_model.fit(X_resampled_train, Y_resampled_train)

    # evaluate on train set
    y_pred_train = clf_model.predict(X_resampled_train)
    train_acc = metrics.accuracy_score(Y_resampled_train, y_pred_train)
    train_recall = metrics.recall_score(Y_resampled_train, y_pred_train)
    train_f1 = metrics.f1_score(Y_resampled_train, y_pred_train)
    print(f"{target_disease} balanced train set accuracy", train_acc)
    cf_matrix = metrics.confusion_matrix(Y_resampled_train, y_pred_train)
    sns.heatmap(cf_matrix, annot=True, fmt='d')
    plt.xlabel("predicted label")
    plt.ylabel("true label")
    plt.title(f"{target_disease} - Balanced Train Set Confusion Matrix")
    plt.savefig(f'{disease_path}\\{img_filename}_train_cf.jpg', bbox_inches='tight')
    plt.clf()

    # evaluate on test set
    y_pred_test = clf_model.predict(X_test)
    test_acc = metrics.accuracy_score(Y_test, y_pred_test)
    test_recall = metrics.recall_score(Y_test, y_pred_test)
    test_f1 = metrics.f1_score(Y_test, y_pred_test)
    print(f"{target_disease} test set accuracy", test_acc, "\n")
    cf_matrix = metrics.confusion_matrix(Y_test, y_pred_test)
    sns.heatmap(cf_matrix, annot=True, fmt='d')
    plt.xlabel("predicted label")
    plt.ylabel("true label")
    plt.title(f"{target_disease} - Test Set Confusion Matrix")
    plt.savefig(f'{disease_path}\\{img_filename}_test_cf.jpg', bbox_inches='tight')
    plt.clf()

    # feature importance: rank by the estimator's importances and keep the top N
    clf_importance = clf_model.feature_importances_
    clf_importance = pd.DataFrame({'Feature': feature_columns, 'Importance': clf_importance}).sort_values(by="Importance", ascending=False)
    clf_importance["feature_en"] = clf_importance["Feature"].map(evidences_dict)
    clf_importance = clf_importance.head(top_n).reset_index()
    sns.barplot(data=clf_importance, y="feature_en", x="Importance", orient='h', color='steelblue')
    plt.title(f"{target_disease} - Top {top_n} most relevant symptoms\n({model_name})")
    plt.xlabel("Symptom Importance Score")
    # NOTE(review): legend('') looks intended to suppress the legend - confirm
    plt.legend('')
    plt.savefig(f'{disease_path}\\{img_filename}_ftr_importance.jpg', bbox_inches='tight')
    plt.clf()

    # save model
    with open(f'{disease_path}\\{img_filename}_model.pkl','wb') as f:
      pickle.dump(clf_model, f)

    # save metrics; class counts use the vectorized Series sum and are cast to
    # plain ints so they serialize to JSON
    train_positives = int(Y_train.sum())
    test_positives = int(Y_test.sum())
    feature_dict[target_disease] = {
      "train_positive_diagnosis": train_positives,
      "train_negative_diagnosis": len(Y_train) - train_positives,
      "test_positive_diagnosis": test_positives,
      "test_negative_diagnosis": len(Y_test) - test_positives,
      "balanced_train_set_metrics": {
        "accuracy": train_acc,
        "recall": train_recall,
        "f1": train_f1
      },
      "test_set_metrics": {
        "accuracy": test_acc,
        "recall": test_recall,
        "f1": test_f1
      },
      f"top{top_n}_relevant_symptoms": {row["feature_en"]: row["Importance"] for _, row in clf_importance.iterrows()}
    }

  # save results
  with open(f"{base_path}\\output\\diseases\\feature_importance_{model_name}.json", "w") as outfile: 
      json.dump(feature_dict, outfile, indent=True)

Training decision_tree...
Tuberculosis balanced train set accuracy 1.0
Tuberculosis test set accuracy 0.9995093994603395 

GERD balanced train set accuracy 1.0
GERD test set accuracy 0.9993532992886293 

SLE balanced train set accuracy 1.0
SLE test set accuracy 0.998290331452698 

HIV (initial infection) balanced train set accuracy 1.0
HIV (initial infection) test set accuracy 0.99995539995094 

Pulmonary neoplasm balanced train set accuracy 1.0
Pulmonary neoplasm test set accuracy 0.9990931323357789 

Training random_forest...
Tuberculosis balanced train set accuracy 1.0
Tuberculosis test set accuracy 0.99995539995094 

GERD balanced train set accuracy 1.0
GERD test set accuracy 0.9999776999754699 

SLE balanced train set accuracy 1.0
SLE test set accuracy 0.9998364664867798 

HIV (initial infection) balanced train set accuracy 1.0
HIV (initial infection) test set accuracy 1.0 

Pulmonary neoplasm balanced train set accuracy 1.0
Pulmonary neoplasm test set accuracy 0.99998513331698 



<Figure size 640x480 with 0 Axes>

## Logistic Regression

In [29]:
# Fit the logistic regression as a binary classifier (target disease vs. everything else)
# for each disease in `disease_list`. Per disease this saves the train and test confusion
# matrices, a chart of the top positive standardized coefficients, the pickled estimator
# and a JSON with every standardized coefficient. One metrics JSON is written at the end.
model_name = "logistic_regression"
feature_dict = {}
seed = 0    # random state of the undersampler, so the balanced sample is reproducible
top_n = 10  # number of features considered for the importance chart / metrics JSON
clf_model = model_list[model_name]

# The feature matrices and their per-column spread are the same for every disease,
# so they are computed only once.
X_train = diagnosis_df_train[feature_columns]
X_test = diagnosis_df_test[feature_columns]
X_train_std = X_train.std().to_numpy()

for target_disease in disease_list:
  # create path for outputs; the pattern is a raw string because "\." is an
  # invalid escape sequence in a normal string literal
  img_filename = re.sub(r'[^a-zA-Z0-9 \n\.]', '', target_disease).replace(" ", "_")
  disease_path = f'{base_path}\\output\\diseases\\{img_filename}\\{model_name}'
  os.makedirs(disease_path, exist_ok=True)
  
  # create target variable: 1 for the current disease, 0 for every other pathology
  diagnosis_df_train["is_target_disease"] = np.where(diagnosis_df_train["PATHOLOGY"]==target_disease, 1, 0)
  diagnosis_df_test["is_target_disease"] = np.where(diagnosis_df_test["PATHOLOGY"]==target_disease, 1, 0)
  Y_train = diagnosis_df_train["is_target_disease"]
  Y_test = diagnosis_df_test["is_target_disease"]

  # Scale factor std(x) / std(y) for each feature, computed on the full (not resampled)
  # train set with the same estimator for both spreads.
  Y_train_std = Y_train.std()
  std_ratio = X_train_std / Y_train_std

  # Undersample the train set; the test set is evaluated as-is
  rus = RandomUnderSampler(random_state=seed)
  X_resampled_train, Y_resampled_train = rus.fit_resample(X_train, Y_train)

  # train model (the same estimator object is refit for each disease; the pickle
  # written below captures its state for the current one)
  clf_model.fit(X_resampled_train, Y_resampled_train)

  # evaluate on train set
  y_pred_train = clf_model.predict(X_resampled_train)
  train_acc = metrics.accuracy_score(Y_resampled_train, y_pred_train)
  train_recall = metrics.recall_score(Y_resampled_train, y_pred_train)
  train_f1 = metrics.f1_score(Y_resampled_train, y_pred_train)
  print(f"{target_disease} balanced train set accuracy", train_acc)
  cf_matrix = metrics.confusion_matrix(Y_resampled_train, y_pred_train)
  sns.heatmap(cf_matrix, annot=True, fmt='d')
  plt.xlabel("predicted label")
  plt.ylabel("true label")
  plt.title(f"{target_disease} - Balanced Train Set Confusion Matrix")
  plt.savefig(f'{disease_path}\\{img_filename}_train.jpg', bbox_inches='tight')
  plt.clf()

  # evaluate on test set
  y_pred_test = clf_model.predict(X_test)
  test_acc = metrics.accuracy_score(Y_test, y_pred_test)
  test_recall = metrics.recall_score(Y_test, y_pred_test)
  test_f1 = metrics.f1_score(Y_test, y_pred_test)
  print(f"{target_disease} test set accuracy", test_acc, "\n")
  cf_matrix = metrics.confusion_matrix(Y_test, y_pred_test)
  sns.heatmap(cf_matrix, annot=True, fmt='d')
  plt.xlabel("predicted label")
  plt.ylabel("true label")
  plt.title(f"{target_disease} - Test Set Confusion Matrix")
  plt.savefig(f'{disease_path}\\{img_filename}_test.jpg', bbox_inches='tight')
  plt.clf()

  # feature importance
  # standardized coeff = raw coeff * std(x) / std(y). Multiplying (rather than dividing)
  # by the ratio is the standard definition and leaves constant features at 0 instead
  # of producing inf/NaN.
  clf_importance = clf_model.coef_[0] * std_ratio
  clf_importance = pd.DataFrame({'Feature': feature_columns, 'Importance': clf_importance}).sort_values(by="Importance", ascending=False)
  clf_importance["feature_en"] = clf_importance["Feature"].map(evidences_dict)
  clf_importance_all = clf_importance.copy().reset_index()
  clf_importance = clf_importance.head(top_n).reset_index()
  clf_importance = clf_importance[clf_importance["Importance"]>0]  # get only positive indicators of disease
  sns.barplot(data=clf_importance, y="feature_en", x="Importance", orient='h', color='steelblue')
  plt.title(f"{target_disease} - Top {top_n} most relevant symptoms\n({model_name})")
  plt.xlabel("Symptom Importance Score")
  # NOTE(review): legend('') looks intended to suppress the legend - confirm
  plt.legend('')
  plt.savefig(f'{disease_path}\\{img_filename}_ftr_importance.jpg', bbox_inches='tight')
  plt.clf()

  # save model
  with open(f'{disease_path}\\{img_filename}_model.pkl','wb') as f:
    pickle.dump(clf_model, f)
  
  # save metrics; class counts use the vectorized Series sum and are cast to
  # plain ints so they serialize to JSON
  train_positives = int(Y_train.sum())
  test_positives = int(Y_test.sum())
  feature_dict[target_disease] = {
    "train_positive_diagnosis": train_positives,
    "train_negative_diagnosis": len(Y_train) - train_positives,
    "test_positive_diagnosis": test_positives,
    "test_negative_diagnosis": len(Y_test) - test_positives,
    "balanced_train_set_metrics": {
      "accuracy": train_acc,
      "recall": train_recall,
      "f1": train_f1
    },
    "test_set_metrics": {
      "accuracy": test_acc,
      "recall": test_recall,
      "f1": test_f1
    },
    f"top{top_n}_relevant_symptoms": {row["feature_en"]: row["Importance"] for _, row in clf_importance.iterrows()}
  }

  # save standardized coeffs (all features, not only the charted ones)
  standardized_coeffs = {row["feature_en"]: row["Importance"] for _, row in clf_importance_all.iterrows()}
  with open(f"{disease_path}\\feature_importance.json", "w") as outfile: 
      json.dump(standardized_coeffs, outfile, indent=True)

# save results
with open(f"{base_path}\\output\\diseases\\feature_importance_{model_name}.json", "w") as outfile: 
    json.dump(feature_dict, outfile, indent=True)

Tuberculosis balanced train set accuracy 0.9999692212988612
Tuberculosis test set accuracy 0.9999405332679199 

GERD balanced train set accuracy 0.9998845221140151
GERD test set accuracy 0.99999256665849 

SLE balanced train set accuracy 0.9997050644644814
SLE test set accuracy 0.9997175330226197 

HIV (initial infection) balanced train set accuracy 0.9999827663461207
HIV (initial infection) test set accuracy 0.99995539995094 

Pulmonary neoplasm balanced train set accuracy 0.9999308293560213
Pulmonary neoplasm test set accuracy 0.9998959332188598 



<Figure size 640x480 with 0 Axes>