In [1]:
import ast
import json
import os
import pickle
import warnings

import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors

from constants import max_n_questions, base_path
warnings.filterwarnings("ignore")



In [2]:
# Load the raw training patients table. The path is assembled with
# os.path.join so the separator matches the host OS instead of being
# hard-coded to the Windows backslash.
diagnosis_df_train = pd.read_csv(os.path.join(base_path, "input", "release_train_patients"))

In [3]:
# Read the evidence catalogue. JSON is decoded explicitly as UTF-8 so the
# result does not depend on the platform's locale encoding, and the path is
# built with os.path.join so it is not tied to Windows separators.
with open(os.path.join(base_path, "input", "release_evidences.json"), encoding="utf-8") as f:
    evidences = json.load(f)

# evidences_list: ids of the evidences kept as model features (in file order).
# evidences_dict: evidence id -> English question text.
evidences_list = []
evidences_dict = {}
for e, spec in evidences.items():
    # only binary symptoms and no antecedents
    if (not spec["possible-values"]) and (not spec["is_antecedent"]):
        evidences_list.append(e)
        evidences_dict[e] = spec["question_en"]

# AGE and SEX map to themselves so they can be looked up like any evidence id.
evidences_dict["AGE"] = "AGE"
evidences_dict["SEX"] = "SEX"
feature_columns = ["AGE", "SEX"] + evidences_list

In [4]:
def data_proc(df, evidence_ids=None, columns=None):
    """Encode a raw patients table into numeric features plus the label.

    Unlike a naive in-place implementation, this function never modifies
    ``df``; calling it repeatedly on the same frame yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``EVIDENCES`` (string holding a Python list
        literal of evidence codes), ``SEX`` (``'F'`` / ``'M'``), ``PATHOLOGY``
        and every non-evidence column named in ``columns``.
    evidence_ids : sequence of str, optional
        Evidence codes to turn into 0/1 indicator columns. Defaults to the
        notebook-level ``evidences_list``.
    columns : sequence of str, optional
        Feature columns of the result, in order. Defaults to the
        notebook-level ``feature_columns``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``columns`` followed by ``PATHOLOGY``. ``SEX`` is
        mapped to 0 for ``'F'`` and 1 for ``'M'`` (any other value becomes
        NaN); each evidence column is 1 when the code is listed for the row
        and 0 otherwise.
    """
    if evidence_ids is None:
        evidence_ids = evidences_list
    if columns is None:
        columns = feature_columns

    # Parse each record once into a set so the per-evidence membership checks
    # below are constant time. Entries containing "@" are skipped, so only
    # bare evidence codes remain.
    reported = df["EVIDENCES"].apply(
        lambda raw: {item for item in ast.literal_eval(raw) if "@" not in item}
    )

    # Build all indicator columns in one frame instead of inserting them one
    # by one into the caller's DataFrame.
    indicators = pd.DataFrame(
        {eid: reported.apply(lambda seen, eid=eid: int(eid in seen)) for eid in evidence_ids},
        index=df.index,
    )

    wanted = list(columns) + ["PATHOLOGY"]
    # Columns taken straight from the input (everything that is not an indicator).
    passthrough = [name for name in wanted if name not in indicators.columns]
    # df[passthrough] is a new frame, so the SEX recoding does not touch `df`.
    base = df[passthrough].assign(SEX=df["SEX"].map({'F': 0, 'M': 1}))

    return pd.concat([base, indicators], axis=1)[wanted]

In [5]:
# Swap the raw patient table for its encoded form (features + PATHOLOGY).
# The name is rebound, so the raw table is no longer reachable afterwards.
diagnosis_df_train = diagnosis_df_train.pipe(data_proc)
diagnosis_df_train

Unnamed: 0,AGE,SEX,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,PATHOLOGY
0,18,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,URTI
1,21,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,HIV (initial infection)
2,19,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Pneumonia
3,34,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,URTI
4,36,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,URTI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1025597,18,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis
1025598,28,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis
1025599,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis
1025600,26,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Epiglottitis


In [6]:
# Read the condition catalogue. Decoded explicitly as UTF-8 (not the locale
# default) and located with os.path.join so the path works on any OS.
with open(os.path.join(base_path, "input", "release_conditions.json"), encoding="utf-8") as f:
    disease_dict = json.load(f)
# Condition names, in the order they appear in the file.
disease_list = list(disease_dict.keys())
# disease_list

In [7]:
def get_next_question(evidences, nbrs, feature_importance_df):
    """Suggest the next question to ask, given the ones already collected.

    The rows of ``feature_importance_df`` selected by ``evidences`` are
    averaged into a centroid, ``nbrs`` is queried for the rows closest to
    that centroid, and the label of the first neighbour that is not already
    in ``evidences`` is returned.

    Parameters
    ----------
    evidences : sequence of str
        Row labels of ``feature_importance_df`` collected so far.
    nbrs : object with ``kneighbors(X) -> (distances, indices)``
        Neighbour index. ``indices`` are interpreted as row positions of
        ``feature_importance_df``, so the index is expected to have been
        fitted on that same frame in the same row order.
    feature_importance_df : pandas.DataFrame
        One row per question; the row label is the question text.

    Returns
    -------
    str
        The label of the nearest row not yet in ``evidences``. An empty
        string is returned when ``evidences`` is empty (no centroid can be
        computed) or when every returned neighbour is already in
        ``evidences``.

    Raises
    ------
    KeyError
        If an entry of ``evidences`` is not a row label of
        ``feature_importance_df``.
    """
    # Averaging zero rows would yield NaN and make kneighbors fail.
    if len(evidences) == 0:
        return ""

    centroid = np.array([feature_importance_df.loc[e].values for e in evidences]).mean(axis=0)
    _, indices = nbrs.kneighbors([centroid])

    # Translate neighbour positions through the frame that was passed in,
    # rather than through notebook-level lookup tables.
    row_labels = feature_importance_df.index
    already_asked = set(evidences)
    for position in indices[0]:
        candidate = row_labels[position]
        if candidate not in already_asked:
            return candidate
    return ""

In [8]:
# Number of training patients per pathology, as a one-column frame
# ("counts") indexed by PATHOLOGY.
diagnosis_df_train_count = (
    diagnosis_df_train['PATHOLOGY']
    .value_counts()
    .rename('counts')
    .rename_axis('PATHOLOGY')
    .to_frame()
)
diagnosis_df_train_count

Unnamed: 0_level_0,counts
PATHOLOGY,Unnamed: 1_level_1
URTI,64368
Viral pharyngitis,61642
Anemia,50665
HIV (initial infection),29013
Localized edema,27825
Anaphylaxis,27718
Pulmonary embolism,27468
Influenza,26812
Bronchitis,26400
Allergic sinusitis,26203


In [9]:
# For every pathology, count how many patients reported each binary evidence
# (the indicator columns are 0/1, so a sum is a count).
diagnosis_df_train = diagnosis_df_train.groupby('PATHOLOGY')[evidences_list].sum()
diagnosis_df_train

Unnamed: 0_level_0,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,E_51,E_75,...,E_164,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23
PATHOLOGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acute COPD exacerbation / infection,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acute dystonic reactions,0,0,0,0,0,0,0,0,0,0,...,0,17292,17178,15439,0,0,0,0,0,0
Acute laryngitis,13252,24114,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acute otitis media,14634,25868,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acute pulmonary edema,0,19018,0,0,0,0,0,0,0,0,...,0,0,0,0,14099,0,0,0,0,0
Acute rhinosinusitis,4813,13578,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11044,10079,0
Allergic sinusitis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Anaphylaxis,0,27704,16639,27718,0,0,0,0,16772,0,...,0,0,0,0,0,0,0,0,0,0
Anemia,0,50175,0,0,30445,0,0,30320,0,0,...,0,0,0,0,0,0,0,0,0,0
Atrial fibrillation,0,0,0,0,0,15394,0,0,0,0,...,16051,0,0,0,0,0,0,0,0,0


In [10]:
# Attach the per-pathology patient totals as a "counts" column; both frames
# are indexed by PATHOLOGY, so the join aligns on the index.
diagnosis_df_train = diagnosis_df_train.join(other=diagnosis_df_train_count, how="left")
diagnosis_df_train

Unnamed: 0_level_0,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,E_51,E_75,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,counts
PATHOLOGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acute COPD exacerbation / infection,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,17661
Acute dystonic reactions,0,0,0,0,0,0,0,0,0,0,...,17292,17178,15439,0,0,0,0,0,0,25982
Acute laryngitis,13252,24114,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24129
Acute otitis media,14634,25868,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25917
Acute pulmonary edema,0,19018,0,0,0,0,0,0,0,0,...,0,0,0,14099,0,0,0,0,0,19018
Acute rhinosinusitis,4813,13578,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,11044,10079,0,13578
Allergic sinusitis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,26203
Anaphylaxis,0,27704,16639,27718,0,0,0,0,16772,0,...,0,0,0,0,0,0,0,0,0,27718
Anemia,0,50175,0,0,30445,0,0,30320,0,0,...,0,0,0,0,0,0,0,0,0,50665
Atrial fibrillation,0,0,0,0,0,15394,0,0,0,0,...,0,0,0,0,0,0,0,0,0,21036


In [11]:
# Turn the per-pathology evidence counts into frequencies by dividing every
# evidence column by that pathology's patient total (row-wise alignment).
diagnosis_df_train[evidences_list] = diagnosis_df_train[evidences_list].div(
    diagnosis_df_train["counts"], axis=0
)
diagnosis_df_train

Unnamed: 0_level_0,E_91,E_53,E_159,E_129,E_154,E_155,E_210,E_140,E_51,E_75,...,E_193,E_168,E_180,E_67,E_171,E_111,E_182,E_103,E_23,counts
PATHOLOGY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acute COPD exacerbation / infection,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17661
Acute dystonic reactions,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.665538,0.66115,0.594219,0.0,0.0,0.0,0.0,0.0,0.0,25982
Acute laryngitis,0.549215,0.999378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24129
Acute otitis media,0.564649,0.998109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25917
Acute pulmonary edema,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.74135,0.0,0.0,0.0,0.0,0.0,19018
Acute rhinosinusitis,0.35447,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.813375,0.742304,0.0,13578
Allergic sinusitis,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26203
Anaphylaxis,0.0,0.999495,0.600296,1.0,0.0,0.0,0.0,0.0,0.605094,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27718
Anemia,0.0,0.990329,0.0,0.0,0.600908,0.0,0.0,0.598441,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50665
Atrial fibrillation,0.0,0.0,0.0,0.0,0.0,0.731793,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21036


In [12]:
# Drop the helper totals and flip the table so that each row is an evidence
# and each column a pathology; label the new row axis "evidence".
diagnosis_df_train = (
    diagnosis_df_train
    .drop(columns='counts')
    .T
    .rename_axis(index='evidence')
)
diagnosis_df_train

PATHOLOGY,Acute COPD exacerbation / infection,Acute dystonic reactions,Acute laryngitis,Acute otitis media,Acute pulmonary edema,Acute rhinosinusitis,Allergic sinusitis,Anaphylaxis,Anemia,Atrial fibrillation,...,Sarcoidosis,Scombroid food poisoning,Spontaneous pneumothorax,Spontaneous rib fracture,Stable angina,Tuberculosis,URTI,Unstable angina,Viral pharyngitis,Whooping cough
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E_91,0.0,0.0,0.549215,0.564649,0.0,0.354470,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.644198,0.699664,0.0,0.493397,0.0
E_53,0.0,0.0,0.999378,0.998109,1.0,1.000000,0.0,0.999495,0.990329,0.0,...,1.0,0.0,1.0,1.0,1.0,0.000000,0.996163,1.0,1.000000,0.0
E_159,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.600296,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
E_129,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,1.000000,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
E_154,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.600908,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
E_171,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
E_111,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
E_182,0.0,0.0,0.000000,0.000000,0.0,0.813375,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
E_103,0.0,0.0,0.000000,0.000000,0.0,0.742304,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0


In [13]:
# Replace the evidence ids on the row axis with their English question text.
question_labels = diagnosis_df_train.index.map(evidences_dict)
diagnosis_df_train.index = question_labels
diagnosis_df_train

PATHOLOGY,Acute COPD exacerbation / infection,Acute dystonic reactions,Acute laryngitis,Acute otitis media,Acute pulmonary edema,Acute rhinosinusitis,Allergic sinusitis,Anaphylaxis,Anemia,Atrial fibrillation,...,Sarcoidosis,Scombroid food poisoning,Spontaneous pneumothorax,Spontaneous rib fracture,Stable angina,Tuberculosis,URTI,Unstable angina,Viral pharyngitis,Whooping cough
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Do you have a fever (either felt or measured with a thermometer)?,0.0,0.0,0.549215,0.564649,0.0,0.354470,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.644198,0.699664,0.0,0.493397,0.0
"Do you have pain somewhere, related to your reason for consulting?",0.0,0.0,0.999378,0.998109,1.0,1.000000,0.0,0.999495,0.990329,0.0,...,1.0,0.0,1.0,1.0,1.0,0.000000,0.996163,1.0,1.000000,0.0
Did you lose consciousness?,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.600296,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,1.000000,0.000000,0.0,...,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
Is your skin much paler than usual?,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.600908,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Do you feel like you are detached from your own body or your surroundings?,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
Do you feel like you are dying or were you afraid that you were about do die?,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
Do you have greenish or yellowish nasal discharge?,0.0,0.0,0.000000,0.000000,0.0,0.813375,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
Have you lost your sense of smell?,0.0,0.0,0.000000,0.000000,0.0,0.742304,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0


In [14]:
# Fit a cosine-distance neighbour index over the evidence rows; each query
# returns up to max_n_questions neighbours.
nbrs = NearestNeighbors(n_neighbors=max_n_questions, metric='cosine').fit(diagnosis_df_train)
# save model
# The path is built with os.path.join (no hard-coded separators) and the
# target directory is created if it is missing, so a fresh checkout can run
# this cell.
questionnaire_dir = os.path.join(base_path, "output", "questionnaire")
os.makedirs(questionnaire_dir, exist_ok=True)
with open(os.path.join(questionnaire_dir, "questionnaire.pkl"), 'wb') as f:
    pickle.dump(nbrs, f)

In [15]:
# save embeddings
# Same portable path handling as for the model file: os.path.join for the
# separators, and the directory is created when it does not exist yet.
questionnaire_dir = os.path.join(base_path, "output", "questionnaire")
os.makedirs(questionnaire_dir, exist_ok=True)
with open(os.path.join(questionnaire_dir, "questionnaire_embeddings.pkl"), 'wb') as f:
    pickle.dump(diagnosis_df_train, f)

In [16]:
# URTI symptoms
# Seed questions for the demo. Note that this rebinds `evidences`, which
# earlier held the parsed evidence catalogue, to a list of question strings.
evidences = [
    "Do you have a sore throat?",
    "Do you have a cough?",
    "Do you have a fever (either felt or measured with a thermometer)?",
]

# Anemia symptoms
# evidences = [
#     "Is your skin much paler than usual?",
#     "Have you recently had stools that were black (like coal)?",
#     "Do you constantly feel fatigued or do you have non-restful sleep?"
# ]

# URTI and Anemia - just to test mixed initial symptoms
# evidences = [
#     "Do you have a sore throat?",
#     "Do you have a cough?",
#     "Do you have a fever (either felt or measured with a thermometer)?",
#     "Is your skin much paler than usual?",
#     "Have you recently had stools that were black (like coal)?",
#     "Do you constantly feel fatigued or do you have non-restful sleep?"
# ]

# Remember how many questions were given up front; the follow-up loop uses
# this as its starting offset.
initial_evidence_count = len(evidences)

# gets the next immediate evidence
get_next_question(
    evidences=evidences,
    nbrs=nbrs,
    feature_importance_df=diagnosis_df_train,
)

'Do you have nasal congestion or a clear runny nose?'

In [17]:
# sequentially, gets next immediate evidence, assuming you answered yes to all the previous questions
# Keep asking until max_n_questions is reached, treating every suggested
# question as answered "yes"; stop early once no new question is suggested.
for step in range(initial_evidence_count, max_n_questions):
    next_question = get_next_question(evidences, nbrs, diagnosis_df_train)
    if not next_question:
        break
    print(next_question)
    evidences.append(next_question)

Do you have nasal congestion or a clear runny nose?
Do you have diffuse (widespread) muscle pain?
Do you have pain somewhere, related to your reason for consulting?
Are you experiencing shortness of breath or difficulty breathing in a significant way?
Have you had significantly increased sweating?
Have you recently had a loss of appetite or do you get full more quickly then usually?
Have you had chills or shivers?
Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?
Do you feel so tired that you are unable to do your usual activities or are you stuck in your bed all day long?
Are you feeling nauseous or do you feel like vomiting?
Have you had diarrhea or an increase in stool frequency?
Do you constantly feel fatigued or do you have non-restful sleep?
Have you had an involuntary weight loss over the last 3 months?
Do you have a cough that produces colored or more abundant sputum than usual?
Have you been coughing