In [1]:
import json
import os
import pickle
import warnings

import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

from constants import base_path, max_n_questions
# Blanket filter: hides every warning category for the rest of the kernel session.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the sentence-embedding model from the project's local "input" folder.
# The path is assembled with os.path.join so it does not depend on Windows separators.
model_loaded = SentenceTransformer(
    os.path.join(base_path, "input", "BioBERT-mnli-snli-scinli-scitail-mednli-stsb")
)

In [3]:
# Load the evidence catalogue. The file is read explicitly as UTF-8 instead of
# relying on the platform's default text encoding.
evidences_path = os.path.join(base_path, "input", "release_evidences.json")
with open(evidences_path, encoding="utf-8") as f:
    evidences = json.load(f)

# Keep only binary symptoms: entries with no "possible-values" that are not antecedents.
# Maps evidence code -> English question text.
evidences_dict = {
    code: info["question_en"]
    for code, info in evidences.items()
    if not info["possible-values"] and not info["is_antecedent"]
}

# Parallel lists in the same order as evidences_dict: codes and their questions.
evidences_list_code = list(evidences_dict)
evidences_list = list(evidences_dict.values())
evidences_list

['Do you have a fever (either felt or measured with a thermometer)?',
 'Do you have pain somewhere, related to your reason for consulting?',
 'Did you lose consciousness?',
 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?',
 'Is your skin much paler than usual?',
 'Do you feel your heart is beating fast (racing), irregularly (missing a beat) or do you feel palpitations?',
 'Have you recently thrown up blood or something resembling coffee beans?',
 'Have you recently had stools that were black (like coal)?',
 'Have you had diarrhea or an increase in stool frequency?',
 'Do you feel like you are (or were) choking or suffocating?',
 'Do you constantly feel fatigued or do you have non-restful sleep?',
 'Are you more irritable or has your mood been very unstable recently?',
 'Do you feel lightheaded and dizzy or do you feel like you are about to faint?',
 'Are you feeling nauseous or do you feel like vomiting?'

In [4]:
# Embed all question texts in a single call; row i is later looked up as evidences_list[i].
embeddings = model_loaded.encode(evidences_list)
print(f"{embeddings.shape}")

(96, 768)


In [5]:
# One row per question and one "embedding_<i>" column per vector component,
# indexed by the question text. Built in a single constructor call instead of a
# per-column dict followed by an in-place set_index; copy=True keeps the frame
# independent of the `embeddings` array.
embeddings_df = pd.DataFrame(
    embeddings,
    index=pd.Index(evidences_list, name="evidence"),
    columns=[f"embedding_{i}" for i in range(embeddings.shape[1])],
    copy=True,
)
embeddings_df.head()

Unnamed: 0_level_0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_758,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767
evidence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Do you have a fever (either felt or measured with a thermometer)?,-0.186785,-0.254875,0.502846,0.712425,-1.102803,0.712057,0.485973,-1.265716,-0.518662,0.203821,...,0.299799,0.142753,0.022093,-0.721777,0.775123,0.539433,-0.980062,-0.945469,-0.171498,-0.308409
"Do you have pain somewhere, related to your reason for consulting?",0.37568,-0.688472,0.478563,0.068375,-0.615628,0.249935,0.442019,-0.695014,-0.052756,-0.468893,...,0.062913,0.224722,0.076708,0.217093,-0.620884,0.493132,0.071175,-0.177799,0.105786,-0.22716
Did you lose consciousness?,-0.144193,-0.120703,0.268528,-0.113711,0.066368,0.494026,0.399974,0.095985,-0.083846,-0.32208,...,0.443179,-0.343062,0.933766,0.416377,-0.162857,0.203479,-0.698537,0.649079,-0.004801,-0.520834
"Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?",0.173169,-0.55464,0.068565,0.259645,-0.15509,0.549358,-0.021356,-1.059719,-0.24826,-0.461824,...,-0.634844,-0.274938,-0.059905,0.451829,-0.034896,-0.133479,-0.572933,-0.076498,-0.083948,-0.333245
Is your skin much paler than usual?,0.127169,0.086014,-0.547387,0.224529,-0.049569,-0.074101,-0.000539,-0.337306,-0.502584,-0.140509,...,-0.200615,-0.453436,-0.580144,0.713091,-0.68222,0.003408,-0.827188,-0.286412,0.789284,-0.725382


In [6]:
# Fit on the raw array rather than the DataFrame: a model fitted on a DataFrame
# records its column names and then warns whenever it is queried with a plain
# ndarray, which is how this notebook queries it.
nbrs = NearestNeighbors(n_neighbors=max_n_questions, metric='cosine').fit(embeddings_df.to_numpy())

# save model
model_path = os.path.join(base_path, "output", "semantic_search", "semantic_search.pkl")
# Create the target folder on a fresh checkout instead of failing inside open().
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, 'wb') as f:
    pickle.dump(nbrs, f)

In [7]:
# test
input_test = "I'm always tired."
input_embeddings = model_loaded.encode([input_test])

In [8]:
# Sanity check: expect one row, with the same width as the question embeddings.
print(f"{input_embeddings.shape}")

(1, 768)


In [9]:
# Questions already put to the user. Empty here because this cell only smoke-tests
# the index. The previous filter tested the question text against `evidences`,
# whose keys are evidence codes, so it could never exclude anything.
already_asked = set()

# Positions of the nearest question embeddings for the single query row.
_, indices = nbrs.kneighbors(input_embeddings)
ask_list = [evidences_list[i] for i in indices[0] if evidences_list[i] not in already_asked]
# here, ask list assumes that all answers to previous questions are yes
# this is for testing purposes only. handling of "no" answers is in app.py
ask_list

['Do you feel so tired that you are unable to do your usual activities or are you stuck in your bed all day long?',
 'Do you constantly feel fatigued or do you have non-restful sleep?',
 'Do your symptoms of muscle weakness increase with fatigue and/or stress?',
 'Are your symptoms more prominent at night?',
 'Do you ever temporarily stop breathing while you’re asleep?',
 'Do you have bouts of choking or shortness of breath that wake you up at night?',
 'Do you have symptoms that are increased with physical exertion but alleviated with rest?',
 'Do you have pain that is increased when you breathe in deeply?',
 'Do you have chest pain even at rest?',
 'Do you have symptoms that get worse after eating?',
 'Do you have a burning sensation that starts in your stomach then goes up into your throat, and can be associated with a bitter taste in your mouth?',
 'Do you have annoying muscle spasms in your face, neck or any other part of your body?',
 'Have you noticed any new fatigue, generalize