### Part 1:
### Prepare input data for the diagnosis portion

In [1]:
import pandas as pd
import spacy
import random
import re

In [2]:
# Load the 'en_core_web_md' spaCy model once; `nlp` is later passed to
# get_sentence_vectors to turn symptom text into embedding vectors.
nlp = spacy.load('en_core_web_md')

In [3]:
def get_sentence_vectors(text, nlp):
    """Return the embedding of ``text`` as a plain Python list.

    ``text`` is passed through the ``nlp`` callable, and the ``.vector``
    attribute of the resulting object is converted with ``.tolist()``.
    """
    doc = nlp(text)
    return doc.vector.tolist()



Import and merge data

In [5]:
# Load the three raw tables: the illness lookup (did / diagnose), the symptom
# lookup (syd / symptom) and the table that links symptoms to illnesses.
# NOTE(review): paths are relative to the notebook's working directory — confirm
# the folder name spelling matches the folder on disk.
illness_df = pd.read_csv('diaganosis symtom/dia_t.csv')
symptom_df = pd.read_csv('diaganosis symtom/sym_t.csv')
links_df = pd.read_csv('diaganosis symtom/diffsydiw.csv')

In [6]:
# Preview the first rows of the illness lookup table.
illness_df.head()

Unnamed: 0,did,diagnose
0,1,Abdominal aortic aneurysm(enlarged major bloo...
1,2,Abdominal swelling
2,3,Abdominal trauma
3,4,Abrasions (scrapes)
4,5,ACE inhibitor induced coughblood pressure med...


In [7]:
# Preview the first rows of the symptom lookup table.
symptom_df.head()

Unnamed: 0,syd,symptom
0,1,Upper abdominal pain
1,2,Lower abdominal pain
2,3,Abscess (Collection of pus)
3,4,Alcohol abuse
4,5,Anxiety (Nervousness)


In [8]:
# Attach the illness text to every link row (joined on "did"), then attach the
# symptom text (joined on "syd"). Both joins use pandas' default join type.
links_with_illness = pd.merge(links_df, illness_df, on="did")
source_data = pd.merge(links_with_illness, symptom_df, on="syd")
source_data

Unnamed: 0,syd,did,wei,diagnose,symptom
0,1.0,163.0,2.0,Cholecystitisinflammation of the gallbladder,Upper abdominal pain
1,1.0,164.0,2.0,Choledocholithiasisstone in bile duct,Upper abdominal pain
2,1.0,165.0,1.0,Cholelithiasisgallstones,Upper abdominal pain
3,1.0,187.0,2.0,Constipation,Upper abdominal pain
4,1.0,306.0,2.0,Gastric ulcerstomach ulcer,Upper abdominal pain
...,...,...,...,...,...
5563,106.0,827.0,0.0,Vulvar squamous cell carcinomaskin cancer on ...,Vaginal bleeding after menopause
5564,186.0,966.0,2.0,Hair lossalopecia,Hair loss (Baldness)
5565,186.0,1415.0,0.0,Hypoparathyroidism,Hair loss (Baldness)
5566,186.0,1415.0,0.0,Hypoparathyroidism,Hair loss (Baldness)


In [9]:
# NOTE(review): this repeats the display at the end of the previous cell;
# consider removing this cell.
source_data

Unnamed: 0,syd,did,wei,diagnose,symptom
0,1.0,163.0,2.0,Cholecystitisinflammation of the gallbladder,Upper abdominal pain
1,1.0,164.0,2.0,Choledocholithiasisstone in bile duct,Upper abdominal pain
2,1.0,165.0,1.0,Cholelithiasisgallstones,Upper abdominal pain
3,1.0,187.0,2.0,Constipation,Upper abdominal pain
4,1.0,306.0,2.0,Gastric ulcerstomach ulcer,Upper abdominal pain
...,...,...,...,...,...
5563,106.0,827.0,0.0,Vulvar squamous cell carcinomaskin cancer on ...,Vaginal bleeding after menopause
5564,186.0,966.0,2.0,Hair lossalopecia,Hair loss (Baldness)
5565,186.0,1415.0,0.0,Hypoparathyroidism,Hair loss (Baldness)
5566,186.0,1415.0,0.0,Hypoparathyroidism,Hair loss (Baldness)


In [10]:
# Keep only rows where both the symptom and the diagnosis text are present,
# retain the four columns used downstream, and give them descriptive names.
has_symptom = source_data['symptom'].notna()
has_diagnosis = source_data['diagnose'].notna()

source_data = source_data.loc[has_symptom & has_diagnosis, ['did', 'syd', 'diagnose', 'symptom']]

source_data.columns = ['illness_id', 'symptom_id', 'illness', 'symptom']

In [11]:
# Inspect the filtered and renamed illness/symptom pairs.
source_data

Unnamed: 0,illness_id,symptom_id,illness,symptom
0,163.0,1.0,Cholecystitisinflammation of the gallbladder,Upper abdominal pain
1,164.0,1.0,Choledocholithiasisstone in bile duct,Upper abdominal pain
2,165.0,1.0,Cholelithiasisgallstones,Upper abdominal pain
3,187.0,1.0,Constipation,Upper abdominal pain
4,306.0,1.0,Gastric ulcerstomach ulcer,Upper abdominal pain
...,...,...,...,...
5563,827.0,106.0,Vulvar squamous cell carcinomaskin cancer on ...,Vaginal bleeding after menopause
5564,966.0,186.0,Hair lossalopecia,Hair loss (Baldness)
5565,1415.0,186.0,Hypoparathyroidism,Hair loss (Baldness)
5566,1415.0,186.0,Hypoparathyroidism,Hair loss (Baldness)


In [12]:
# Tidy up messy characters: swap every '\x0b' in the two text columns for a space
for text_column in ('illness', 'symptom'):
    source_data[text_column] = source_data[text_column].str.replace('\x0b', ' ')

In [13]:
# Inspect the result of the character clean-up.
source_data

Unnamed: 0,illness_id,symptom_id,illness,symptom
0,163.0,1.0,Cholecystitis inflammation of the gallbladder,Upper abdominal pain
1,164.0,1.0,Choledocholithiasis stone in bile duct,Upper abdominal pain
2,165.0,1.0,Cholelithiasis gallstones,Upper abdominal pain
3,187.0,1.0,Constipation,Upper abdominal pain
4,306.0,1.0,Gastric ulcer stomach ulcer,Upper abdominal pain
...,...,...,...,...
5563,827.0,106.0,Vulvar squamous cell carcinoma skin cancer on ...,Vaginal bleeding after menopause
5564,966.0,186.0,Hair loss alopecia,Hair loss (Baldness)
5565,1415.0,186.0,Hypoparathyroidism,Hair loss (Baldness)
5566,1415.0,186.0,Hypoparathyroidism,Hair loss (Baldness)


Use our function to convert each symptom to a vector representation. We can then save our symptom data to the `input_data` folder.

In [14]:
# Drop symptoms with no text. `.copy()` makes the result an independent frame
# so the column assignments below do not raise pandas' SettingWithCopyWarning.
symptom_df = symptom_df.loc[symptom_df['symptom'].notna()].copy()

# Clean the stray '\x0b' characters *before* embedding, so the stored vectors
# are computed from the same text that is saved alongside them.
symptom_df['symptom'] = symptom_df['symptom'].str.replace('\x0b', ' ', regex=False)

# One embedding (a plain list of floats) per symptom description.
symptom_df['symptom_vector'] = symptom_df['symptom'].apply(
    lambda symptom_text: get_sentence_vectors(symptom_text, nlp)
)

# Rename by name rather than by position: the resulting columns are still
# ['symptom_id', 'symptom', 'symptom_vector'], but re-running this cell no
# longer fails with a column-count mismatch.
symptom_df = symptom_df.rename(columns={'syd': 'symptom_id'})

In [15]:
# Inspect the symptom table together with its embedding column.
symptom_df

Unnamed: 0,symptom_id,symptom,symptom_vector
0,1,Upper abdominal pain,"[-0.3931533396244049, -1.122189998626709, -2.3..."
1,2,Lower abdominal pain,"[0.11224666982889175, -1.3984565734863281, -2...."
2,3,Abscess (Collection of pus),"[-4.394866943359375, -5.325353145599365, 3.294..."
3,4,Alcohol abuse,"[-1.0100150108337402, -3.2876999378204346, -1...."
4,5,Anxiety (Nervousness),"[-2.1256749629974365, -4.137800216674805, 4.55..."
...,...,...,...
267,295,Nipple discharge,"[-1.7614949941635132, 0.5206300020217896, -1.9..."
268,301,Shoulder stiffness or tightness,"[-1.2457798719406128, 2.181957483291626, -6.26..."
269,303,Arm stiffness or tightness,"[-1.7361524105072021, 1.2958674430847168, -4.8..."
270,304,High blood pressure,"[-0.959559977054596, 0.45383667945861816, -2.6..."


In [17]:
# Remove messy characters: swap every '\x0b' in the symptom text for a space
tidied_symptoms = symptom_df['symptom'].str.replace('\x0b', ' ')
symptom_df['symptom'] = tidied_symptoms

In [18]:
# Persist the cleaned illness/symptom pairs and the symptom table (including
# its embedding column) as pickle files.
# NOTE(review): assumes the 'input_data' directory already exists — confirm.
source_data.to_pickle('input_data/source_data.pkl')
symptom_df.to_pickle('input_data/symptoms.pkl')


### Create a vector of symptoms for each illness described in the data

Here we will loop through each illness described in the dataset
and flag (with a 1 or 0) each of the symptoms of that illness.
We will then compare a list of flagged symptoms described by
the user against these vectors, using a cosine similarity function,
to diagnose a potential illness.

In [None]:
# Debug check: show the column names of source_data.
# NOTE(review): this same cell is repeated several times in the notebook;
# consider keeping at most one copy.
print(source_data.columns)

Index(['illness_id', 'symptom_id', 'illness', 'symptom'], dtype='object')


In [23]:
# Collect the set of symptoms recorded for every illness in a single pass.
# A dict keeps insertion order, so illnesses stay in order of first appearance
# (the same order `drop_duplicates()` would give).
symptoms_by_illness = {}
for illness_name, symptom_name in zip(source_data['illness'], source_data['symptom']):
    symptoms_by_illness.setdefault(illness_name, set()).add(symptom_name)

# list of illness
illnesses = list(symptoms_by_illness)

# One 0/1 flag per row of symptom_df for each illness: 1 when that symptom is
# recorded for the illness, 0 otherwise. The flags are computed on the side,
# so no scratch column is left behind in symptom_df.
symptom_names = symptom_df['symptom']
symptom_vectors = [
    symptom_names.isin(related_symptoms).astype(int).tolist()
    for related_symptoms in symptoms_by_illness.values()
]

diagnosis_data = pd.DataFrame({"illness": illnesses,
                               "illness_vector": symptom_vectors})
diagnosis_data.to_pickle('input_data/diagnosis_data.pkl')


In [None]:
# NOTE(review): repeated debug cell (prints the column names of source_data);
# consider removing.
print(source_data.columns)

Index(['illness_id', 'symptom_id', 'illness', 'symptom'], dtype='object')


In [None]:
# NOTE(review): repeated debug cell (prints the column names of source_data);
# consider removing.
print(source_data.columns)

Index(['illness_id', 'symptom_id', 'illness', 'symptom'], dtype='object')


In [None]:
# NOTE(review): repeated debug cell (prints the column names of source_data);
# consider removing.
print(source_data.columns)

Index(['illness_id', 'symptom_id', 'illness', 'symptom'], dtype='object')


In [None]:
# NOTE(review): repeated debug cell (prints the column names of source_data);
# consider removing.
print(source_data.columns)

Index(['illness_id', 'symptom_id', 'illness', 'symptom'], dtype='object')


### Part 2:
### Generate training samples with flagged entities

In this part we will generate some training examples so the NLU model can learn how our users will describe symptoms to the chatbot. To do this we simply loop through our symptoms, sometimes combining several of them, and append them to different sentence beginnings.

We also make sure to tag every symptom description as a symptom entity, which will be understood by Rasa's NLU model. This is accomplished by wrapping each symptom in square brackets and appending '(symptom)', e.g. `[joint pain](symptom)`.

The output of this is inserted into nlu.md within the main project.

In [20]:
# Seeded generator so that re-running the notebook reproduces the same samples.
SAMPLE_SEED = 42
# How many training examples to print for each symptom count.
EXAMPLES_PER_COUNT = 100
sample_rng = random.Random(SAMPLE_SEED)

number_of_symtoms = [1, 2, 3, 4]
start_of_description = [
    "I have",
    "I'm suffering from",
    "I have really bad",
    "My symptoms are",
    "For the last few days I have had",
    "My husband is suffering from",
    "My wife is suffering from",
    "My son is suffering from",
    "My daughter is suffering from",
    "My child is suffering from",
    "I don't feel well, I have"
]


def tag_symptom(symptom):
    """Return ``symptom`` lower-cased, without parentheticals, tagged as an entity.

    Any "(...)" group is removed, runs of whitespace are collapsed to a single
    space (so a parenthetical in the middle of the text leaves no double
    space), and the result is wrapped as ``[text](symptom)``.
    """
    without_parentheses = re.sub(r"\([^)]+\)", "", symptom.lower())
    tidy_text = " ".join(without_parentheses.split())
    return f"[{tidy_text}](symptom)"


def build_training_sample(description_beginning, tagged_symptoms):
    """Return one "- <beginning> <symptoms>" training line.

    One symptom is used as-is, two are joined with "and", three are written as
    "a, b, and c", and four or more are comma separated.
    """
    symptom_total = len(tagged_symptoms)
    if symptom_total == 1:
        listing = tagged_symptoms[0]
    elif symptom_total == 2:
        listing = " and ".join(tagged_symptoms)
    elif symptom_total == 3:
        listing = f"{tagged_symptoms[0]}, {tagged_symptoms[1]}, and {tagged_symptoms[2]}"
    else:
        listing = ", ".join(tagged_symptoms)
    return f"- {description_beginning} {listing}"


# Tag every symptom once up front. dict.fromkeys removes duplicates while
# keeping order, and symptoms that become empty once their parenthetical is
# stripped are dropped.
symptom_pool = [
    tagged
    for tagged in dict.fromkeys(tag_symptom(text) for text in symptom_df['symptom'])
    if tagged != "[](symptom)"
]

# Get some examples of users describing different numbers of symptoms
for symptoms_count in number_of_symtoms:

    for _ in range(EXAMPLES_PER_COUNT):

        description_beginning = sample_rng.choice(start_of_description)

        # Draw exactly as many symptoms as this example needs, without
        # replacement, so one sentence never lists the same symptom twice.
        chosen_symptoms = sample_rng.sample(symptom_pool, symptoms_count)

        print(build_training_sample(description_beginning, chosen_symptoms))

- I'm suffering from [flatulence](symptom)
- My child is suffering from [darkening of the skin](symptom)
- For the last few days I have had [scrotal pain](symptom)
- My husband is suffering from [joint pain](symptom)
- My symptoms are [agitated](symptom)
- I have [liver failure](symptom)
- My daughter is suffering from [heel pain](symptom)
- I don't feel well, I have [alcohol abuse](symptom)
- I have really bad [rash](symptom)
- My son is suffering from [abscess](symptom)
- My husband is suffering from [elbow swelling](symptom)
- My wife is suffering from [tongue swelling](symptom)
- I don't feel well, I have [diarrhea](symptom)
- My wife is suffering from [ringing in ears](symptom)
- My son is suffering from [shoulder ache or pain](symptom)
- I'm suffering from [back ache or pain](symptom)
- My husband is suffering from [skin pain](symptom)
- I'm suffering from [painful urination](symptom)
- My wife is suffering from [skin bumps](symptom)
- My daughter is suffering from [bloody diarrh

In [21]:
# Save the per-illness symptom vectors. The original wrote `source_data` to
# this path, which on a top-to-bottom run overwrote the illness-vector file
# produced earlier in the notebook with the raw illness/symptom pairs.
diagnosis_data.to_pickle('input_data/diagnosis_data.pkl')

In [22]:
# NOTE(review): repeated debug cell (prints the column names of source_data);
# consider removing.
print(source_data.columns)

Index(['illness_id', 'symptom_id', 'illness', 'symptom'], dtype='object')
