In [2]:
import json
import re
import spacy as sp
import pandas as pd

with open('data/all_subjects.json') as f:
    subjects = json.load(f)

print(f'There are {len(subjects)} subjects in the dataset')

There are 640 subjects in the dataset


## Exploratory Data Analysis

#### Create dataframe with useful information

In [10]:
subject_data = {
    'id': [subject['id'] for subject in subjects],
    'subject_title': [subject['title'] for subject in subjects],
    'url': [subject['link'] for subject in subjects],
    #'summary': [subject['summary'] for subject in subjects],
    #'synonyms': [', '.join(subject['synonyms']) if subject['synonyms'] else None for subject in subjects],
    'icpc': [', '.join(subject['codes']['icpc']) if subject['codes']['icpc'] else "" for subject in subjects],
    #'medical_source': [', '.join(subject['codes']['medical_source']) if subject['codes']['medical_source'] else None for subject in subjects]
}

subject_df = pd.DataFrame(subject_data)

subject_df


#convert subject_title, url and icpc to string
subject_df['subject_title'] = subject_df['subject_title'].astype(str)
subject_df['url'] = subject_df['url'].astype(str)
subject_df['icpc'] = subject_df['icpc'].astype(str)

#if there is a ; in icpc code column, convert to ,
subject_df['icpc'] = subject_df['icpc'].str.replace(';', ',')

#convert all lower case letters to upper case in icpc code column
subject_df['icpc'] = subject_df['icpc'].str.upper()

subject_df.to_csv('data/subject_data.csv', index=False)
print(subject_df.dtypes)



id                int64
subject_title    object
url              object
icpc             object
dtype: object


In [None]:
#clean the text in the summary
html_pattern = re.compile(r'<[^>]+>')
escape_pattern = re.compile(r'\\u[0-9a-fA-F]{4}')
newline_pattern = re.compile(r'(\n|\t|\xa0|\u202f)')

clean_text = [newline_pattern.sub(' ', escape_pattern.sub(' ', html_pattern.sub(' ', item))) for item in subject_df['summary']]

subject_df['summary'] = [item.strip() for item in clean_text]

#### Exploration of ICPC codes

In [None]:
subjects_no_icpc = []
subjects_icpc = []
count = 0

for i in range(len(subject_df)):
    subject_id = subject_df['id'][i]
    subject_title = subject_df['subject_title'][i]
    icpc_code = subject_df['icpc'][i]

    
    if icpc_code is None:
        subjects_no_icpc.append((subject_id, subject_title))
    else:
        # Check for invalid ICPC codes and correct them
        if ';' in icpc_code:
            count+= 1
            icpc_code = icpc_code.replace(';', ',')
            subject_df.loc[subject_df['id'] == subject_id, 'icpc'] = icpc_code

        subjects_icpc.append((subject_id, subject_title, icpc_code))

print(f'Number of subjects with no ICPC codes: {len(subjects_no_icpc)}')
print(f'Number of subjects with ICPC codes: {len(subjects_icpc)}')

print(count)

Number of subjects with no ICPC codes: 48
Number of subjects with ICPC codes: 592
22


In [None]:
subjects_multiple_icpc = [item for item in subjects_icpc if ',' in item[2]]
print(f'Number of subjects with multiple ICPC codes: {len(subjects_multiple_icpc)}')

Number of subjects with multiple ICPC codes: 193


In [None]:
subjects_specific_icpc = [item for item in subjects_icpc if '.' in item[2]]
print(f'Number of subjects with specific ICPC codes: {len(subjects_specific_icpc)}')

Number of subjects with specific ICPC codes: 97


In [None]:
subject_df[subject_df['subject_title'].str.contains('buikpijn', case=False, na=False)]


Unnamed: 0,id,subject_title,summary,synonyms,icpc,medical_source
223,11672,Buikpijn,"Buikpijn komt vaak door te vet eten, een virus...",Pijn in de buik,"D01, D02, D06, D08, D12, D73, D87",NHG
323,11833,Buikpijn bij kinderen,Buikpijn bij kinderen kan bijvoorbeeld komen d...,,"D87, D73, D12, D08, D06, D02, D01","FMS, NHG"


In [None]:
#ICPC codes can be used to classify the situations into different categories. 
#The categories are: symptomen en klachten (01-29), diagnostische/preventieve verrichtingen (30-49), 
#medicatie/therapeutische verrichtingen (50-59), uitslagen van onderzoek (60-61), administratieve verrichtingen (62), v
#erwijzingen/andere verrichtingen (63-69), ziekten (70-99).

subject_df['icpc_category'] = subject_df['id'].apply(lambda x: [])


def assign_icpc_category(category, subject_id):
    categories_map = {
        (1, 29): 'symptomen en klachten',
        (30, 49): 'diagnostische/preventieve verrichtingen',
        (50, 59): 'medicatie/therapeutische verrichtingen',
        (60, 61): 'uitslagen van onderzoek',
        (62, 62): 'administratieve verrichtingen',
        (63, 69): 'verwijzingen/andere verrichtingen',
        (70, 99): 'ziekten'
    }

    categories = []
    for cat_range, cat_name in categories_map.items():
        if cat_range[0] <= category <= cat_range[1]:
            categories.append(cat_name)

    return categories

for subject in subjects_icpc:
    icpc_code = subject[2].replace(" ", "")
    categories = set()

    if ',' in icpc_code:
        categories_list = icpc_code.split(',')
        for category in categories_list:
            if category:
                
                category = int(category[1:3])
                new_categories = assign_icpc_category(category, subject[0])
                for new_cat in new_categories:
                    categories.add(new_cat)
    else:
        category = int(icpc_code[1:3])
        new_categories = assign_icpc_category(category, subject[0])
        for new_cat in new_categories:
            categories.add(new_cat)

    # Find the indices where the condition is true
    indices = subject_df.index[subject_df['id'] == subject[0]].tolist()

    # Update categories for each index separately
    for idx in indices:
        current_categories = set(subject_df.at[idx, 'icpc_category'])
        current_categories.update(categories)
        subject_df.at[idx, 'icpc_category'] = list(current_categories)


In [None]:
icpc_category_counts = subject_df['icpc_category'].apply(pd.Series).stack().value_counts()
icpc_category_counts
 
subject_df[subject_df['icpc_category'].apply(lambda x: 'verwijzingen/andere verrichtingen' in x)]

  icpc_category_counts = subject_df['icpc_category'].apply(pd.Series).stack().value_counts()


Unnamed: 0,id,subject_title,summary,synonyms,icpc,medical_source,icpc_category
30,11878,Levenseinde,Het is belangrijk om met uw arts te praten ove...,"Overlijden, Doodgaan, Sterven, Wilsverklaring,...",A69.01,"FMS, NHG",[verwijzingen/andere verrichtingen]
476,14821,Vermoeidheid bij kanker die niet meer te genez...,"Als kanker niet meer te genezen is, kunt u vee...",,"A04, A69.01",NHG,"[verwijzingen/andere verrichtingen, symptomen ..."


In [None]:
subject_df[(subject_df['medical_source'].str.contains('FMS', na=False)) & (subject_df['icpc'].notnull())]

#alleen allebei
subjects_fms_nhg = subject_df[(subject_df['medical_source'].str.contains('FMS', na=False)) & (subject_df['medical_source'].str.contains('NHG', na=False)) & (subject_df['icpc'].notnull())].drop(columns=['summary'])

subjects_fms_nhg.to_csv('data/subjects_fms_nhg.csv', index=False)

#alleen FMS en niet ook NHG 
subjects_fms = subject_df[(subject_df['medical_source'].str.contains('FMS', na=False)) & (subject_df['medical_source'].str.contains('NHG', na=False) == False) & (subject_df['icpc'].notnull())].drop(columns=['summary'])

#save the dataframe to a csv file
subjects_fms.to_csv('data/subjects_only_fms.csv', index=False)



#### Exploration of medical sources

In [None]:
subjects_no_medical_source = []
for i in range(len(subject_df)):
    if subject_df['medical_source'][i] is None:
        subjects_no_medical_source.append((subject_df['id'][i], subject_df['subject_title'][i]))

print(f'Number of subjects with no medical source: {len(subjects_no_medical_source)}')

Number of subjects with no medical source: 26


#### Exploration of synonyms

In [None]:
no_synonyms= []
synonyms_present = []

for i in range(len(subject_df)):
    if subject_df['synonyms'][i] is None:
        no_synonyms.append((subject_df['id'][i], subject_df['subject_title'][i]))
    else:
        synonyms_present.append((subject_df['id'][i], subject_df['subject_title'][i], subject_df['synonyms'][i]))

print(f'Number of subjects with no synonyms listed: {len(no_synonyms)}')
print(f'Number of subjects with listed synonyms: {len(synonyms_present)}')

Number of subjects with no synonyms listed: 128
Number of subjects with listed synonyms: 512


In [None]:
synonyms_no_icpc = []
for i in range(len(subject_df)):
    if subject_df['synonyms'][i] is not None and subject_df['icpc'][i] is None:
        synonyms_no_icpc.append((subject_df['id'][i], subject_df['subject_title'][i], subject_df['synonyms'][i]))

print(f'Number of subjects with synonyms but no ICPC codes: {len(synonyms_no_icpc)}')
synonyms_no_icpc

Number of subjects with synonyms but no ICPC codes: 36


[(22397, 'PEG-sonde', 'Gastrostomie-katheter'),
 (11150, 'Bloed uit de vagina', 'ongesteld, menstruatie'),
 (26987, 'Nadenken over je leven', 'Zingeving, spiritualiteit, levensvragen'),
 (25746,
  'Psychische klachten en werk',
  'Re-integratie bij psychische klachten'),
 (12635, 'Te langzaam werkende schildklier', 'Hypothyreoïdie, Hashimoto'),
 (21360, 'Kijkoperatie van de knie', 'Artroscopie'),
 (12642,
  'Pijnstillers zoals ibuprofen, naproxen en diclofenac (NSAID)',
  'Ontstekingsremmer'),
 (20507, "Röntgenfoto's", 'Röntgenonderzoek'),
 (17228, 'MRI', 'Magnetic Resonance Imaging'),
 (11664, 'Vitamine D', 'Ergocalciferol, Colecalciferol, D'),
 (12016, 'Ziekte van Scheuermann', 'Scheuermann, Juveniele kyfose'),
 (21410, 'Pijn onder in buik en in bekken', 'Chronische bekkenpijn'),
 (12588, 'Sterilisatie bij de vrouw', 'Eileiders afsluiten'),
 (12584, 'Hormoonring', 'Vaginale ring, anticonceptie-ring'),
 (19449, 'Echo', 'Echografie'),
 (27403,
  'Hoge druk in de hersenen',
  'idiopathi

## Natural Language Processing

In [None]:
#extract title and summary  from each subject and store in a list
docs = []
for subject in subjects:
    doc = subject['title'] + '. ' + subject['summary']
    docs.append(doc)

html_pattern = re.compile(r'<[^>]+>')
escape_pattern = re.compile(r'\\u[0-9a-fA-F]{4}')
newline_pattern = re.compile(r'(\n|\t|\xa0|\u202f)')


clean_text = [newline_pattern.sub(' ', escape_pattern.sub(' ', html_pattern.sub(' ', item))) for item in docs]

clean_text = [item.strip() for item in clean_text]

In [3]:
nlp = sp.load('nl_core_news_sm')
for i in range(0,len(subjects)-1):
    doc = nlp("Deze gel of crème mag je niet gebruiken als je zwanger bent of wilt worden.")
    for token in doc:
        print(token.text, token.pos_, token.dep_)

    #delete prepositions and stop words, and punctuation and lemmatize tokens (to get canonical form)
    lemmas = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.pos_ == 'ADP']
    print(lemmas)
    #give me all adjacent pairs of words
    pairs = []
    for i in range(len(lemmas)-1):
        pairs.append((lemmas[i], lemmas[i+1]))
    print(pairs)

Deze DET det
gel NOUN obj
of CCONJ cc
crème ADJ conj
mag AUX aux
je PRON nsubj
niet ADV advmod
gebruiken VERB ROOT
als SCONJ mark
je PRON nsubj
zwanger ADJ advcl
bent AUX cop
of CCONJ cc
wilt VERB conj
worden AUX cop
. PUNCT punct
['gel', 'crème', 'gebruiken', 'zwanger', 'willen']
[('gel', 'crème'), ('crème', 'gebruiken'), ('gebruiken', 'zwanger'), ('zwanger', 'willen')]
Deze DET det
gel NOUN obj
of CCONJ cc
crème ADJ conj
mag AUX aux
je PRON nsubj
niet ADV advmod
gebruiken VERB ROOT
als SCONJ mark
je PRON nsubj
zwanger ADJ advcl
bent AUX cop
of CCONJ cc
wilt VERB conj
worden AUX cop
. PUNCT punct
['gel', 'crème', 'gebruiken', 'zwanger', 'willen']
[('gel', 'crème'), ('crème', 'gebruiken'), ('gebruiken', 'zwanger'), ('zwanger', 'willen')]
Deze DET det
gel NOUN obj
of CCONJ cc
crème ADJ conj
mag AUX aux
je PRON nsubj
niet ADV advmod
gebruiken VERB ROOT
als SCONJ mark
je PRON nsubj
zwanger ADJ advcl
bent AUX cop
of CCONJ cc
wilt VERB conj
worden AUX cop
. PUNCT punct
['gel', 'crème', 'geb

In [None]:
#