In [1]:
import json
import re
import spacy as sp
import pandas as pd

with open('data/all_subjects.json') as f:
    subjects = json.load(f)

print(f'There are {len(subjects)} subjects in the dataset')

There are 640 subjects in the dataset


In [None]:
for subject in subjects:
    for category in subject['content']

#### Create dataframe with useful information

In [2]:
data = {
    'id': [],
    'subject_title': [],
    'summary': [],
    'synonyms': [],
    'icpc': []
}

for subject in subjects:
    data['id'].append(subject['id']) #extract the id 
    data['subject_title'].append(subject['title']) #extract the title of the subject
    data['summary'].append(subject['summary']) #extract the text of the subject
    synonyms = subject['synonyms'] #extract the synonyms of the subject
    if synonyms:
        data['synonyms'].append(', '.join(synonyms))
    else:
        data['synonyms'].append(None)
    icpc_codes = subject['codes']['icpc'] #extract the icpc code of the subject
    if icpc_codes:
        data['icpc'].append(', '.join(icpc_codes))
    else:
        data['icpc'].append(None)  # if there is no icpc code, give None

df = pd.DataFrame(data)

#clean the text in summary
html_pattern = re.compile(r'<[^>]+>')
escape_pattern = re.compile(r'\\u[0-9a-fA-F]{4}')
newline_pattern = re.compile(r'(\n|\t|\xa0|\u202f)')

clean_text = [newline_pattern.sub(' ', escape_pattern.sub(' ', html_pattern.sub(' ', item))) for item in df['summary']]

df['summary'] = [item.strip() for item in clean_text]

df

Unnamed: 0,id,subject_title,summary,synonyms,icpc
0,11381,Bijtwond,Een bijtwond komt door een beet van een dier o...,Gebeten,S13
1,11463,Open been,Een open been is een wond op het onderbeen die...,Ulcus cruris,S97
2,11323,Dikke enkels of onderbenen,Uw enkels of onderbenen kunnen dik worden door...,Oedeem,K07
3,11165,Wondroos,Wondroos is een ontsteking van de huid. He...,"Erysipelas, Cellulitis",S76.01
4,16408,Prik tegen pneumokokken,Pneumokokken zijn bacteriën. Vooral ouderen k...,Pneumokokkenprik,A44
...,...,...,...,...,...
635,12640,Vitiligo,Vitiligo is een aandoening waarbij de pigmentc...,,
636,22377,Downsyndroom,Kinderen met downsyndroom: hebben een verst...,,A90
637,16796,Trombose in het oog,Bij trombose in het oog zitten 1 of meer bloed...,,"K94, F94"
638,12081,Dikkere borsten bij mannen,Dikkere borsten komen bij pubers en bij mannen...,"Gynaecomastie, Borstzwelling bij mannen",Y29.01


In [11]:
#give me the number of categories
print(f'There are {len(df)} categories in the dataset')

There are 640 categories in the dataset


#### Exploration of subjects and ICPC codes

In [3]:
no_icpc = []
icpc_present = []

for i in range(len(df)):
    if df['icpc'][i] is None:
        no_icpc.append((df['id'][i], df['subject_title'][i]))
    else:
        icpc_present.append((df['id'][i], df['subject_title'][i], df['icpc'][i]))

print(f'Number of subjects with no ICPC codes: {len(no_icpc)}')
print(f'Number of subjects with ICPC codes: {len(icpc_present)}')

print("\nSubjects with no ICPC codes:")
for subject_id, subject_title in no_icpc:
    print(f'ID: {subject_id}, Title: {subject_title}')

Number of subjects with no ICPC codes: 48
Number of subjects with ICPC codes: 592

Subjects with no ICPC codes:
ID: 22397, Title: PEG-sonde
ID: 11150, Title: Bloed uit de vagina
ID: 26987, Title: Nadenken over je leven
ID: 25746, Title: Psychische klachten en werk
ID: 12635, Title: Te langzaam werkende schildklier
ID: 11417, Title: Zwanger
ID: 28624, Title: Slecht zien en blind zijn
ID: 21360, Title: Kijkoperatie van de knie
ID: 12642, Title: Pijnstillers zoals ibuprofen, naproxen en diclofenac (NSAID)
ID: 21160, Title: Bloedverdunners
ID: 20507, Title: Röntgenfoto's
ID: 17228, Title: MRI
ID: 20140, Title: CT-scan
ID: 11664, Title: Vitamine D
ID: 12016, Title: Ziekte van Scheuermann
ID: 21410, Title: Pijn onder in buik en in bekken
ID: 12361, Title: Ziek kind
ID: 12588, Title: Sterilisatie bij de vrouw
ID: 12584, Title: Hormoonring
ID: 19449, Title: Echo
ID: 27403, Title: Hoge druk in de hersenen
ID: 23875, Title: Uitzaaiingen in de hersenen
ID: 22777, Title: Veranderingen in zorg als 

In [4]:
icpc_present

[(11381, 'Bijtwond', 'S13'),
 (11463, 'Open been', 'S97'),
 (11323, 'Dikke enkels of onderbenen', 'K07'),
 (11165, 'Wondroos', 'S76.01'),
 (16408, 'Prik tegen pneumokokken', 'A44'),
 (11314, 'Eczeem', 'S87'),
 (11923, 'Eetprobleem', 'T05, T06'),
 (28622, 'Contacteczeem', 'S88'),
 (11098, 'Depressie', 'P03, P77, P76'),
 (11438, 'Zon en zonnebrand', 'S14'),
 (11243, 'Eczeem met schilfers op hoofd, gezicht of oren', 'S86'),
 (11429, 'Lange tijd of vaak neusklachten', 'R97, R07'),
 (11594, 'Allergie', 'A12'),
 (17786, 'Eczeem door erfelijke aanleg', 'S87'),
 (11596, 'Droge huid', 's21'),
 (11668, 'Jeuk', 's02'),
 (11385, 'Hooikoorts', 'R97'),
 (12278, 'Ontsteking van de huid rond de mond', 'S88'),
 (11184, 'Ziekte van Parkinson', 'N87'),
 (24122, 'Zwanger en corona', 'R83.03'),
 (24126, 'Corona in het ziekenhuis', 'R83.03'),
 (24128, 'Na corona', 'R83.03'),
 (24124, 'Corona hebben', 'R83.03'),
 (23992, 'Coronaprik', 'R83.03'),
 (15757, 'Corona', 'R83'),
 (24136, 'Lang klachten na corona', 

In [5]:
multiple_icpc = []
for i in range(len(df)):
    if df['icpc'][i] is not None:
        if ',' in df['icpc'][i]:
            multiple_icpc.append((df['id'][i], df['subject_title'][i], df['icpc'][i]))

print(f'Number of subjects with multiple ICPC codes: {len(multiple_icpc)}')

Number of subjects with multiple ICPC codes: 173


In [6]:
multiple_icpc

[(11923, 'Eetprobleem', 'T05, T06'),
 (11098, 'Depressie', 'P03, P77, P76'),
 (11429, 'Lange tijd of vaak neusklachten', 'R97, R07'),
 (24130, 'Naaste met corona', 'R83.03, Z22'),
 (11916, 'Darmkanker', 'D16, D75'),
 (11102, 'Artrose', 'L91, L90, L89, L84, L20'),
 (11221, 'Knieklachten', 'L99, L94, L90, L15, L78, L96'),
 (26412, 'Overgewicht bij kinderen', 'T83, T82'),
 (11630, 'Gezond leven', 'P17, T5, T83, T91'),
 (11387, 'Keelpijn', 'r76, R72, R21.01'),
 (11261, 'Reumatoïde artritis', 'N93, L88'),
 (12462, 'Kunstheup', 'L89, L75, L13'),
 (11181, 'Hartaanval', 'K76, K75, K74'),
 (28299, 'Werk en privé-leven', 'Z05, Z29'),
 (11968,
  'Communicatie, hoorproblemen en verstandelijke beperkingen',
  'P85, H86'),
 (11467, 'Slecht horen', 'H02, H84, H86'),
 (11294, 'Borstkanker opsporen', 'X26, X76, A97'),
 (11345, 'Ziekte van Dupuytren', 'L99.03, L87'),
 (21070, 'Plekje op de huid', 'S99, S77'),
 (11639, 'Moedervlekken', 'S82, S77'),
 (26410, 'Overgewicht bij volwassenen', 'T83, T82'),
 (1

In [7]:
df['icpc'].value_counts()
print(f'There are {len(df["icpc"].value_counts())} unique ICPC codes')

There are 494 unique ICPC codes


#### Exploration of subjects and synonyms

In [12]:
no_synonyms= []
synonyms_present = []

for i in range(len(df)):
    if df['synonyms'][i] is None:
        no_synonyms.append((df['id'][i], df['subject_title'][i]))
    else:
        synonyms_present.append((df['id'][i], df['subject_title'][i], df['synonyms'][i]))

print(f'Number of subjects with no synonyms listed: {len(no_synonyms)}')
print(f'Number of subjects with listed synonyms: {len(synonyms_present)}')

# print("\nSubjects with no ICPC codes:")
# for subject_id, subject_title in no_icpc:
#     print(f'ID: {subject_id}, Title: {subject_title}')

Number of subjects with no synonyms listed: 128
Number of subjects with listed synonyms: 512


In [10]:
synonyms_no_icpc = []
for i in range(len(df)):
    if df['synonyms'][i] is not None and df['icpc'][i] is None:
        synonyms_no_icpc.append((df['id'][i], df['subject_title'][i], df['synonyms'][i]))

print(f'Number of subjects with synonyms but no ICPC codes: {len(synonyms_no_icpc)}')
synonyms_no_icpc

Number of subjects with synonyms but no ICPC codes: 36


[(22397, 'PEG-sonde', 'Gastrostomie-katheter'),
 (11150, 'Bloed uit de vagina', 'ongesteld, menstruatie'),
 (26987, 'Nadenken over je leven', 'Zingeving, spiritualiteit, levensvragen'),
 (25746,
  'Psychische klachten en werk',
  'Re-integratie bij psychische klachten'),
 (12635, 'Te langzaam werkende schildklier', 'Hypothyreoïdie, Hashimoto'),
 (21360, 'Kijkoperatie van de knie', 'Artroscopie'),
 (12642,
  'Pijnstillers zoals ibuprofen, naproxen en diclofenac (NSAID)',
  'Ontstekingsremmer'),
 (20507, "Röntgenfoto's", 'Röntgenonderzoek'),
 (17228, 'MRI', 'Magnetic Resonance Imaging'),
 (11664, 'Vitamine D', 'Ergocalciferol, Colecalciferol, D'),
 (12016, 'Ziekte van Scheuermann', 'Scheuermann, Juveniele kyfose'),
 (21410, 'Pijn onder in buik en in bekken', 'Chronische bekkenpijn'),
 (12588, 'Sterilisatie bij de vrouw', 'Eileiders afsluiten'),
 (12584, 'Hormoonring', 'Vaginale ring, anticonceptie-ring'),
 (19449, 'Echo', 'Echografie'),
 (27403,
  'Hoge druk in de hersenen',
  'idiopathi

In [8]:
#extract title and summary  from each subject and store in a list
docs = []
for subject in subjects:
    doc = subject['title'] + '. ' + subject['summary']
    docs.append(doc)

html_pattern = re.compile(r'<[^>]+>')
escape_pattern = re.compile(r'\\u[0-9a-fA-F]{4}')
newline_pattern = re.compile(r'(\n|\t|\xa0|\u202f)')


clean_text = [newline_pattern.sub(' ', escape_pattern.sub(' ', html_pattern.sub(' ', item))) for item in docs]

clean_text = [item.strip() for item in clean_text]

In [29]:
nlp = sp.load('nl_core_news_sm')
for i in range(0,len(subjects)-1):
    doc = nlp(clean_text[i])
    for token in doc:
        print(token.text, token.pos_, token.dep_)

    #delete prepositions and stop words, and punctuation and lemmatize tokens (to get canonical form)
    lemmas = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.pos_ == 'ADP']
    print(lemmas)
    #give me all adjacent pairs of words
    pairs = []
    for i in range(len(lemmas)-1):
        pairs.append((lemmas[i], lemmas[i+1]))
    print(pairs)

Ziekte NOUN ROOT
van ADP case
Parkinson PROPN nmod
. PUNCT punct
   SPACE dep
Bij ADP case
de DET det
ziekte NOUN obl
van ADP case
Parkinson PROPN nmod
krijg VERB ROOT
je PRON nsubj
problemen NOUN obj
met ADP case
bewegen NOUN nmod
. PUNCT punct
    SPACE dep
Ook ADV advmod
bijvoorbeeld ADV advmod
denken VERB advcl
, PUNCT punct
slikken VERB conj
en CCONJ cc
slapen VERB conj
gaan VERB ROOT
vaak ADJ advmod
moeilijker ADJ advmod
. PUNCT punct
    SPACE dep
Medicijnen NOUN nsubj
kunnen AUX aux
ervoor ADV advmod
zorgen VERB ROOT
dat SCONJ mark
je PRON nsubj
minder DET advmod
stijf NOUN ccomp
bent AUX cop
en CCONJ cc
minder PRON advmod
trilt VERB conj
. PUNCT punct
    SPACE dep
Beweeg ADP case
elke DET det
dag NOUN obl
minstens ADV amod
een DET det
half ADJ amod
uur NOUN ROOT
. PUNCT punct
    SPACE dep
Maak VERB ROOT
je PRON nmod:poss
huis NOUN obj
veilig ADJ xcomp
, PUNCT punct
zodat SCONJ mark
je PRON nsubj
niet ADV advmod
valt VERB parataxis
. PUNCT punct
['ziekte', 'Parkinson', '  ', 

In [None]:
#