In [1]:
from europeana_sparql import *
from text_mining import *

import requests
import requests
import os
import re
import string
from nltk import word_tokenize,sent_tokenize
from nltk.corpus import stopwords

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

How many objects from the DBNL can we find in Europeana?

In [5]:
# Data provider label used to select DBNL records via edm:dataProvider.
institution = 'Digitale bibliotheek voor de Nederlandse letteren, DBNL, Nederland'

# Count the distinct proxies that describe a CHO aggregated by Europeana and
# that sit in a local aggregation whose data provider is the DBNL.
# The literal SPARQL braces are doubled because the query is an f-string.
query = f"""
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX edm: <http://www.europeana.eu/schemas/edm/>
PREFIX ore: <http://www.openarchives.org/ore/terms/>
PREFIX html: <http://www.w3.org/1999/xhtml/vocab#>

SELECT COUNT( DISTINCT ?object )
WHERE {{

?object ore:proxyIn ?local_aggr .
?object ore:proxyFor ?cho .
?eur_aggr edm:aggregatedCHO ?cho .
?eur_aggr a edm:EuropeanaAggregation .

?local_aggr edm:provider ?glam .

?local_aggr edm:dataProvider '{institution}' .
}}
"""

df = run_query(query)

# 'callret-0.value' is the column under which the un-aliased COUNT comes back
# (presumably an endpoint-generated name -- confirm if the endpoint changes).
item_count = df['callret-0.value'].iloc[0]
print( f"There are {item_count} items in the DBNL collection." )

There are 17043 items in the DBNL collection.


What are the content types of the objects from the DBNL?

In [3]:

# Number of DBNL objects per edm:type (TEXT, VIDEO, SOUND, ...).
# The data provider literal is taken from `institution` (defined in the first
# query cell) instead of being repeated here, so both queries always select
# the same collection.
query = """
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX edm: <http://www.europeana.eu/schemas/edm/>
PREFIX ore: <http://www.openarchives.org/ore/terms/>
PREFIX html: <http://www.w3.org/1999/xhtml/vocab#>

SELECT ?type , COUNT(?type)
WHERE {

?object ore:proxyFor ?cho .
?eur_aggr edm:aggregatedCHO ?cho .
?eur_aggr a edm:EuropeanaAggregation .

?object ore:proxyIn ?local_aggr .
?local_aggr edm:dataProvider '"""+institution+"""' .

?object edm:type ?type .

}
GROUP BY ?type
"""

types = run_query(query)


# One line per content type: "<type>\t<count>".
# 'callret-1.value' is the column holding the un-aliased COUNT(?type).
for i,row in types.iterrows():
    print(f"{row['type.value']}\t{row['callret-1.value']}")

TEXT	16949
VIDEO	78
SOUND	16


Which metadata is available for these objects?

In [35]:

# Proxy of a single DBNL record, used as an example to inspect which
# predicates (metadata fields) are available.
example_proxy = 'http://data.europeana.eu/proxy/provider/2021602/33AD47F261E191383316211C879B87A182DEDD70'

# Fetch every predicate/object pair attached to that proxy.
# The literal SPARQL braces are doubled because the query is an f-string.
query = f"""
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX edm: <http://www.europeana.eu/schemas/edm/>
PREFIX ore: <http://www.openarchives.org/ore/terms/>
PREFIX html: <http://www.w3.org/1999/xhtml/vocab#>

SELECT ?predicate ?object
WHERE {{
<{example_proxy}> ?predicate ?object . 
}}

"""

metadata = run_query(query)


In [36]:

# Prefix -> namespace URI, used to abbreviate full predicate URIs for display.
ns = {
'dc:':'http://purl.org/dc/elements/1.1/',
'dcterms:':'http://purl.org/dc/terms/',
'edm:':'http://www.europeana.eu/schemas/edm/',
'ore:':'http://www.openarchives.org/ore/terms/',
'html:':'http://www.w3.org/1999/xhtml/vocab#',
'skos:':'http://www.w3.org/2004/02/skos/core#',
'rdf:':'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
}

# Print each predicate (in prefixed form) followed by its value.
for _, row in metadata.iterrows():
    predicate = row['predicate.value']
    for prefix, namespace_uri in ns.items():
        # Literal substring replacement: the namespace URIs contain regex
        # metacharacters such as '.', so they must not be used as patterns.
        predicate = predicate.replace(namespace_uri, prefix)

    print(f"{predicate}\n{row['object.value']}\n")

rdf:type
http://www.openarchives.org/ore/terms/Proxy

dcterms:created
1889

ore:proxyFor
http://data.europeana.eu/item/2021602/33AD47F261E191383316211C879B87A182DEDD70

dc:contributor
Couperus, Louis

dc:creator
Couperus, Louis

dc:date
2009-07-16

dc:identifier
coup002elin01

dc:identifier
http://www.dbnl.org/titels/titel.php?id=coup002elin01

dc:language
nld

dc:rights
© dbnl

dc:title
Eline Vere

dcterms:issued
2009-07-16

edm:europeanaProxy
false

edm:type
TEXT

ore:proxyIn
http://data.europeana.eu/aggregation/provider/2021602/33AD47F261E191383316211C879B87A182DEDD70



Which texts by Louis Couperus can be found in the DBNL collection?

In [18]:
# All TEXT objects in the DBNL collection whose dc:creator is Louis Couperus,
# with their title, creation date and identifier.
# `dcterms:` is declared explicitly because the query uses dcterms:created;
# without the declaration the query only works on endpoints that happen to
# predefine that prefix.
query = """
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX edm: <http://www.europeana.eu/schemas/edm/>
PREFIX ore: <http://www.openarchives.org/ore/terms/>
PREFIX html: <http://www.w3.org/1999/xhtml/vocab#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT *
WHERE {

?object ore:proxyIn ?local_aggr .
?object ore:proxyFor ?cho .
?eur_aggr edm:aggregatedCHO ?cho .
?eur_aggr a edm:EuropeanaAggregation .

?local_aggr edm:dataProvider '"""+institution+"""' .

?object dc:title ?title .
?object edm:type 'TEXT' .
?object dcterms:created ?date . 
?object dc:creator 'Couperus, Louis' .
?object dc:identifier ?id .

}

ORDER BY ASC(?object)
"""

couperus =run_query(query) 

# An object appears once per identifier it has (a record can carry several
# dc:identifier values), so count distinct objects rather than rows.
print( f'There are {couperus.drop_duplicates(subset="object.value", keep="last").shape[0]} titles by Louis Couperus.')

There are 78 titles by Louis Couperus.


In [19]:
# Keep one row per object, order by creation date and show the first 30.
unique_titles = couperus.drop_duplicates(subset="object.value")
earliest_titles = unique_titles.sort_values(by='date.value').iloc[:30]

for _, work in earliest_titles.iterrows():
    print(f"{work['title.value']} ({work['date.value']})\n{work['object.value']}\n")

Eline Vere (1889)
http://data.europeana.eu/proxy/provider/2021602/33AD47F261E191383316211C879B87A182DEDD70

Noodlot (1891)
http://data.europeana.eu/proxy/provider/2021602/8FC51446390C3D80B9A2B43C9346C2D9C5C33AC5

Extaze. Een boek van geluk (1892)
http://data.europeana.eu/proxy/provider/2021602/1B603DCEB299BADEE7E4DEBBDBF47AA11DFCF39C

Majesteit (1893)
http://data.europeana.eu/proxy/provider/2021602/B00183CC93B156930D1B496B52F0DD79D43D9649

Reis-impressies (1894)
http://data.europeana.eu/proxy/provider/2021602/6C9C0848A5B4B95D50ACBB8BAF95A914EEA3A3B8

Williswinde (1895)
http://data.europeana.eu/proxy/provider/2021602/98DAEB1ECCBDF89E150129A7953FDE8178A6A362

Wereldvrede (1895)
http://data.europeana.eu/proxy/provider/2021602/ADC739F26BE3171469C124EE47C5483A4FCDAE8D

De verzoeking van den H. Antonius (1896)
http://data.europeana.eu/proxy/provider/2021602/17F29134B883942D8E0F919030850B5BB0DE8578

Hooge troeven (1896)
http://data.europeana.eu/proxy/provider/2021602/85108325C48AF34A5AAE490DB

What are the most frequent words in Couperus' "Eline Vere"?

In [20]:
# DBNL identifiers of the texts to download
# ('De stille kracht' and 'Eline Vere').
ids = ['coup002stil01' ,'coup002elin01']

for text in ids:
    url = f'https://www.dbnl.org/nieuws/text.php?id={text}'
    print(url)
    response = requests.get(url)
    # A Response object is truthy only when the request succeeded.
    if response:
        # Save as '<id>.txt': that is the file name the analysis cells below
        # open. Writing to the bare id left those cells without their input
        # on a fresh run. The context manager guarantees the file is closed.
        with open(f'{text}.txt','w',encoding='utf-8') as out:
            out.write(response.text)
        print(f"{text} has been downloaded!")
    else:
        # Report the failure instead of skipping the text silently.
        print(f"{text} could not be downloaded (HTTP status {response.status_code}).")
    

https://www.dbnl.org/nieuws/text.php?id=coup002stil01
coup002stil01 has been downloaded!
https://www.dbnl.org/nieuws/text.php?id=coup002elin01
coup002elin01 has been downloaded!


In [25]:
# Read the full text of 'Eline Vere'; the context manager closes the file.
with open('coup002elin01.txt',encoding='utf-8') as text_file:
    full_text = text_file.read()

dutch_stopwords = stopwords.words('dutch')

# Tokenise the lower-cased text, then drop punctuation and Dutch stop words.
words = word_tokenize(full_text.lower())
words = remove_punctuation(words)
eline_vere = [word for word in words if word not in dutch_stopwords]

word_frequencies = Counter(eline_vere)
print(f'The novel contains {len(eline_vere)} words and {len(word_frequencies)} unique words')

# Number of top-ranked words to list. Deliberately not called `max`, which
# would shadow the builtin for the rest of the notebook.
top_n = 20

print('\nMost frequent words:')
for word,count in word_frequencies.most_common(top_n):
    print(f"{word} => {count}")

The novel contains 92753 words and 13691 unique words

Most frequent words:
zoo => 1240
den => 1212
eline => 1099
zoû => 810
hare => 570
mevrouw => 562
terwijl => 453
paul => 441
wel => 434
betsy => 434
zag => 407
vroeg => 372
sprak => 370
o => 363
dien => 361
otto => 332
eene => 324
marie => 322
waar => 319
goed => 301


In [26]:
# Read the full text of 'De stille kracht'; the context manager closes the file.
with open('coup002stil01.txt',encoding='utf-8') as text_file:
    full_text = text_file.read()

# Tokenise the lower-cased text, then drop punctuation and Dutch stop words
# (`dutch_stopwords` is defined in the 'Eline Vere' cell above).
words = word_tokenize(full_text.lower())
words = remove_punctuation(words)
stille_kracht = [word for word in words if word not in dutch_stopwords]

In [39]:
# Build the vocabulary of 'De stille kracht' once as a set, so each membership
# test is constant-time instead of a scan over the whole token list.
stille_kracht_vocabulary = set(stille_kracht)

# Every token occurrence from 'Eline Vere' that never appears in
# 'De stille kracht' (order preserved, so ties rank as before).
words_list = [word for word in eline_vere if word not in stille_kracht_vocabulary]

unique_words = Counter(words_list)

print("The following words from 'Eline Vere' are never used in 'De Stille Kracht':")
for word,count in unique_words.most_common(20):
    print(f"{word} => {count}")
    

The following words from 'Eline Vere' are never used in 'De Stille Kracht':
eline => 1099
betsy => 434
marie => 322
vincent => 285
lili => 278
henk => 277
heur => 244
freddy => 233
frédérique => 231
etienne => 206
mathilde => 185
emilie => 183
georges => 181
raat => 171
erlevoort => 128
jeanne => 125
oom => 107
verstraeten => 101
vere => 97
tante => 89


In [40]:
# Build the vocabulary of 'Eline Vere' once as a set, so each membership test
# is constant-time instead of a scan over the whole token list.
eline_vere_vocabulary = set(eline_vere)

# Every token occurrence from 'De stille kracht' that never appears in
# 'Eline Vere' (order preserved, so ties rank as before).
words_list = [word for word in stille_kracht if word not in eline_vere_vocabulary]

unique_words = Counter(words_list)

print("The following words from 'De Stille kracht' are never used in 'Eline Vere':")
for word,count in unique_words.most_common(20):
    print(f"{word} => {count}")

The following words from 'De Stille kracht' are never used in 'Eline Vere':
oudijck => 229
theo => 133
doddy => 120
addy => 108
laboewangi => 90
oerip => 54
batavia => 54
eldersma => 48
kandjeng => 41
ngadjiwa => 38
does => 37
bevolking => 37
indische => 35
luce => 35
pangéran => 34
ida => 33
patjaram => 32
ambtenaren => 28
javaansche => 26
soerabaia => 21


In [41]:
from nltk.text import Text
# Wrap the filtered token list of 'De stille kracht' in an NLTK Text object so
# that concordances can be generated from it. Because stop words and
# punctuation were already removed, the contexts shown are not literal quotes.
novel = Text(stille_kracht)

In [42]:
# Show up to 15 occurrences of 'batavia', each in a 50-character context window.
novel.concordance('batavia', lines=15, width=50)

Displaying 15 of 54 matches:
rezident mooie nonna batavia woonde zeide stil sp
ng twee kleine sinjo batavia meêbracht hunne groo
i wanhopig vervelend batavia waar twee maanden ge
ng vooral hare maand batavia ná zoo maand pervers
ren laboewangi dorst batavia deed geloofde nauwli
d geloofde nauwlijks batavia vertelde verzekerde 
ertellende kennissen batavia races buitenzorg bal
ent gewest open kwam batavia semarang soerabaia v
 ontstemd laboewangi batavia gedrukt verveling bi
uve mousseline japon batavia hoed mauve papavers 
 voorbijreed spoedig batavia eene teleurstelling 
s periodiek behoefte batavia ging paar maanden he
jne opvoeding geheel batavia gehad gymnazium will
lang mevrouw oudijck batavia vroeg ida twee maand
ndel mevrouw oudijck batavia geamuzeerd jullie ve


In [43]:
# Show up to 15 occurrences of 'soerabaia', each in a 50-character context window.
novel.concordance('soerabaia', lines=15, width=50)

Displaying 15 of 21 matches:
nkele minuten trein soerabaia aankwam zag mevrouw
am batavia semarang soerabaia vorstenlanden zoû h
de gedrild zoo goed soerabaia kwam voorstellingen
 italiaansche opera soerabaia verleden keer socie
oen alleen semarang soerabaia vorstenlanden heel 
eiden feest waarvan soerabaia zoû hooren streelde
 zoo kiespijn moest soerabaia den dentist zoû wel
lgenden dag vertrok soerabaia logeerde deed waarl
espijn zeide daarom soerabaia vond aardig hield z
 italiaansche opera soerabaia hoogte voelden groo
 zet zaal vol komen soerabaia meê helpen waar mev
 eva eva trok terug soerabaia waar ging boodschap
id hangen couranten soerabaia batavia toe versche
dijck hersteld ging soerabaia logeeren kennissen 
 weg léonie invloed soerabaia groote mannen den h
