In [1]:
from stcn import *
import json
from collections import Counter

In [2]:
query = '''

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <http://schema.org/>
PREFIX kb: <http://data.bibliotheken.nl/def#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT * WHERE {
?title schema:mainEntityOfPage ?mainEntity .
?mainEntity schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn>  . 

?title schema:inLanguage 'en' .

} 

'''

df = run_query(query)
print( f'There {df.drop_duplicates(subset="title.value", keep="last").shape[0]} English titles in the STCN.')

There 1299 English titles in the STCN.


In [3]:
query = '''

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <http://schema.org/>
PREFIX kb: <http://data.bibliotheken.nl/def#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT * WHERE {
?resource schema:mainEntityOfPage ?mainEntity .
?mainEntity schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn>  . 

?resource schema:inLanguage 'ang' .

OPTIONAL {
?resource schema:name ?title . }

OPTIONAL {
?resource schema:author ?author_node . 
?author_node schema:author ?author_info .
?author_info  rdfs:label ?author_name . }

}

'''

oe_df = run_query(query)
print( f'There {oe_df.drop_duplicates(subset="title.value", keep="last").shape[0]} Old English titles in the STCN.')

There 5 Old English titles in the STCN.


## Titles in English

The cell below requests more detailed metadata about the titles in English.

In [4]:
query = '''

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <http://schema.org/>
PREFIX kb: <http://data.bibliotheken.nl/def#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT * WHERE {

?resource schema:mainEntityOfPage ?mainEntity .
?mainEntity schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn>  . 

?resource schema:inLanguage 'en' .
?resource schema:publication ?publ .


?resource schema:name ?title . 

OPTIONAL {
?resource schema:alternateName ?alt_title . }

OPTIONAL {
?resource schema:author ?author_node . 
?author_node schema:author ?author_info .
?author_info  rdfs:label ?author_name . }

OPTIONAL {
?resource schema:inLanguage ?language . }


OPTIONAL {
?publ schema:startDate ?publ_year . 
?publ schema:description ?imprint . 
?publ schema:publishedBy ?publisher . 
?publisher schema:name ?publ_name .
?publisher schema:location ?location_node .
?location_node schema:address ?address .
?address schema:addressLocality ?place . 
}

OPTIONAL { 
?resource schema:about ?subject .
?subject skos:prefLabel ?subject_label .
} 


}

'''

df = run_query(query)

print(df.shape)
print(df.columns)


(3938, 36)
Index(['resource.type', 'resource.value', 'mainEntity.type',
       'mainEntity.value', 'publ.type', 'publ.value', 'title.type',
       'title.value', 'alt_title.type', 'alt_title.value', 'author_node.type',
       'author_node.value', 'author_info.type', 'author_info.value',
       'author_name.type', 'author_name.value', 'language.type',
       'language.value', 'publ_year.type', 'publ_year.value', 'imprint.type',
       'imprint.value', 'publisher.type', 'publisher.value', 'publ_name.type',
       'publ_name.value', 'location_node.type', 'location_node.value',
       'address.type', 'address.value', 'place.type', 'place.value',
       'subject.type', 'subject.value', 'subject_label.type',
       'subject_label.value'],
      dtype='object')


## Create JSON

To make the analysis easier, the metadata are converted to JSON

In [5]:
import numpy as np

topics_dict = dict()


def read_topics():
    topics_dict = dict()
    topics = pd.read_excel('Topics.xlsx')
    for i,row in topics.iterrows():
        topics_dict[ row['topic'] ] = row['group']
    return topics_dict
    

def cluster_subject(subject):
    
    global topics_dict
    
    if not topics_dict:
        topics_dict = read_topics()
    
    if not(pd.isna(subject)):    
        return topics_dict[subject]
    else:
        return np.nan

topics_dict = read_topics()
    
df['subject_cluster'] = df['subject_label.value'].apply(cluster_subject)


def year_publication(year_str):
    if re.search( r'\d{4}' , str(year_str) ):
        return int(year_str)
    else:
        return np.nan
    
df['year'] = df['publ_year.value'].apply(year_publication)

In [6]:
unique_ids = df.sort_values('year')['resource.value'].unique()

data = []

def print_value(value):
    if pd.isna(value):
        return ''
    else:
        return value
    
def get_values(df,field,fields_dict):
    rows = df.drop_duplicates(field)
    all_rows = []
    for i,row in rows.iterrows():
        values = dict()
        for f in fields_dict:
            values[fields_dict[f]] = print_value(row[f])
        all_rows.append(values)
    return all_rows    

for resource in unique_ids:
    record = dict()
    record['ppn'] = resource
    #print(resource)

    df_resource = df[ df['resource.value'] == resource ]
    
    record['title'] = df_resource.iloc[0]['title.value']
    record['alternative_title'] = df_resource.iloc[0]['alt_title.value']
    record['year'] = df_resource.iloc[0]['publ_year.value']
    
    ## Publishers
    unique_id = 'publisher.value'
    fields_dict = { 'publisher.value':'ppn',
        'publ_name.value':'name'
        #'place.value':'place'
    }
    
    all_publishers = get_values(df_resource,unique_id,fields_dict)
    record['publishers'] = all_publishers

    ## Subjects
    unique_id = 'subject.value'
    fields_dict = {'subject.value':'ppn',
              'subject_label.value':'label',
               'subject_cluster':'group' }
    all_subjects = get_values(df_resource,unique_id,fields_dict)
    record['subjects'] = all_subjects
    
    ## Authors
    unique_id = 'author_info.value'
    fields_dict = {'author_info.value':'ppn',
              'author_name.value':'name' }
    all_authors = get_values(df_resource,unique_id,fields_dict)
    record['authors'] = all_authors
    
    ## Languages
    unique_id = 'language.value'
    fields_dict = {'language.value':'code' }
    all_languages = get_values(df_resource,unique_id,fields_dict)
    record['languages'] = all_languages
    
    data.append(record)
    
with open('english_titles.json','w',encoding='utf-8') as out:
    out.write(json.dumps(data,indent=4))

In [7]:
f = open('english_titles.json')
json_data = json.load(f)
f.close()

## Bibles

In [12]:
count = 0 

for book in json_data:
    #print(book['title'])
    if re.search( 'geneva', str(book['alternative_title']) , re.IGNORECASE):
        count += 1
        print(f"{book['ppn']}\n{book['title']}\n{book['year']}")
        print({book['alternative_title']})
        for publ in book['publishers']:
            print(publ['name'])
        print('\n\n')
print(count)

http://data.bibliotheken.nl/id/nbt/p320837467
The CL. psalmes of David in prose and meter. For the vse of the Kirk of Scotland
1601
{'Bible English Geneva'}
Hart, Andrew
Charteris, Henry (heirs)
Canin, Abraham



http://data.bibliotheken.nl/id/nbt/p265019737
The New Testament of ovr Lord Iesvs Christ.
1601
{'Bible English Geneva'}
Charteris, Henry (heirs)
Hart, Andrew
Canin, Isaac Jansz



http://data.bibliotheken.nl/id/nbt/p335881874
The Bible, that is, The holy Scriptures conteined in the Olde and Newe Testament
1633
{'Bible English Geneva'}
Stam, Jan Fredericksz
Craffort, Thomas



http://data.bibliotheken.nl/id/nbt/p335875777
The Bible, that is, The holy Scriptures conteined in the Olde and Newe Testament
1633
{'Bible English Geneva'}
Barker, Christopher (deputies of)
Stam, Jan Fredericksz



http://data.bibliotheken.nl/id/nbt/p08720021X
The Bible: that is, The Holy Scriptvres contained in the Old and New Testament
1640
{'Bible English Geneva'}
Stafford, Thomas
Boxe, Willem Christi

## Average number of books per year

In [9]:
years = Counter()

for book in json_data:
    years.update([book['year']])
        
for year,count in years.most_common():
    print(year,count)
        
print(sum(years.values()))

nan 96
17XX 79
1640 28
1638 26
1688 24
16XX 20
1624 18
1637 17
1649 17
1687 14
1712 14
1609 13
1639 13
1652 13
1644 12
1682 12
1686 12
1689 12
1635 11
1675 11
1678 11
1711 11
1720 11
1604 10
1619 10
1620 10
1621 10
1622 10
1750 10
1605 9
1610 9
1615 9
1650 9
1673 9
1677 9
1590 8
1602 8
1617 8
1618 8
1636 8
1672 8
1710 8
1719 8
1721 8
1752 8
1599 7
1608 7
1625 7
1629 7
1641 7
1643 7
1651 7
1660 7
1680 7
1683 7
1691 7
1692 7
1718 7
1728 7
1738 7
1765 7
163X 7
1597 6
1616 6
1623 6
1628 6
1630 6
1633 6
1634 6
1646 6
1659 6
1661 6
1663 6
1684 6
1727 6
1730 6
1740 6
1754 6
1759 6
1586 5
1611 5
1626 5
1631 5
1632 5
1645 5
1648 5
1662 5
1664 5
1700 5
1731 5
1735 5
1772 5
1784 5
179X 5
170X 5
1588 4
1600 4
1601 4
1606 4
1607 4
1614 4
1653 4
1654 4
1657 4
1679 4
1685 4
1690 4
1695 4
1697 4
1698 4
1716 4
1725 4
1732 4
1741 4
1751 4
1764 4
1782 4
1581 3
1582 3
1584 3
1585 3
1589 3
1598 3
1603 3
1642 3
1647 3
1655 3
1658 3
1666 3
1676 3
1703 3
1722 3
1724 3
1726 3
1733 3
1737 3
1744 3
1745 3
1747 3

In [10]:
print(min(years.keys()))
print(max(years.keys()))

print(int(max(years.keys()))-int(min(years.keys())))

## All books

years = Counter()

for book in json_data:
    years.update([book['year']])
    
total = sum(years.values())
print(total)
print(len(years.keys()))
print(total/len(years.keys()))

TypeError: '<' not supported between instances of 'float' and 'str'

In [None]:

for book in json_data:
    if not pd.isna(book['year']) and re.search( r'\d{4}' , book['year']):
        if int(book['year']) == 1638: 
            print(f"{book['ppn']}\n{book['title']}\n{book['year']}")
            for publ in book['publishers']:
                print(publ['name'])
            print('\n\n')
    
print(count)

## List of all titles

In [11]:
count = 0
for book in json_data:
    print(f"{book['ppn']}\n{book['title']}\n{book['year']}\n\n")
    count += 1
        
print(count)


http://data.bibliotheken.nl/id/nbt/p424370913
Exercitatio alphabetica nova et utilissima, variis expressa lingvis et characteribvs.
1569


http://data.bibliotheken.nl/id/nbt/p344098192
Certayne newes of the whole discription, ayde, and helpe of the christian princes and nobles, the vvhich for the comfort and deliuerance of the poore Christians in the low Countries, are gathered together.
1574


http://data.bibliotheken.nl/id/nbt/p409609145
Colloqves ov dialogves avec vn dictionaire en six langues: flamen, anglois, alleman, françois, espaignol, & italien [...].
1576


http://data.bibliotheken.nl/id/nbt/p832930423
The apologie or defence, of [...] prince William [...] prince of Orange [...] against the proclamation [...] by the king of Spaine
1581


http://data.bibliotheken.nl/id/nbt/p298780496
The apologie or defence, of [...] prince William [...] prince of Orange [...] against the proclamation [...] by the king of Spaine
1581


http://data.bibliotheken.nl/id/nbt/p264085450
A pleasavnt

## Subjects

In [None]:
subjects = []

for book in json_data:
    subjects.append(book['subjects'][0]['group'])
        
subjects_freq = Counter(subjects)

total = sum(subjects_freq.values())

for subject,count in subjects_freq.most_common():
    print(f"{subject} {count} {(count/total)*100}")

In [None]:
## Grammars and dictionaries

In [None]:
publishers = Counter()
count = 0

for book in json_data:
    if book['subjects'][0]['group'] == 'Language and literature':
        count += 1
        print(f"{book['ppn']}\n{book['title']}\n{book['year']}")
        publ_name = ''
        for publ in book['publishers']:
            print(publ['name'])
            publ_name += publ['name'] + '; '
        publishers.update([publ_name])
        for subject in book['subjects']:
            print(subject['label'])
        print('\n\n')
        
print(count)
        
for p,i in publishers.most_common():
    print(f"{p},{i}")

In [22]:
publishers = Counter()
count = 0

for book in json_data:
    if re.search( r'\d{4}', str(book['year']) ) and int(book['year']) > 1700:
        if book['subjects'][0]['group'] == 'Language and literature' and not re.search(r'(dict)|(gramm)|(taal)|(language)',book['title'],re.IGNORECASE):
            count += 1
            print(f"{book['ppn']}\n{book['title']}\n{book['year']}")
            publ_name = ''
            for publ in book['publishers']:
                print(publ['name'])
                publ_name += publ['name'] + '; '
            publishers.update([publ_name])
            for subject in book['subjects']:
                print(subject['label'])
            print('\n\n')
        
print(count)
        
for p,i in publishers.most_common():
    print(f"{p},{i}")

http://data.bibliotheken.nl/id/nbt/p268938709
Love for love: a comedy.
1710
Johnson, Thomas
s.n.
English language and literature



http://data.bibliotheken.nl/id/nbt/p402314840
The rehearsal; a comedy.
1710
s.n.
Johnson, Thomas
English language and literature



http://data.bibliotheken.nl/id/nbt/p402315154
The chances, a comedy.
1710
Johnson, Thomas
s.n.
English language and literature



http://data.bibliotheken.nl/id/nbt/p268937818
The old batchelour. A comedy.
1710
Johnson, Thomas
s.n.
English language and literature



http://data.bibliotheken.nl/id/nbt/p412692457
Hamlet, prince of Denmark. A tragedy.
1710
Johnson, Thomas
s.n.
English language and literature



http://data.bibliotheken.nl/id/nbt/p326673261
She wou'd if she cou'd. A comedy.
1710
s.n.
Johnson, Thomas
English language and literature



http://data.bibliotheken.nl/id/nbt/p268937982
The mourning bride. A tragedy.
1711
Johnson, Thomas
English language and literature



http://data.bibliotheken.nl/id/nbt/p412692538
The 

## Publishers

In [None]:
publishers = Counter()

for book in json_data:
    publ_name = ''
    for publ in book['publishers']:
        publ_name += publ['name'] + '; '
    publishers.update([publ_name])


print(count)


print('Shilder')
    
total = 0
for p,i in publishers.most_common():
    if re.search('Schilders, Richard' , p):
        print(p)
        total += i
print(total)
        
print('Giles Thorp')
    
total = 0
for p,i in publishers.most_common():
    if re.search('Thorp' , p):

        total += i
print(total)

print('Cann')

total = 0
for p,i in publishers.most_common():
    if re.search('Cann' , p):
        total += i
print(total)

print('Thomas Johnson')

total = 0
for p,i in publishers.most_common():
    if re.search('Johnson, Thomas' , p):

        total += i
print(total)

In [None]:
## Titles by Johnson

In [None]:
publishers = Counter()
count = 0

for book in json_data:

    publ_name = ''
    for publ in book['publishers']:
            publ_name += publ['name'] + '; '
            
    print(book)
            
    if re.search('Johnson, Thomas' , publ_name):
        print(f"{book['ppn']}\n{book['title']}\n{book['year']}")



In [None]:
df_unique = df.drop_duplicates('resource.value')
print(df_unique.query( 'year>=1450 and year<1600' ).shape)
print(df_unique.query( 'year>=1600 and year<1700' ).shape)
print(df_unique.query( 'year>=1700 and year<1800' ).shape)
print(df_unique.query( 'year>=1800' ).shape)

In [None]:
pre_1600 = df.query( 'year>=1450 and year<1600' )

df_grouped = pre_1600.groupby('publisher.value')['resource.value'].count()
publishers = pd.DataFrame( data = df_grouped.sort_values(ascending=False) )
publishers = publishers.reset_index()
publishers.columns = ['publisher','nr_books']

print(publishers)

In [None]:
62 + 693 + 307

In [None]:
## Can 

In [None]:
for i,row in df.iterrows():
    imprint = row['imprint.value']
    if re.search( r'=' , str(imprint) ):
        print(row['resource.value'])
        print(row['title.value'])
        print(imprint)