# Download page content per magazine

This notebook guides you through SRU and OAI of the KB: National Library of the Netherlands, in order to collect magazine pages based on magazine title.

### Install the necessary packages

In [None]:
%pip install pandas
%pip install requests
%pip install BeautifulSoup4
%pip install lxml
%pip install html5lib

### Import the necessary packages

In [1]:
## Import the necessary packages 
import pandas as pd
from bs4 import BeautifulSoup
import requests
import xml
import re

### Defining the API key
An API key is needed to query and download material.

In [2]:
apikey = "" #Insert the API key here

### Specifying the desired magazine
Specify the magazine that is of interest based on an alternative title in Delpher. The query is not case sensitive.

In [3]:
magazine_title = "De Ingenieur" 

### Retrieving the magazine identifiers
Before we can download the actual content, we need a list of magazines that were published by the desired magazine and a reference to the content. We put this list in a dataframe in which we store some additional metadata. This dataframe is used later for accessing the content of each page.

In [None]:
## Extract the identifiers
## This might take a while
identifierList = []
startRecord = 0
maximumRecords = 100
recordCounter = 0

## Assemble the query based on the parameters, we set the  maximumRecords to 1000 to prevent overloading the system
query = f"http://jsru.kb.nl/sru/sru?version=1.2&operation=searchRetrieve"\
        f"&x-collection=DTS_document"\
        f"&recordSchema=didl"\
        f"&startRecord={startRecord}"\
        f"&maximumRecords={maximumRecords}"\
        f"&query=(alternative=\"{magazine_title}\")"

print(query)

page = requests.get(query)
soup = BeautifulSoup(page.content,'xml')

# Total number of records
records = int(soup.find('srw:numberOfRecords').text)
print(f"Total number of records: {records}")

# Iterate through the query results to extract metadata
while recordCounter < records:
    # Update the query for pagination
    query = f"http://jsru.kb.nl/sru/sru?version=1.2&operation=searchRetrieve"\
            f"&x-collection=DTS_document"\
            f"&recordSchema=didl"\
            f"&startRecord={recordCounter + 1}"\
            f"&maximumRecords={maximumRecords}"\
            f"&query=(alternative=\"{magazine_title}\")"

    response = requests.get(query)
    soup = BeautifulSoup(response.content, 'xml')

    for item in soup.find_all('didl:Item'):
        # Extract article ID
        page_id = item.get('dc:identifier')

        # Iterate through each didl:Component in this didl:Item
        for component in item.find_all('didl:Component'):
            # Check for and extract OCR descriptor
            descriptor = component.find('didl:Descriptor')
            if descriptor:
                statement = descriptor.find('didl:Statement')
                if statement and statement.get('dc:type') == 'role' and statement.text == 'OCR':
                    resource = component.find('didl:Resource')
                    if resource and resource.get('ref'):
                        ocr_link = resource.get('ref')
                        ocr_metadata = component.find("didl:Component", {"dc:identifier": page_id})
                        identifierList.append({
                            'magazine': magazine_title,
                            'magazineID': page_id,
                            'ocrLink': ocr_link
                        })

    # Increment the record counter
    recordCounter += maximumRecords
    print(f"Processed {min(recordCounter, records)} of {records} records")

http://jsru.kb.nl/sru/sru?version=1.2&operation=searchRetrieve&x-collection=DTS_document&recordSchema=didl&startRecord=0&maximumRecords=100&query=(alternative="De Ingenieur")
Total number of records: 5360
Processed 100 of 5360 records
Processed 200 of 5360 records
Processed 300 of 5360 records
Processed 400 of 5360 records
Processed 500 of 5360 records
Processed 600 of 5360 records
Processed 700 of 5360 records
Processed 800 of 5360 records
Processed 900 of 5360 records
Processed 1000 of 5360 records
Processed 1100 of 5360 records
Processed 1200 of 5360 records
Processed 1300 of 5360 records
Processed 1400 of 5360 records
Processed 1500 of 5360 records
Processed 1600 of 5360 records
Processed 1700 of 5360 records
Processed 1800 of 5360 records
Processed 1900 of 5360 records
Processed 2000 of 5360 records
Processed 2100 of 5360 records
Processed 2200 of 5360 records
Processed 2300 of 5360 records
Processed 2400 of 5360 records
Processed 2500 of 5360 records
Processed 2600 of 5360 record

In [None]:
# Convert the list of identifiers to a DataFrame
df = pd.DataFrame(identifierList)
print(df.head())

       magazine           articleID  \
0  De Ingenieur  dts:2979001:mpeg21   
1  De Ingenieur  dts:2979001:mpeg21   
2  De Ingenieur  dts:2979001:mpeg21   
3  De Ingenieur  dts:2979001:mpeg21   
4  De Ingenieur  dts:2979001:mpeg21   

                                             ocrLink ocrConfidence  
0  http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  
1  http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  
2  http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  
3  http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  
4  http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  


In [None]:
# Remove duplicate OCR links where the magazine ID excludes page information
dfIdentifiers = df.drop_duplicates(subset='ocrLink', keep='last')
print(dfIdentifiers.head)

csvdfArticles = dfIdentifiers.to_html('Articles.html')

<bound method NDFrame.head of             magazine           articleID  \
0       De Ingenieur  dts:2979001:mpeg21   
1       De Ingenieur  dts:2979001:mpeg21   
2       De Ingenieur  dts:2979001:mpeg21   
3       De Ingenieur  dts:2979001:mpeg21   
4       De Ingenieur  dts:2979001:mpeg21   
...              ...                 ...   
118701  De Ingenieur  dts:2944020:mpeg21   
118702  De Ingenieur  dts:2944020:mpeg21   
118703  De Ingenieur  dts:2944020:mpeg21   
118704  De Ingenieur  dts:2944020:mpeg21   
118705  De Ingenieur  dts:2944020:mpeg21   

                                                  ocrLink ocrConfidence  
0       http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  
1       http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  
2       http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  
3       http://resolver.kb.nl/resolve?urn=dts:2979001:...          None  
4       http://resolver.kb.nl/resolve?urn=dts:2979001:...          None

### Retrieve the content of the articles
Based on the OCR links present in the Articles dataframe, the content of each page can be retrieved.

In [31]:
## Retrieve the content of the articles based on the identifiers
## If there are a lot of articles, this can take a while

contentList = []

for index, row in dfIdentifiers.head(100).iterrows():
  identifier = row['ocrLink']
  url = requests.get(identifier)

  if url.status_code == 200:
      soup = BeautifulSoup(url.content, "xml")
      text = ''
      for item in soup.findAll('p'):
          text = text + (item.text)
      contentList.append([identifier, text])
  else:
      contentList.append([identifier, "Not enough rights to view digital object"])   

## Create a dataframe
dfContent = pd.DataFrame(contentList, columns = ['ocrLink', 'content'])
len(dfContent)
dfContent.head(10)

Unnamed: 0,ocrLink,content
0,http://resolver.kb.nl/resolve?urn=dts:2979001:...,
1,http://resolver.kb.nl/resolve?urn=dts:2979001:...,
2,http://resolver.kb.nl/resolve?urn=dts:2979001:...,REGISTERVAN DEWERKEN VAN HET KONINKLIJK INSTIT...
3,http://resolver.kb.nl/resolve?urn=dts:2979001:...,
4,http://resolver.kb.nl/resolve?urn=dts:2979001:...,KONINKLIJK INSTITUUT VAN INGENIEURS.REGISTER19...
5,http://resolver.kb.nl/resolve?urn=dts:2979001:...,2REGISTER.dent voor de waarneming van het alge...
6,http://resolver.kb.nl/resolve?urn=dts:2979001:...,REGISTER.8gebouw voor het Koninklijk Instituut...
7,http://resolver.kb.nl/resolve?urn=dts:2979001:...,4REGISTER.Examens voor waterbouwkundige opzich...
8,http://resolver.kb.nl/resolve?urn=dts:2979001:...,;;kc.istkk.5Hesselb erg (J. H.) wordt lid. N. ...
9,http://resolver.kb.nl/resolve?urn=dts:2979001:...,6REGISTER.Mededeeling omtrent een reis naar Ne...


### Merge the dataframes
This is an additional step to store everything in one dataframe.

In [32]:
dfArticles = dfIdentifiers.merge(dfContent, on = 'ocrLink', how = 'inner' )
dfArticles.head(10)

Unnamed: 0,magazine,articleID,ocrLink,ocrConfidence,content
0,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,
1,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,
2,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,REGISTERVAN DEWERKEN VAN HET KONINKLIJK INSTIT...
3,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,
4,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,KONINKLIJK INSTITUUT VAN INGENIEURS.REGISTER19...
5,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,2REGISTER.dent voor de waarneming van het alge...
6,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,REGISTER.8gebouw voor het Koninklijk Instituut...
7,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,4REGISTER.Examens voor waterbouwkundige opzich...
8,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,;;kc.istkk.5Hesselb erg (J. H.) wordt lid. N. ...
9,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,6REGISTER.Mededeeling omtrent een reis naar Ne...


In [33]:
# Remove all rows with no content
# Some pages are empty and therefore contain no articles

dfArticles = dfArticles[dfArticles['content'].str.strip().ne('')]
dfArticles.head(10)

Unnamed: 0,magazine,articleID,ocrLink,ocrConfidence,content
2,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,REGISTERVAN DEWERKEN VAN HET KONINKLIJK INSTIT...
4,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,KONINKLIJK INSTITUUT VAN INGENIEURS.REGISTER19...
5,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,2REGISTER.dent voor de waarneming van het alge...
6,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,REGISTER.8gebouw voor het Koninklijk Instituut...
7,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,4REGISTER.Examens voor waterbouwkundige opzich...
8,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,;;kc.istkk.5Hesselb erg (J. H.) wordt lid. N. ...
9,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,6REGISTER.Mededeeling omtrent een reis naar Ne...
10,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,REGISTER.7Algemeen overzicht van de toepassing...
11,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,8REGISTER.Afdeeling voor Spoorwegbouw en Spoor...
12,De Ingenieur,dts:2979001:mpeg21,http://resolver.kb.nl/resolve?urn=dts:2979001:...,,KEGISTEK.9*J ong (J. de) wordt lid. N. 67. BN/...


In [34]:
csvdfArticles = dfArticles.to_html('Articles.html')