In [1]:
import requests
from bs4 import BeautifulSoup
from lxml import etree
from tqdm.notebook import trange, tqdm
import json
import pandas as pd

from utils import get_text, determine_qp_type

In [None]:
persons_base_url = 'https://cau.gelehrtenverzeichnis.de/persons'
# Seconds to wait on the server before giving up. Without a timeout a stalled
# connection would block the crawl indefinitely.
REQUEST_TIMEOUT = 30

# Collect straight into a set so repeated links are dropped as they arrive.
all_links = set()
# A single session is shared by all page requests so the connection is reused.
with requests.Session() as session:
    # NOTE(review): the page range 1-179 is hard-coded — confirm it still
    # matches the number of listing pages on the site.
    for page in trange(1, 180):
        # Create url and retrieve data
        url = f'{persons_base_url}?page={page}'
        response = session.get(url, timeout=REQUEST_TIMEOUT)

        # Anything other than 200 aborts the crawl with the status code
        if response.status_code != 200:
            raise Exception(response.status_code)

        # Create soup
        soup = BeautifulSoup(response.content, 'html.parser')
        # Keep only anchors whose href starts with '/person/'
        found_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith('/person/')]
        all_links.update(found_links)

In [15]:
# Save all links, one per line. They are written in sorted order because
# iterating a set of strings has no stable order between runs, which would
# reshuffle this file (and every step that reads it) each time.
with open('out/all_links.txt', 'w') as file:
    file.writelines(f"{link}\n" for link in sorted(all_links))

In [2]:
# Load the saved links back: one entry per line, surrounding whitespace removed
with open('out/all_links.txt', 'r') as links_file:
    all_links = list(map(str.strip, links_file))

In [3]:
# The person id is whatever follows the last '/' in each link
person_ids = [href.rsplit('/', 1)[-1] for href in all_links]

In [4]:
rdf_base_url = 'https://cau.gelehrtenverzeichnis.de/data/about'
namespaces = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
              'j.0': 'http://gelehrtenverzeichnis.de/',
              'owl': 'http://www.w3.org/2002/07/owl#',
              'rdfs': 'http://www.w3.org/2000/01/rdf-schema#'}
# Fully qualified ({namespace}name) form of the rdf:resource attribute, as
# required by Element.get(); built once instead of repeating the literal.
RDF_RESOURCE = f"{{{namespaces['rdf']}}}resource"
# Seconds to wait on the server before giving up. Without a timeout a stalled
# connection would block the scrape indefinitely.
REQUEST_TIMEOUT = 30

# One record per person: id, names, and a list of their qualification papers.
data = []
# A single session is shared by all requests so the connection is reused.
with requests.Session() as session:
    # `person_id` (not `id`) so the builtin id() is not shadowed in the notebook.
    for person_id in tqdm(person_ids):
        # Get persons data
        rdf_url = f'{rdf_base_url}/{person_id}'
        response = session.get(rdf_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            raise Exception(response.status_code)

        # Parse the XML content
        # get_text comes from the project's utils module; presumably it returns
        # the text of the first element matching the path — verify there.
        root = etree.fromstring(response.content)
        family_name = get_text(root, './/j.0:familyName', namespaces)
        given_name = get_text(root, './/j.0:givenName', namespaces)

        # Get qualification papers info: each element's rdf:resource attribute
        # is used as the URL of the paper's own RDF document.
        qualification_papers = root.findall('.//j.0:qualificationPaper', namespaces)
        qualification_paper_urls = [qp.get(RDF_RESOURCE) for qp in qualification_papers]

        qp_data = []
        for qp_url in qualification_paper_urls:
            qp_response = session.get(qp_url, timeout=REQUEST_TIMEOUT)
            if qp_response.status_code != 200:
                # Report the status of the paper request itself; the person
                # request above has already been checked and is always 200 here.
                raise Exception(qp_response.status_code)
            # Parse
            qp_root = etree.fromstring(qp_response.content)
            title = get_text(qp_root, './/j.0:title', namespaces)
            subject = get_text(qp_root, './/j.0:subject', namespaces)

            # Collect every rdf:type resource URI and let the project helper
            # determine_qp_type (utils module) pick the paper type from them.
            qp_types = qp_root.findall('.//rdf:type', namespaces)
            qp_types_list = [type_el.get(RDF_RESOURCE) for type_el in qp_types]
            qp_type = determine_qp_type(qp_types_list)

            qp_data.append({'title': title,
                            'subject': subject,
                            'type': qp_type})

        data.append({'id': person_id,
                     'family_name': family_name,
                     'given_name': given_name,
                     'qualification_papers': qp_data})
       

  0%|          | 0/1071 [00:00<?, ?it/s]

In [5]:
# Write the collected records to disk as UTF-8 JSON, indented by four spaces
# and with non-ASCII characters left unescaped
with open('out/data.json', mode='w', encoding='utf-8') as out_file:
    json.dump(data, out_file, indent=4, ensure_ascii=False)

In [6]:
# Load the records back from the JSON file
with open('out/data.json', encoding='utf-8') as in_file:
    data = json.load(in_file)

# Flatten to one row per qualification paper, carrying the person's id and
# names along on every row
df = pd.json_normalize(
    data,
    record_path='qualification_papers',
    meta=['id', 'family_name', 'given_name'],
)

# Write the table to CSV without the index column
df.to_csv('out/data.csv', index=False)