In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Clean data

In [65]:
loaded_df = pd.read_csv('leiden_publications.csv')
loaded_df["id"] = range(len(loaded_df))
loaded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124064 entries, 0 to 124063
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   identifier   124064 non-null  object
 1   datestamp    124064 non-null  object
 2   creator      122941 non-null  object
 3   title        123682 non-null  object
 4   language     118846 non-null  object
 5   date         123684 non-null  object
 6   type         123681 non-null  object
 7   subject      46996 non-null   object
 8   source       99936 non-null   object
 9   description  116044 non-null  object
 10  license      81863 non-null   object
 11  format       85082 non-null   object
 12  contributor  15350 non-null   object
 13  publisher    8362 non-null    object
 14  alternative  132 non-null     object
 15  relation     1620 non-null    object
 16  id           124064 non-null  int64 
dtypes: int64(1), object(16)
memory usage: 16.1+ MB


In [66]:
# TODO:
# - [x] Identifier column
# - [x] datestamp column 
# - [x] source column
# - [x] language column
# - [x] title column
# - [x] description column
# - [ ] type column
loaded_df = pd.read_csv('leiden_publications.csv')
loaded_df["id"] = range(len(loaded_df))
loaded_df.info()
df = loaded_df.copy()

# parse the identifier colum into new columns: ['oai:scholarlypublications.universiteitleiden.nl:item_4172863', 'doi:10.3390/ijerph21121561', 'lucris-id:\\xa01338718282', 'https://hdl.handle.net/1887/4172863']
# - oai
df['oai'] = df['identifier'].str.extract(r'(oai:\S+)')
# - doi
df['doi'] = df['identifier'].str.extract(r'(doi:\S+)')
# - lucris-id
df['lucris-id'] = df['identifier'].str.extract(r'(lucris-id:\S+)')
# - url
df['url'] = df['identifier'].str.extract(r'(https:\S+)')
# - remove the identifier column
df = df.drop(columns=['identifier'])

# Date column is in the form of: ['2024', '2026-02-11']
# - parse the date column into a new column. Get the second date (2026-02-11) if it exists, otherwise get the first date (2024)
# - convert the date column to a datetime object
df['date'] = df['date'].str.extract(r'(\d{4}-\d{2}-\d{2})')
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Type column is in the form of: ['Article in monograph or in proceedings', 'info:eu-repo/semantics/article', 'Text']
# - parse the type column into a new column with the info:eu-repo/semantics/article value. Extract only the article part

df['type'] = df['type'].str.extract(r'(info:eu-repo/semantics/\S+)')
# - remove the info:eu-repo/semantics/ part
df['type'] = df['type'].str.replace("info:eu-repo/semantics/", "").str.replace("',", "")
# - remove the type column

df = df[['oai', 'doi', 'lucris-id', 'url', 'date', 'source', 'language', 'title', 'description', "type", "id"]]

# remove ", ', , and \n from the description column
df['description'] = df['description'].str.replace("'", '').str.replace('"', '').str.replace(',', '').str.replace('\n', '')
df["title"] = df["title"].str.replace("'", '').str.replace('"', '').str.replace(',', '').str.replace('\n', '')

df["id"] = df["oai"].str.extract(r'item_(\d+)').astype(int)

# # save cleaned data to a new parquet file
df.to_parquet('./data/leiden_publications_cleaned.parquet')
df.info()

# "data/Faculty_of_law.csv"
# "data/Faculty_of_LUMC.csv"
# "data/Faculty_of_Science.csv"
# "data/Faculty_of_Social_and_Behavioural_Sciences.csv"
# "data/leiden_Faculty_of_Archaeology.csv"
# "data/leiden_Faculty_of_Humanities.csv"
# "data/leiden_governance.csv"

# Load faculty data
faculty_files = [
    "data/Faculty_of_law.csv",
    "data/Faculty_of_LUMC.csv",
    "data/Faculty_of_Science.csv",
    "data/Faculty_of_Social_and_Behavioural_Sciences.csv",
    "data/leiden_Faculty_of_Archaeology.csv",
    "data/leiden_Faculty_of_Humanities.csv",
    "data/leiden_governance.csv"
]

faculty_names = [
    "Faculty of Law",
    "LUMC",
    "Faculty of Science",
    "Faculty of Social and Behavioural Sciences",
    "Faculty of Archaeology",
    "Faculty of Humanities",
    "Governance and Global Affairs"
]

# ...existing code...


# create a faculty DataFrame with the faculty name and the id column
faculty_dfs = []
for faculty_file, faculty_name in zip(faculty_files, faculty_names):
    faculty_df = pd.read_csv(faculty_file)
    faculty_df['faculty'] = faculty_name
    faculty_dfs.append(faculty_df[['faculty', "id", "title"]])
# Merge all faculty DataFrames into one
faculty_df = pd.concat(faculty_dfs, ignore_index=True)

faculty_df.info()

faculty_df = faculty_df.dropna(subset=['id'])

# First merge on title
faculty_df_unique_title = faculty_df.drop_duplicates(subset=['title'])
df = df.merge(faculty_df_unique_title[['title', 'faculty']], on='title', how='left')

# Then merge on id for rows that didn't get a faculty assignment
faculty_df_unique_id = faculty_df.drop_duplicates(subset=['id'])
mask = df['faculty'].isna()
df_missing = df[mask].drop(columns=['faculty']).merge(
    faculty_df_unique_id[['id', 'faculty']], 
    on='id', 
    how='left'
)
df.loc[mask, 'faculty'] = df_missing['faculty'].values

# ...existing code...

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124064 entries, 0 to 124063
Data columns (total 17 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   identifier   124064 non-null  object
 1   datestamp    124064 non-null  object
 2   creator      122941 non-null  object
 3   title        123682 non-null  object
 4   language     118846 non-null  object
 5   date         123684 non-null  object
 6   type         123681 non-null  object
 7   subject      46996 non-null   object
 8   source       99936 non-null   object
 9   description  116044 non-null  object
 10  license      81863 non-null   object
 11  format       85082 non-null   object
 12  contributor  15350 non-null   object
 13  publisher    8362 non-null    object
 14  alternative  132 non-null     object
 15  relation     1620 non-null    object
 16  id           124064 non-null  int64 
dtypes: int64(1), object(16)
memory usage: 16.1+ MB
<class 'pandas.core.frame.DataFrame'>
R

In [67]:
df.to_parquet('./data/leiden_publications_cleaned.parquet')

In [68]:
df.head()

Unnamed: 0,oai,doi,lucris-id,url,date,source,language,title,description,type,id,faculty
0,oai:scholarlypublications.universiteitleiden.n...,"doi:10.3390/ijerph21121561',","lucris-id:\xa01338718282',",https://hdl.handle.net/1887/4172863'],2026-02-11,International Journal of Environmental Researc...,en,An organizational-level workplace intervention...,Health and Well-being,article,4172863,
1,oai:scholarlypublications.universiteitleiden.n...,"doi:10.1186/s13071-024-06239-z',","lucris-id:\xa01303680279',",https://hdl.handle.net/1887/3731020'],2026-02-03,Parasites & Vectors,en,Landscape level associations between birds mos...,Environmental Biology,article,3731020,
2,oai:scholarlypublications.universiteitleiden.n...,"doi:10.1145/3594805.3607131',","lucris-id:\xa01271712022',",https://hdl.handle.net/1887/3718557'],2025-06-08,FOGA '23: proceedings of the 17th ACM/SIGEVO c...,en,General Boolean function benchmark suite,Algorithms and the Foundations of Software tec...,article,3718557,
3,oai:scholarlypublications.universiteitleiden.n...,"doi:10.5281/zenodo.7984135',","lucris-id:\xa01301256640',",https://hdl.handle.net/1887/3720309'],2025-06-05,,en,Informetric methods for studying the diversity...,Merit Expertise and Measurement,other,3720309,
4,oai:scholarlypublications.universiteitleiden.n...,,"lucris-id:\xa01309019857',",https://hdl.handle.net/1887/4094364'],2025-06-01,The Oxford Handbook of Historical Morphology,en,South America,[Horizon 2020(H2020) 818854 Descriptive and Co...,bookPart,4094364,


In [69]:
df = loaded_df.copy()

import ast
def safe_parse_authors(text):
    if not isinstance(text, str):
        return []
    try:
        # If it's already in list format
        if text.startswith('[') and text.endswith(']'):
            return ast.literal_eval(text)
        # If it's a single author without brackets
        else:
            return [text]
    except:
        # If parsing fails, return the original as a single-item list
        return [text]
    
# give a id to every publication


df["creators_parsed"] = df["creator"].apply(safe_parse_authors)
df["subjects_parsed"] = df["subject"].apply(safe_parse_authors)

# now create a seperate dataframe with the authors and the ids of the publications, with every author on a seperate row


In [70]:
authors_df = df.explode('creators_parsed')[['creators_parsed', 'id']]
authors_df = authors_df.rename(columns={'creators_parsed': 'author'})

subjects_df = df.explode('subjects_parsed')[['subjects_parsed', 'id']]
subjects_df = subjects_df.rename(columns={'subjects_parsed': 'subject'})

# replace " with empty string
authors_df['author'] = authors_df['author'].str.replace('"', '')
subjects_df['subject'] = subjects_df['subject'].str.replace('"', '')

# save to csv file
authors_df.to_csv('./data/authors.csv', index=False)
subjects_df.to_csv('./data/subjects.csv', index=False)

In [80]:
import duckdb
con.close()
# delete the database file if it exists
import os
if os.path.exists('data.db'):
    # ask the user if they want to delete the file
    delete = input("The database file already exists. Do you want to delete it? (y/n)")
    if delete == 'y':
        os.remove('data.db')
    else:
        print("Exiting the program.")
        raise KeyboardInterrupt
    

con = duckdb.connect("./data.db")

# load the data into the database
con.execute("CREATE TABLE authors (author VARCHAR, id INTEGER)")
con.execute("CREATE TABLE subjects (subject VARCHAR, id INTEGER)")
#  0   oai          124064 non-null  object        
#  1   doi          71129 non-null   object        
#  2   lucris-id    59686 non-null   object        
#  3   url          123684 non-null  object        
#  4   date         87613 non-null   datetime64[ns]
#  5   source       99936 non-null   object        
#  6   language     118846 non-null  object        
#  7   title        123682 non-null  object        
#  8   description  116044 non-null  object        
#  9   type         123681 non-null  object  
# 10   id           124064 non-null  int64
con.execute("CREATE TABLE publications (oai VARCHAR, doi VARCHAR, lucris_id VARCHAR, url VARCHAR, date DATE, source VARCHAR, language VARCHAR, title VARCHAR, description VARCHAR, type VARCHAR, id INTEGER, faculty VARCHAR)")

con.execute("COPY authors FROM 'data/authors.csv' (HEADER)")
con.execute("COPY subjects FROM 'data/subjects.csv' (HEADER)")
con.execute("COPY publications FROM 'data/leiden_publications_cleaned.parquet'")

con.execute(query = """
PRAGMA create_fts_index('publications', 'id', 'description');
""")
# join authors and subject on id
# query = """
# SELECT authors.author, subjects.subject
# FROM authors
# JOIN subjects
# ON authors.id = subjects.id
# WHERE authors.author = 'Lange, J. de'
# """

# result = con.execute(query)
# df = result.fetchdf()

# select first 10 publications
query = """
SELECT p.id, p.title, p.date, p.source, p.language, p.description, p.type
FROM publications p
"""

result = con.execute(query)
df = result.fetchdf()

con.close()
df.head()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unnamed: 0,id,title,date,source,language,description,type
0,4172863,An organizational-level workplace intervention...,2026-02-11,International Journal of Environmental Researc...,en,Health and Well-being,article
1,3731020,Landscape level associations between birds mos...,2026-02-03,Parasites & Vectors,en,Environmental Biology,article
2,3718557,General Boolean function benchmark suite,2025-06-08,FOGA '23: proceedings of the 17th ACM/SIGEVO c...,en,Algorithms and the Foundations of Software tec...,article
3,3720309,Informetric methods for studying the diversity...,2025-06-05,,en,Merit Expertise and Measurement,other
4,4094364,South America,2025-06-01,The Oxford Handbook of Historical Morphology,en,[Horizon 2020(H2020) 818854 Descriptive and Co...,bookPart


In [81]:
con.close()

In [74]:
df = pd.read_parquet('./data/leiden_publications_cleaned.parquet')
df.head()

Unnamed: 0,oai,doi,lucris-id,url,date,source,language,title,description,type,id,faculty
0,oai:scholarlypublications.universiteitleiden.n...,"doi:10.3390/ijerph21121561',","lucris-id:\xa01338718282',",https://hdl.handle.net/1887/4172863'],2026-02-11,International Journal of Environmental Researc...,en,An organizational-level workplace intervention...,Health and Well-being,article,4172863,
1,oai:scholarlypublications.universiteitleiden.n...,"doi:10.1186/s13071-024-06239-z',","lucris-id:\xa01303680279',",https://hdl.handle.net/1887/3731020'],2026-02-03,Parasites & Vectors,en,Landscape level associations between birds mos...,Environmental Biology,article,3731020,
2,oai:scholarlypublications.universiteitleiden.n...,"doi:10.1145/3594805.3607131',","lucris-id:\xa01271712022',",https://hdl.handle.net/1887/3718557'],2025-06-08,FOGA '23: proceedings of the 17th ACM/SIGEVO c...,en,General Boolean function benchmark suite,Algorithms and the Foundations of Software tec...,article,3718557,
3,oai:scholarlypublications.universiteitleiden.n...,"doi:10.5281/zenodo.7984135',","lucris-id:\xa01301256640',",https://hdl.handle.net/1887/3720309'],2025-06-05,,en,Informetric methods for studying the diversity...,Merit Expertise and Measurement,other,3720309,
4,oai:scholarlypublications.universiteitleiden.n...,,"lucris-id:\xa01309019857',",https://hdl.handle.net/1887/4094364'],2025-06-01,The Oxford Handbook of Historical Morphology,en,South America,[Horizon 2020(H2020) 818854 Descriptive and Co...,bookPart,4094364,


In [72]:
import duckdb
con = duckdb.connect("data.db")

# select the count of publications per type per year
query = """
SELECT type, EXTRACT(YEAR FROM date) AS year, COUNT(*) as count
FROM publications
WHERE year > 2000
GROUP BY type, year;
"""

# select the authors and the publication titles
query = """
SELECT authors.author, publications.title, publications.date, publications.type, publications.url
FROM authors
JOIN publications
ON authors.id = publications.id
WHERE authors.author = 'Lange, J. de'
"""

# get all unique subjects
query = """
SELECT authors.author, publications.title, publications.date, publications.type, publications.url
FROM publications
JOIN authors
ON authors.id = publications.id
WHERE authors.author = 'Lange, J. de'
"""

result = con.execute(query)
df = result.fetchdf()
df.head()

Unnamed: 0,author,title,date,type,url
