# Works on African History

If SPARQLWrapper, pyvis and networkx have already been installed, you can skip the installation by placing a hash (`#`) in front of each install command below.

In [None]:
!pip install SPARQLWrapper
!pip install pyvis
!pip install networkx

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
from IPython.core.display import display, HTML
import pandas as pd
import os
from collections import Counter
import plotly.express as px
import requests
from os.path import basename

# Download the helper module that provides make_network() and store it next to
# the notebook so that it can be imported further down.
# NOTE(review): this fetches code from the tip of a branch and executes it later
# on import; pin the URL to a commit hash if reproducibility matters.
url = 'https://raw.githubusercontent.com/peterverhaar/stcn-sparql/refs/heads/main/social_network_analysis.py'
response = requests.get(url, timeout=30)
# Fail here on an HTTP error: otherwise the error page would be written to the
# .py file and only surface later as a confusing error on import.
response.raise_for_status()
with open(basename(url),'w',encoding='utf-8') as out:
    out.write(response.text)

import re
import ssl

# SECURITY NOTE(review): the block below replaces the ssl module's default
# HTTPS context factory with the *unverified* one. From this point on, HTTPS
# connections that rely on that default are made without certificate
# verification. Presumably added to work around certificate errors on some
# machines -- confirm it is still needed and remove it if it is not.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This Python build does not offer an unverified context; keep the default.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

    
# SPARQL endpoint and the shared client object that run_query() uses.
endpoint = 'https://data.bibliotheken.nl/sparql'
sparql = SPARQLWrapper(endpoint)

def run_query(sparql_query):
    """Execute a SPARQL query and return its result bindings as a DataFrame.

    The query is sent through the notebook-level `sparql` client, requesting
    JSON. The list found under results -> bindings is flattened with
    `pd.json_normalize`, which yields dotted column names such as
    'book.value'.

    Parameters
    ----------
    sparql_query : str
        The complete SPARQL query text.

    Returns
    -------
    pandas.DataFrame
        The flattened result bindings.
    """
    sparql.setReturnFormat(JSON)
    sparql.setQuery(sparql_query)
    bindings = sparql.query().convert()['results']['bindings']
    return pd.json_normalize(bindings)


def count_items(identifier,name,data=None):
    """Count, per label, the number of distinct books an item occurs with.

    Parameters
    ----------
    identifier : str
        Column holding the identifier of the item (for example
        'author_info.value').
    name : str
        Column holding the label reported for that identifier. The label of
        the first row in which the identifier occurs is used.
    data : pandas.DataFrame, optional
        Query result to analyse; it must contain a 'book.value' column.
        When omitted, the notebook-level `df` is used, i.e. whatever query
        result was assigned to `df` most recently.

    Returns
    -------
    collections.Counter
        Maps each label to the number of distinct books it is attached to.
        Rows in which the identifier is missing are tallied too, under the
        (missing) label of those rows, so callers should skip keys for which
        `pd.isna` is true.
    """
    frame = df if data is None else data

    # One stand-in key for every missing identifier, so that the lookup does
    # not depend on all NaN values being the very same object.
    missing = object()

    item_names = dict()      # identifier -> first label seen for it
    items_per_book = dict()  # book -> identifiers, in order of first appearance

    for book, item, label in zip(frame['book.value'], frame[identifier], frame[name]):
        key = missing if pd.isna(item) else item
        if item_names.get(key) is None:
            item_names[key] = label
        if pd.isna(book):
            continue
        # A dict is used as an ordered set: an item is counted once per book.
        items_per_book.setdefault(book, dict())[key] = None

    items = Counter()
    for book_items in items_per_book.values():
        items.update(item_names[key] for key in book_items)

    return items

This notebook searches for titles in the STCN about the history of Africa. The identifier of this topic is &lt; http://data.bibliotheken.nl/id/thes/p15544669X &gt;

Of course you can also search using other topics. The identifier of "Astronomy", for example, is   
&lt; http://data.bibliotheken.nl/id/thes/p155446002 &gt;

In [None]:
# Query: every record that belongs to the STCN dataset and has the thesaurus
# term p15544669X as its schema:about subject, together with its title and
# publication node. The start date of the publication is OPTIONAL, so records
# without one are still returned (without a ?publ_year binding).
# To search for another topic, replace the schema:about URI.
sparql_query = """

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <http://schema.org/>
PREFIX kb: <http://data.bibliotheken.nl/def#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT *

WHERE {

?book schema:mainEntityOfPage ?mainEntity .
?mainEntity schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn>  . 

?book schema:about <http://data.bibliotheken.nl/id/thes/p15544669X> .
?book schema:publication ?publ .
?book schema:name ?title .

OPTIONAL {
?publ schema:startDate ?publ_year . 
}
 

} 

"""

# `df` is the notebook-wide "current result"; the cells below read it, and
# later query cells overwrite it.
df = run_query(sparql_query)

In [None]:
# Headline: the number of distinct book URIs in the current result.
print(f'{df["book.value"].unique().shape[0]} books in total.\n\n')

# A book can span several result rows; keep only the first row of each book
# and number the books in that order.
first_rows = df.drop_duplicates('book.value')

for nr,(_,book) in enumerate(first_rows.iterrows(), start=1):
    print(f'{nr}.\n{book["book.value"]}')
    print(f"{book['title.value']}\n{book['publ_year.value']}\n")

## Number of books per year

In [None]:
years = Counter()
invalid = []

# Look at one row per book, so a book that spans several result rows is
# counted only once.
for value in df.drop_duplicates('book.value')['publ_year.value']:
    text = str(value).strip()
    # fullmatch rather than search: a value that merely *contains* four digits
    # (for example a date range) would otherwise reach int() and raise.
    if re.fullmatch(r'\d{4}',text):
        years[int(text)] += 1
    else:
        invalid.append(value)

if len(invalid)>0:
    print("Invalid years:")
    # The year is OPTIONAL in the query, so missing values (NaN) can end up
    # here; convert everything to text before joining.
    print(','.join(sorted({str(value) for value in invalid})))

if years:
    # Every year between the first and the last one, including years without
    # any title (a Counter returns 0 for those).
    year_range = range(min(years),max(years)+1)
    x_axis = list(year_range)
    y_axis = [years[year] for year in year_range]
else:
    # No usable year at all: produce an empty timeline instead of failing.
    x_axis = []
    y_axis = []

timeline = pd.DataFrame( {'year':x_axis,'number_of_titles':y_axis} )

In [None]:

# Line chart of the number of titles per year. The title and the axis labels
# make the figure readable on its own.
fig = px.line(timeline,
              x='year', 
              y='number_of_titles',
              title='Number of titles per year',
              labels={'year':'Year', 'number_of_titles':'Number of titles'},
              width=600, 
              height=400)

fig.show()

## Authors

In [None]:
# Query: STCN records about thesaurus term p15544669X with, where available,
# their authors. The author pattern is OPTIONAL, so records without an author
# are returned as well; a record with several authors yields several rows.
query = '''

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <http://schema.org/>
PREFIX kb: <http://data.bibliotheken.nl/def#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT * {

?book schema:mainEntityOfPage ?mainEntity .
?mainEntity schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn>  . 

?book schema:about <http://data.bibliotheken.nl/id/thes/p15544669X> .

OPTIONAL {
?book schema:author ?author_node . 
?author_node schema:author ?author_info .
?author_info  rdfs:label ?author_name . }
}

'''
# Overwrites the notebook-wide `df` with the author result.
df = run_query(query)

In [None]:
# Number of distinct books per author, based on the author query result in `df`.
authors = count_items('author_info.value','author_name.value')

x_axis = []
y_axis = []

# An author needs at least this many titles to be shown in the chart.
minimum_titles = 3

for name,count in authors.most_common():
    # Rows without an author come back with a missing (NaN) name; skip those,
    # and skip authors below the threshold.
    if pd.isna(name) or count < minimum_titles:
        continue

    # Keep only the part of the label before the first '(' and drop the space
    # that would otherwise be left dangling at the end.
    name = str(name).split('(',1)[0].rstrip()

    x_axis.append(name)
    y_axis.append(count)

df_barchart = pd.DataFrame({'Name':x_axis,'Number of titles':y_axis})


# Horizontal bars: names on the y axis, counts on the x axis.
fig = px.bar(df_barchart , 
             y = 'Name' , 
             x='Number of titles',
                height=1000)

# Reverse the y axis so that the author with the most titles is at the top.
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()


## Publishers

In [None]:
# Query: STCN records about thesaurus term p15544669X with their publication
# node and, where available, the publisher: its name and the locality of its
# address. The whole publisher pattern is OPTIONAL, so it only binds when the
# name and the place are both present.
query = '''

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <http://schema.org/>
PREFIX kb: <http://data.bibliotheken.nl/def#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT * {

?book schema:mainEntityOfPage ?mainEntity .
?mainEntity schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn>  . 

?book schema:about <http://data.bibliotheken.nl/id/thes/p15544669X> .

?book schema:publication ?publ .

OPTIONAL {
?publ schema:publishedBy ?publisher . 
?publisher schema:name ?publ_name .
?publisher schema:location ?location_node .
?location_node schema:address ?address .
?address schema:addressLocality ?place . 
}

}

'''
# Overwrites the notebook-wide `df`; both the Publishers and the Cities cells
# below read this result.
df = run_query(query)

In [None]:
# Number of distinct books per publisher, based on the publisher query result in `df`.
publishers = count_items('publisher.value','publ_name.value') 

x_axis = []
y_axis = []

# A publisher needs at least this many titles to be shown in the chart.
minimum_titles = 2

for publisher,count in publishers.most_common():
    # Skip rows without a publisher (NaN name) and names containing "s.n.".
    if pd.isna(publisher) or re.search(r's[.]n[.]',publisher):
        continue
    if count >= minimum_titles:
        x_axis.append(publisher)
        y_axis.append(count)

x_label = 'Publisher'
y_label = 'Number of titles'

df_barchart = pd.DataFrame({x_label:x_axis,y_label:y_axis})


# Horizontal bars: publisher names on the y axis, counts on the x axis.
fig = px.bar(df_barchart , 
             y = x_label , 
             x= y_label,
            height = 4000)

# Reverse the y axis so that the publisher with the most titles is at the top.
fig['layout']['yaxis']['autorange'] = "reversed"
fig.show()

## Cities

In [None]:
# Number of distinct books per place of publication. This reads the `df`
# produced by the publisher query, which holds the 'place.value' column.
cities = count_items('place.value','place.value')   

max_characters = 30
x_label = 'City'
y_label = 'Number of titles'

# Leave out the entry for rows without a place (NaN), keeping the
# most-common-first order.
known_places = [(city,count) for city,count in cities.most_common() if not(pd.isna(city))]

# Long place names are cut off after `max_characters` characters.
x_axis = [city[:max_characters] + ' ...' if len(city)>max_characters else city
          for city,_ in known_places]
y_axis = [count for _,count in known_places]

df_barchart = pd.DataFrame({x_label:x_axis,y_label:y_axis})

# Horizontal bars: place names on the y axis, counts on the x axis.
fig = px.bar(df_barchart,
             x = y_label,
             y = x_label,
             height = 1500)

# Reverse the y axis so that the place with the most titles is at the top.
fig.update_layout(yaxis_autorange="reversed")
fig.show()

## Social Network Analysis

In [None]:
# Query: STCN records about thesaurus term p15544669X with, where available,
# their publisher (URI and name) and their authors (URI and label). The two
# patterns are separate OPTIONAL blocks, so a record is returned even when
# only one of them, or neither, is present.
query = '''

PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX schema: <http://schema.org/>
PREFIX kb: <http://data.bibliotheken.nl/def#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

SELECT * {

?book schema:mainEntityOfPage ?mainEntity .
?mainEntity schema:isPartOf <http://data.bibliotheken.nl/id/dataset/stcn>  . 

?book schema:about <http://data.bibliotheken.nl/id/thes/p15544669X> .

?book schema:publication ?publ .

OPTIONAL {
?publ schema:publishedBy ?publisher . 
?publisher schema:name ?publ_name .
}

OPTIONAL {
?book schema:author ?author_node . 
?author_node schema:author ?author_info .
?author_info  rdfs:label ?author_name . }

}

'''
# Overwrites the notebook-wide `df` with the combined publisher/author result.
df = run_query(query)

In [None]:
# --- Nodes: node id -> (label, node type) -----------------------------------
# The id is the last path segment of the publisher/author URI. Commas are
# removed from the labels before they are written to the CSV file.
names = dict()
for _,row in df.iterrows():
    publ_id = row['publisher.value']
    if not(pd.isna(publ_id)):
        publ_name = row['publ_name.value'].replace(',','')
        names[os.path.basename(publ_id)] = (publ_name,'Publisher')
    author_id = row['author_info.value']
    if not(pd.isna(author_id)):
        author_name = row['author_name.value'].replace(',','')
        names[os.path.basename(author_id)] = (author_name,'Author')

# --- Edges: one (publisher, author) pair per book they share ----------------
# groupby visits each book once instead of filtering the whole frame per book;
# sort=False keeps the books in order of first appearance.
edges = []
for _,book_df in df.groupby('book.value',sort=False):
    book_publishers = book_df['publisher.value'].dropna().unique()
    book_authors = book_df['author_info.value'].dropna().unique()

    for publisher in book_publishers:
        for author in book_authors:
            edges.append( (os.path.basename(publisher),os.path.basename(author)))

# --- Output files ------------------------------------------------------------
# `with` closes the files even when writing fails half-way.
with open("nodes.csv",'w',encoding='utf-8') as out:
    out.write('Id,Label,Type\n')
    for node_id,(label,node_type) in names.items():
        out.write(f'{node_id},"{label}","{node_type}"\n')

with open("edges.csv",'w',encoding='utf-8') as out:
    out.write('Source,Target,Type\n')
    for source,target in edges:
        out.write(f'{source},{target},"Undirected"\n')

In [None]:
# Import only the function this cell needs. A wildcard import would silently
# overwrite any notebook variable that shares a name with something defined
# in the helper module.
from social_network_analysis import make_network

# Appearance settings that are passed on to make_network().
color_node_type1 = '#cc270a'
color_node_type2 = '#007788' 
background_color = '#f5f6f7'
font_size = 70
min_node_size = 30 
max_node_size = 200


# make_network() comes from the downloaded helper module; presumably it reads
# the nodes.csv and edges.csv files written above -- confirm in the helper.
net = make_network(
    color_node_type1 = color_node_type1,
    color_node_type2 = color_node_type2,
    background_color = background_color,
    font_size = font_size,
    min_node_size = min_node_size,
    max_node_size = max_node_size,
)

net.save_graph('network.html')
net.show('network.html')