## 1 General overview

In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON
import ssl
import pprint as pp

ssl._create_default_https_context = ssl._create_unverified_context

wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

total_query = """
SELECT DISTINCT ?gendername (COUNT(?person) AS ?tot)
WHERE
{
       ?person wdt:P31 wd:Q5 ;
              wdt:P21 ?gender ;
              wdt:P106/wdt:P279* wd:Q1792450. 
       ?gender rdfs:label ?gendername . 
       FILTER (lang(?gendername) = 'en')
}
GROUP BY ?gendername
ORDER BY DESC(?tot)
"""

sparql_wd = SPARQLWrapper(wikidata_endpoint)
sparql_wd.setQuery(total_query)
sparql_wd.setReturnFormat(JSON)
totalResults = sparql_wd.query().convert()

artHistorians = dict()
totalNumList = list()

for result in totalResults["results"]["bindings"]:
    gender = result["gendername"]["value"]
    total = result["tot"]["value"]
    artHistorians[gender] = total
    totalNumList.append(int(total))
    print("There are " + total +" "+ gender + " art historians.")
    
totalNum = sum(totalNumList)
print("There is a total of " + str(totalNum) +" art historians.")

pp.pprint(artHistorians)

There are 10511 male art historians.
There are 4781 female art historians.
There are 2 non-binary art historians.
There is a total of 15294 art historians.
{'female': '4781', 'male': '10511', 'non-binary': '2'}


In [2]:
percList = {'female': [], 'male': [], 'non-binary': []}

for key in artHistorians.keys():
    num = int(artHistorians.get(key))
    quot = num*100
    perc = quot / totalNum
    percList[key] = perc
    
print(percList)


{'female': 31.260625081731398, 'male': 68.72629789459918, 'non-binary': 0.013077023669412841}


In [3]:
from math import pi

import pandas as pd

from bokeh.io import output_notebook, show
from bokeh.palettes import Category20c
from bokeh.plotting import figure
from bokeh.transform import cumsum

chart_colors = ['#e8cdda', '#c4ddda', '#989898']

output_notebook()
x = percList

data = pd.Series(x).reset_index(name='value').rename(columns={'index':'genders'})
data['angle'] = data['value']/data['value'].sum() * 2*pi
data['color'] = chart_colors[:len(x)]

p = figure(plot_height=350, title="Gender overview in percentages", toolbar_location=None,
           tools="hover", tooltips="@value %", x_range=(-0.5, 1.0))

p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", color = "color", legend_field='genders', source=data)

p.axis.axis_label= None
p.axis.visible= False
p.grid.grid_line_color = None

show(p)

In [4]:
genders = ['male','female','non-binary']

totalm = int(artHistorians.get('male'))
totalf = int(artHistorians.get('female'))
totalx = int(artHistorians.get('non-binary'))

color_list = ['#c4ddda','#e8cdda', '#989898']

p = figure(x_range=genders, plot_height=500, title="Gender overview")
p.vbar(x=genders, top=[totalm, totalf, totalx], color=color_list, width=0.6)

p.xgrid.grid_line_color = None
p.y_range.start = 0

show(p)

## 2 Geographical distribution

In [19]:
female_query = """
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT DISTINCT ?countryCode ?countryLabel (COUNT(?person) AS ?totwomen)
WHERE
{
       ?person wdt:P31 wd:Q5 ;
            wdt:P21 wd:Q6581072 ;
            wdt:P106/wdt:P279* wd:Q1792450;
            wdt:P27 ?country.
       ?country wdt:P463 wd:Q458 .
       ?country wdt:P297 ?countryCode.
       SERVICE wikibase:label {bd:serviceParam wikibase:language "en" }
}
GROUP BY ?countryCode ?countryLabel
ORDER BY DESC(?totwomen)
"""

sparql_wd = SPARQLWrapper(wikidata_endpoint)
sparql_wd.setQuery(female_query)
sparql_wd.setReturnFormat(JSON)
femaleResults = sparql_wd.query().convert()

for result in femaleResults["results"]["bindings"]:
    country = result["countryCode"]["value"]
    country_label = result["countryLabel"]["value"]
    totaln = result["totwomen"]["value"]
    print("Female art historians in " + country_label + " (" + country + ")" ": " +totaln)


Female art historians in Germany (DE): 464
Female art historians in France (FR): 199
Female art historians in Spain (ES): 146
Female art historians in Slovenia (SI): 136
Female art historians in Italy (IT): 128
Female art historians in Poland (PL): 121
Female art historians in United Kingdom (GB): 108
Female art historians in Austria (AT): 100
Female art historians in Kingdom of the Netherlands (NL): 73
Female art historians in Czech Republic (CZ): 70
Female art historians in Hungary (HU): 56
Female art historians in Sweden (SE): 54
Female art historians in Denmark (DK): 52
Female art historians in Estonia (EE): 41
Female art historians in Finland (FI): 28
Female art historians in Belgium (BE): 23
Female art historians in Greece (GR): 13
Female art historians in Lithuania (LT): 13
Female art historians in Bulgaria (BG): 12
Female art historians in Romania (RO): 10
Female art historians in Slovakia (SK): 8
Female art historians in Portugal (PT): 7
Female art historians in Croatia (HR): 

In [6]:
male_query = """
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wikibase: <http://wikiba.se/ontology#>
SELECT DISTINCT ?countryCode ?countryLabel (COUNT(?person) AS ?totmen)
WHERE
{
       ?person wdt:P31 wd:Q5 ;
            wdt:P21 wd:Q6581097 ;
            wdt:P106/wdt:P279* wd:Q1792450;
            wdt:P27 ?country.
       ?country wdt:P463 wd:Q458 .
       ?country wdt:P297 ?countryCode.
       SERVICE wikibase:label {bd:serviceParam wikibase:language "en" }
}
GROUP BY ?countryCode ?countryLabel
ORDER BY DESC(?totmen)
"""


sparql_wd = SPARQLWrapper(wikidata_endpoint)
sparql_wd.setQuery(male_query)
sparql_wd.setReturnFormat(JSON)
maleResults = sparql_wd.query().convert()

for result in maleResults["results"]["bindings"]:
    country = result["countryCode"]["value"]
    country_label = result["countryLabel"]["value"]
    totaln = result["totmen"]["value"]
    print("Male art historians in " + country_label + " (" + country + ")" ": " +totaln)


Male art historians in Germany (DE): 1879
Male art historians in France (FR): 767
Male art historians in United Kingdom (GB): 383
Male art historians in Austria (AT): 339
Male art historians in Italy (IT): 332
Male art historians in Poland (PL): 257
Male art historians in Spain (ES): 239
Male art historians in Kingdom of the Netherlands (NL): 192
Male art historians in Hungary (HU): 155
Male art historians in Sweden (SE): 146
Male art historians in Czech Republic (CZ): 126
Male art historians in Belgium (BE): 120
Male art historians in Denmark (DK): 103
Male art historians in Slovenia (SI): 102
Male art historians in Greece (GR): 44
Male art historians in Romania (RO): 44
Male art historians in Finland (FI): 43
Male art historians in Estonia (EE): 37
Male art historians in Bulgaria (BG): 25
Male art historians in Croatia (HR): 19
Male art historians in Lithuania (LT): 17
Male art historians in Latvia (LV): 15
Male art historians in Portugal (PT): 13
Male art historians in Slovakia (SK)

In [20]:
import pprint as pp
countries = dict()
for result in maleResults["results"]["bindings"]:
    country = result["countryLabel"]["value"]
    totaln = result["totmen"]["value"]
    countries[country] = [totaln] #male
for result in femaleResults["results"]["bindings"]:
    country = result["countryLabel"]["value"]
    totaln = result["totwomen"]["value"]
    countries[country].append(totaln) #male

pp.pprint(countries)

{'Austria': ['339', '100'],
 'Belgium': ['120', '23'],
 'Bulgaria': ['25', '12'],
 'Croatia': ['19', '7'],
 'Czech Republic': ['126', '70'],
 'Denmark': ['103', '52'],
 'Estonia': ['37', '41'],
 'Finland': ['43', '28'],
 'France': ['767', '199'],
 'Germany': ['1879', '464'],
 'Greece': ['44', '13'],
 'Hungary': ['155', '56'],
 'Ireland': ['7', '3'],
 'Italy': ['332', '128'],
 'Kingdom of the Netherlands': ['192', '73'],
 'Latvia': ['15', '5'],
 'Lithuania': ['17', '13'],
 'Luxembourg': ['5', '3'],
 'Poland': ['257', '121'],
 'Portugal': ['13', '7'],
 'Romania': ['44', '10'],
 'Slovakia': ['9', '8'],
 'Slovenia': ['102', '136'],
 'Spain': ['239', '146'],
 'Sweden': ['146', '54'],
 'United Kingdom': ['383', '108']}


In [8]:
import json
countries = dict()
for result in maleResults["results"]["bindings"]:
    country = result["countryLabel"]["value"]
    country_code = result["countryCode"]["value"]
    totaln = result["totmen"]["value"]
    countries[country] = {"id": country_code}
    countries[country]["male"] = totaln #male
for result in femaleResults["results"]["bindings"]:
    country = result["countryLabel"]["value"]
    if country not in countries.keys():
        country_code = result["countryCode"]["value"]
        countries[country] = {"id": country_code}
    totaln = result["totwomen"]["value"]
    countries[country]["female"] = totaln #male

pp.pprint(countries)
with open('data.json', 'w') as f:
    json.dump(countries, f, indent=4)

{'Austria': {'female': '100', 'id': 'AT', 'male': '339'},
 'Belgium': {'female': '23', 'id': 'BE', 'male': '120'},
 'Bulgaria': {'female': '12', 'id': 'BG', 'male': '25'},
 'Croatia': {'female': '7', 'id': 'HR', 'male': '19'},
 'Czech Republic': {'female': '70', 'id': 'CZ', 'male': '126'},
 'Denmark': {'female': '52', 'id': 'DK', 'male': '103'},
 'Estonia': {'female': '41', 'id': 'EE', 'male': '37'},
 'Finland': {'female': '28', 'id': 'FI', 'male': '43'},
 'France': {'female': '199', 'id': 'FR', 'male': '767'},
 'Germany': {'female': '464', 'id': 'DE', 'male': '1879'},
 'Greece': {'female': '13', 'id': 'GR', 'male': '44'},
 'Hungary': {'female': '56', 'id': 'HU', 'male': '155'},
 'Ireland': {'female': '3', 'id': 'IE', 'male': '7'},
 'Italy': {'female': '128', 'id': 'IT', 'male': '332'},
 'Kingdom of the Netherlands': {'female': '73', 'id': 'NL', 'male': '192'},
 'Latvia': {'female': '5', 'id': 'LV', 'male': '15'},
 'Lithuania': {'female': '13', 'id': 'LT', 'male': '17'},
 'Luxembourg':

In [21]:
json_list = list()
for country, data in countries.items():
    d = dict()
    d["name"] = country
    d["id"] = data["id"]
    if "male" in data.keys():
        d["male"] = data["male"]
    if "female" in data.keys():
        d["female"] = data["female"]
    json_list.append(d)

pp.pprint(json_list)
with open('data.json', 'w') as f:
    json.dump(json_list, f, indent=4)

TypeError: list indices must be integers or slices, not str

In [18]:
my_countries = list(countries.keys())
males = list()
females = list()
for element in list(countries.values()):
    males.append(element[0])
    females.append(element[1])
    
males = list(map(int, males)) 
females = list(map(int, females))

print(my_countries)
print(males)
print(females)

['Germany', 'France', 'United Kingdom', 'Austria', 'Italy', 'Poland', 'Spain', 'Kingdom of the Netherlands', 'Hungary', 'Sweden', 'Czech Republic', 'Belgium', 'Denmark', 'Slovenia', 'Greece', 'Romania', 'Finland', 'Estonia', 'Bulgaria', 'Croatia', 'Lithuania', 'Latvia', 'Portugal', 'Slovakia', 'Ireland', 'Luxembourg']
[1879, 767, 383, 339, 332, 257, 239, 192, 155, 146, 126, 120, 103, 102, 44, 44, 43, 37, 25, 19, 17, 15, 13, 9, 7, 5]
[464, 199, 108, 100, 128, 121, 146, 73, 56, 54, 70, 23, 52, 136, 13, 10, 28, 41, 12, 7, 13, 5, 7, 8, 3, 3]


In [22]:
gender = ["male", "female"]
colors = ["#c4ddda", "#e8cdda"]

data = {'countries' : my_countries,
        'male'   : males,
        'female' : females}

p = figure(x_range=my_countries,  plot_width=1000, plot_height=500, title="Fruit counts by year",
           toolbar_location=None, tools="hover")

p.vbar_stack(gender, x='countries', width=0.9, color=colors, source=data,
             legend_label=gender)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
p.xaxis.major_label_orientation = "vertical"

show(p)

In [23]:
tot = list()
for element in list(countries.values()):
    tot.append(int(element[1])+int(element[0]))
print(tot)

percentages = dict()

for x in range(26):
    quot1 = males[x] *100
    quot2 = females[x] *100
    total = tot[x]
    percentage1 = quot1/total
    percentage2 = quot2/total
    key = my_countries[x]
    percentages[key] =[round(percentage1), round(percentage2)]
    
pp.pprint(percentages)

male_perc = list()
female_perc = list()

for element in list(percentages.values()):
    male_perc.append(element[0])
    female_perc.append(element[1])
    
print(male_perc)
print(female_perc)

[2343, 966, 491, 439, 460, 378, 385, 265, 211, 200, 196, 143, 155, 238, 57, 54, 71, 78, 37, 26, 30, 20, 20, 17, 10, 8]
{'Austria': [77, 23],
 'Belgium': [84, 16],
 'Bulgaria': [68, 32],
 'Croatia': [73, 27],
 'Czech Republic': [64, 36],
 'Denmark': [66, 34],
 'Estonia': [47, 53],
 'Finland': [61, 39],
 'France': [79, 21],
 'Germany': [80, 20],
 'Greece': [77, 23],
 'Hungary': [73, 27],
 'Ireland': [70, 30],
 'Italy': [72, 28],
 'Kingdom of the Netherlands': [72, 28],
 'Latvia': [75, 25],
 'Lithuania': [57, 43],
 'Luxembourg': [62, 38],
 'Poland': [68, 32],
 'Portugal': [65, 35],
 'Romania': [81, 19],
 'Slovakia': [53, 47],
 'Slovenia': [43, 57],
 'Spain': [62, 38],
 'Sweden': [73, 27],
 'United Kingdom': [78, 22]}
[80, 79, 78, 77, 72, 68, 62, 72, 73, 73, 64, 84, 66, 43, 77, 81, 61, 47, 68, 73, 57, 75, 65, 53, 70, 62]
[20, 21, 22, 23, 28, 32, 38, 28, 27, 27, 36, 16, 34, 57, 23, 19, 39, 53, 32, 27, 43, 25, 35, 47, 30, 38]


In [24]:
gender = ["male", "female"]
colors = ["#c4ddda", "#e8cdda"]

data = {'countries' : my_countries,
        'male'   : male_perc,
        'female' : female_perc}

p = figure(x_range=my_countries,  plot_width=1000, plot_height=500, title="Percentages of women and men in art history per country",
           toolbar_location=None)

p.vbar_stack(gender, x='countries', width=0.7, color=colors, source=data,
             legend_label=gender)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
p.xaxis.major_label_orientation = "vertical"

show(p)

## 3 Occupation

In [25]:
from collections import OrderedDict

male_occupation_query = """
SELECT DISTINCT ?occupationLabel (COUNT(?man) AS ?totman)
WHERE
{
       ?man wdt:P31 wd:Q5 ;
              wdt:P21 wd:Q6581097 ;
              wdt:P106/wdt:P279* wd:Q1792450;
              wdt:P106 ?occupation ;
              wdt:P27 ?country.
       ?country wdt:P463 wd:Q458 .
       SERVICE wikibase:label {bd:serviceParam wikibase:language "en" }
}
GROUP BY ?occupationLabel
ORDER BY DESC(?totman)
LIMIT 11
"""

sparql_wd.setQuery(male_occupation_query)
sparql_wd.setReturnFormat(JSON)
occupationResults = sparql_wd.query().convert()

artOccupations = {'male': {},'female': {}}

for result in occupationResults["results"]["bindings"]:
    occupation = result["occupationLabel"]["value"]
    total = result["totman"]["value"]
    artOccupations["male"][occupation] = int(total)


In [26]:
female_occupation_query = """
SELECT DISTINCT ?occupationLabel (COUNT(?woman) AS ?totwoman)
WHERE
{
       ?woman wdt:P31 wd:Q5 ;
              wdt:P21 wd:Q6581072 ;
              wdt:P106/wdt:P279* wd:Q1792450;
              wdt:P106 ?occupation ;
              wdt:P27 ?country.
       ?country wdt:P463 wd:Q458 .
       SERVICE wikibase:label {bd:serviceParam wikibase:language "en" }
}
GROUP BY ?occupationLabel
ORDER BY DESC(?totwoman)
LIMIT 11
"""

sparql_wd.setQuery(female_occupation_query)
sparql_wd.setReturnFormat(JSON)
occupationResults = sparql_wd.query().convert()

for result in occupationResults["results"]["bindings"]:
    occupation = result["occupationLabel"]["value"]
    total = result["totwoman"]["value"]
    artOccupations["female"][occupation] = int(total)


if 'art historian' in artOccupations["female"]:
    del artOccupations["female"]['art historian']
if 'art historian' in artOccupations["male"]:
    del artOccupations["male"]['art historian']

pp.pprint(artOccupations)


{'female': {'archaeologist': 100,
            'architectural historian': 53,
            'art critic': 73,
            'curator': 115,
            'exhibition curator': 127,
            'historian': 92,
            'journalist': 41,
            'translator': 44,
            'university teacher': 245,
            'writer': 188},
 'male': {'archaeologist': 757,
          'architect': 347,
          'architectural historian': 461,
          'art critic': 278,
          'curator': 256,
          'exhibition curator': 208,
          'historian': 399,
          'painter': 281,
          'university teacher': 1354,
          'writer': 652}}


In [34]:
f_occupation_list = sorted(artOccupations["female"].items(), key=lambda x:x[1],reverse=True)
m_occupation_list = sorted(artOccupations["male"].items(), key=lambda x:x[1],reverse=True)

print("""
Most common occupations for female art historians:
""")
f_occ_list = list()
f_count_list = list()

for occValue in f_occupation_list:
    occ = occValue[0]
    value = str(occValue[1])
    f_occ_list.append(occ)
    f_count_list.append(value)
    print(occ+"("+value+")")
    
f_occ_list.reverse()
f_count_list.reverse()

print(f_occ_list)
print(f_count_list)


print("""
Most common occupations for male art historians:
""")
m_occ_list = list()
m_count_list = list()
for occValue in m_occupation_list:
    occ = occValue[0]
    value = str(occValue[1])
    m_occ_list.append(occ)
    m_count_list.append(value)
    print(occ+"("+value+")")
    
m_occ_list.reverse()
m_count_list.reverse() 

print(m_occ_list)
print(m_count_list)


Most common occupations for female art historians:

university teacher(245)
writer(188)
exhibition curator(127)
curator(115)
archaeologist(100)
historian(92)
art critic(73)
architectural historian(53)
translator(44)
journalist(41)
['journalist', 'translator', 'architectural historian', 'art critic', 'historian', 'archaeologist', 'curator', 'exhibition curator', 'writer', 'university teacher']
['41', '44', '53', '73', '92', '100', '115', '127', '188', '245']

Most common occupations for male art historians:

university teacher(1354)
archaeologist(757)
writer(652)
architectural historian(461)
historian(399)
architect(347)
painter(281)
art critic(278)
curator(256)
exhibition curator(208)
['exhibition curator', 'curator', 'art critic', 'painter', 'architect', 'historian', 'architectural historian', 'writer', 'archaeologist', 'university teacher']
['208', '256', '278', '281', '347', '399', '461', '652', '757', '1354']


In [35]:
from bokeh.embed import components
from bokeh.plotting import output_notebook
from bokeh.models import FactorRange

output_notebook()

occupations = m_occ_list
counts = m_count_list

p = figure(y_range=FactorRange(factors=occupations), plot_height=300, title="Most common occupations for men",
            toolbar_location=None, tools="")

p.hbar(y=occupations, right=counts, height=0.7, fill_color="#c4ddda", line_color="#c4ddda")

show(p)

In [36]:
from bokeh.embed import components
from bokeh.plotting import output_notebook
from bokeh.models import FactorRange

output_notebook()

occupations = f_occ_list
counts = f_count_list

p = figure(y_range=FactorRange(factors=occupations), plot_height=300, title="Most common occupations for men",
            toolbar_location=None, tools="")

p.hbar(y=occupations, right=counts, height=0.7, fill_color="#e8cdda", line_color="#e8cdda")

show(p)

In [None]:
import json

json_out = list()
for occ in f_occupation_list:
    json_out.append({
        'gender': 'female',
        'occupation': occ[0],
        'number': occ[1],
    })
for occ in m_occupation_list:
    json_out.append({
        'gender': 'male',
        'occupation': occ[0],
        'number': occ[1],
    })
with open('alt_occ_data.json', 'w') as f:
    json.dump(json_out, f, indent=4)
    

In [None]:
membership_query = """
SELECT DISTINCT ?institutionLabel ?countryLabel ?genderLabel (count(?historian) as ?count) WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?historian wdt:P31 wd:Q5;
             wdt:P21 ?gender;
             wdt:P106/wdt:P279* wd:Q1792450;
             wdt:P463 ?institution.
  ?institution wdt:P17 ?country
  
}

group by ?institutionLabel ?countryLabel ?genderLabel order by DESC(?count)
"""


sparql_wd.setQuery(membership_query)
sparql_wd.setReturnFormat(JSON)
membership_query_results = sparql_wd.query().convert()

orgs = dict()

for result in membership_query_results["results"]["bindings"]:
    if result["institutionLabel"]["value"] not in orgs.keys():
        orgs[result["institutionLabel"]["value"]] = dict()
        orgs[result["institutionLabel"]["value"]]["country"] = result["countryLabel"]["value"]
    orgs[result["institutionLabel"]["value"]][result["genderLabel"]["value"]] = result["count"]["value"]
    
print(orgs)

In [None]:
import json

lst = ['German Archaeological Institute', 'Académie des Inscriptions et Belles-Lettres',
       'Royal Swedish Academy of Letters, History and Antiquities','Austrian Archaeological Institute', 
       'Real Academia de Bellas Artes de San Fernando', 'British Academy',
       'Royal Netherlands Academy of Arts and Sciences', 'Lincean Academy', 
       'Hungarian Academy of Sciences']

    
json_out = list()
for org in lst:
    json_out.append({
        'institution': org,
        'male': orgs[org]['male'],
        'female': orgs[org]['female'],
        'country': orgs[org]['country']
    })
with open('institution_data.json', 'w') as f:
    json.dump(json_out, f, indent=4)

## 4 Scholarly works

In [38]:
import sys
print(sys.version)

from SPARQLWrapper import SPARQLWrapper, JSON
import ssl

ssl._create_default_https_context = ssl._create_unverified_context


wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"


scholarly_works_query = """
SELECT distinct ?year ?genderLabel (count(?historian) as ?count) WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?historian wdt:P31 wd:Q5;
                    wdt:P21 ?gender;
                    wdt:P106/wdt:P279* wd:Q1792450.
                    
  ?authorial_work wdt:P50 ?historian.
  ?authorial_work wdt:P31/wdt:P279* wd:Q55915575.
  ?authorial_work wdt:P577 ?publication_date.
}

group by (year(xsd:dateTime(?publication_date)) as ?year) ?genderLabel order by (?year)
"""

sparql_wd = SPARQLWrapper(wikidata_endpoint)
sparql_wd.setQuery(scholarly_works_query)
sparql_wd.setReturnFormat(JSON)
scholarly_works_results = sparql_wd.query().convert()

for line in scholarly_works_results["results"]["bindings"]:
    print('Number of scholarly works created by', line['genderLabel']['value'],\
          'art historians in year ', line['year']['value'], ':', line['count']['value'])


3.8.1 (v3.8.1:1b293b6006, Dec 18 2019, 14:08:53) 
[Clang 6.0 (clang-600.0.57)]
Number of scholarly works created by male art historians in year  196 : 2
Number of scholarly works created by male art historians in year  198 : 2
Number of scholarly works created by female art historians in year  198 : 2
Number of scholarly works created by male art historians in year  1829 : 2
Number of scholarly works created by male art historians in year  1855 : 2
Number of scholarly works created by male art historians in year  1856 : 2
Number of scholarly works created by male art historians in year  1857 : 2
Number of scholarly works created by male art historians in year  1859 : 2
Number of scholarly works created by male art historians in year  1860 : 4
Number of scholarly works created by male art historians in year  1861 : 2
Number of scholarly works created by male art historians in year  1864 : 6
Number of scholarly works created by male art historians in year  1865 : 4
Number of scholarly wo

In [39]:
scholarly_works_results = [i for i in scholarly_works_results["results"]["bindings"] if not i["year"]["value"] == "198"]
for line in scholarly_works_results:
    print('Number of scholarly works created by', line['genderLabel']['value'],\
          'art historians in year ', line['year']['value'], ':', line['count']['value'])


Number of scholarly works created by male art historians in year  196 : 2
Number of scholarly works created by male art historians in year  1829 : 2
Number of scholarly works created by male art historians in year  1855 : 2
Number of scholarly works created by male art historians in year  1856 : 2
Number of scholarly works created by male art historians in year  1857 : 2
Number of scholarly works created by male art historians in year  1859 : 2
Number of scholarly works created by male art historians in year  1860 : 4
Number of scholarly works created by male art historians in year  1861 : 2
Number of scholarly works created by male art historians in year  1864 : 6
Number of scholarly works created by male art historians in year  1865 : 4
Number of scholarly works created by male art historians in year  1866 : 4
Number of scholarly works created by male art historians in year  1867 : 8
Number of scholarly works created by male art historians in year  1868 : 2
Number of scholarly works 

Number of scholarly works created by female art historians in year  2015 : 32
Number of scholarly works created by male art historians in year  2015 : 76
Number of scholarly works created by male art historians in year  2016 : 70
Number of scholarly works created by female art historians in year  2016 : 67
Number of scholarly works created by male art historians in year  2017 : 57
Number of scholarly works created by female art historians in year  2017 : 58
Number of scholarly works created by male art historians in year  2018 : 40
Number of scholarly works created by female art historians in year  2018 : 28
Number of scholarly works created by male art historians in year  2019 : 28
Number of scholarly works created by female art historians in year  2019 : 26
Number of scholarly works created by female art historians in year  2020 : 6
Number of scholarly works created by male art historians in year  2020 : 6


In [42]:
gender = ["male", "female"]
colors = ["#c4ddda", "#e8cdda"]
    
scores = [(i, i+19) for i in range(1820, 2040, 20)]

current_range_index = 0
score_data = [{'range': i, 'male': 0, 'female': 0} for i in scores]
for line in scholarly_works_results:
    if int(line['year']['value']) > scores[current_range_index][1]:
        current_range_index += 1
    score_data[current_range_index][line['genderLabel']['value']] += int(line['count']['value'])


data = {'20-year-periods' : [str(i) for i in scores],
        'male'   : [i['male'] for i in score_data],
        'female' : [i['female'] for i in score_data]}

p = figure(x_range=data['20-year-periods'],  plot_width=1000, plot_height=500, title="Scholarly work per 20-year period",
           toolbar_location=None, tools="hover")

p.vbar_stack(gender, x='20-year-periods', width=0.9, color=colors, source=data,
             legend_label=gender)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
p.xaxis.major_label_orientation = "vertical"

show(p)

In [43]:
json_out = list()
for score in score_data:
    json_out.append({
        'range': str(score['range'][0]) + ' - ' + str(score['range'][1]),
        'male': score['male'],
        'female': score['female']
    })
with open('data4.json', 'w') as f:
    json.dump(json_out, f, indent=4)

In [45]:
import numpy as np 

true_f = [i['female'] for i in score_data]
true_m = [i['male'] for i in score_data]
true_len = [f+m for (f, m) in zip(true_f, true_m)]

female_ratio = [np.true_divide(i['female'], l)*100 for (i, l) in zip(score_data, true_len)]
male_ratio = [np.true_divide(i['male'], l)*100 for (i, l) in zip(score_data, true_len)]

data = {'20-year-periods' : [str(i) for i in scores],
        'male'   : male_ratio,
        'female' : female_ratio}

p = figure(x_range=data['20-year-periods'],  plot_width=1000, plot_height=500, title="Scholarly work per 20-year period",
           toolbar_location=None, tools="hover")

p.vbar_stack(gender, x='20-year-periods', width=0.9, color=colors, source=data,
             legend_label=gender)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
p.xaxis.major_label_orientation = "vertical"

show(p)

In [46]:
json_out = list()
for score, male, female in zip(scores, male_ratio, female_ratio):
    json_out.append({
        'range': str(score[0]) + ' - ' + str(score[1]),
        'male': str(round(male, 2)),
        'female': str(round(female, 2))
    })
with open('data5.json', 'w') as f:
    json.dump(json_out, f, indent=4)

In [47]:
ssl._create_default_https_context = ssl._create_unverified_context

wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

birthdate_query = """
SELECT distinct ?year ?genderLabel (count(?historian) as ?count) WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?historian wdt:P31 wd:Q5;
                    wdt:P21 ?gender;
                    wdt:P106/wdt:P279* wd:Q1792450;
             wdt:P569 ?date. FILTER (?date > "1790-01-01"^^xsd:dateTime)
}

group by (year(xsd:dateTime(?date)) as ?year) ?genderLabel order by (?year)
"""

sparql_wd = SPARQLWrapper(wikidata_endpoint)
sparql_wd.setQuery(birthdate_query)
sparql_wd.setReturnFormat(JSON)
birthdate_results = sparql_wd.query().convert()

for line in birthdate_results["results"]["bindings"]:
    print('Number of', line['genderLabel']['value'], 'art historians born in',\
          line['year']['value'], ':', line['count']['value'])

Number of male art historians born in 1790 : 2
Number of male art historians born in 1791 : 4
Number of male art historians born in 1792 : 4
Number of male art historians born in 1793 : 5
Number of female art historians born in 1794 : 3
Number of male art historians born in 1794 : 6
Number of male art historians born in 1795 : 5
Number of male art historians born in 1796 : 6
Number of male art historians born in 1797 : 9
Number of female art historians born in 1797 : 1
Number of male art historians born in 1798 : 7
Number of male art historians born in 1799 : 4
Number of male art historians born in 1800 : 9
Number of male art historians born in 1801 : 9
Number of female art historians born in 1801 : 1
Number of male art historians born in 1802 : 8
Number of male art historians born in 1803 : 6
Number of male art historians born in 1804 : 13
Number of male art historians born in 1805 : 14
Number of female art historians born in 1805 : 1
Number of male art historians born in 1806 : 14
Nu

Number of female art historians born in 1926 : 32
Number of male art historians born in 1927 : 104
Number of female art historians born in 1927 : 48
Number of male art historians born in 1928 : 116
Number of female art historians born in 1928 : 46
Number of male art historians born in 1929 : 122
Number of female art historians born in 1929 : 39
Number of male art historians born in 1930 : 99
Number of female art historians born in 1930 : 39
Number of male art historians born in 1931 : 118
Number of female art historians born in 1931 : 39
Number of male art historians born in 1932 : 84
Number of female art historians born in 1932 : 37
Number of male art historians born in 1933 : 90
Number of female art historians born in 1933 : 29
Number of male art historians born in 1934 : 89
Number of female art historians born in 1934 : 27
Number of male art historians born in 1935 : 89
Number of female art historians born in 1935 : 30
Number of male art historians born in 1936 : 104
Number of femal

In [51]:
graph = figure(title = "Art Historians by Date", y_range=[0, 140], plot_width=800, plot_height=500, toolbar_location=None, tools="hover")  
     
female_births = [int(line['year']['value'])+30 for line in birthdate_results["results"]["bindings"] if line['genderLabel']['value'] == 'female']
female_nums = [line['count']['value'] for line in birthdate_results["results"]["bindings"] if line['genderLabel']['value'] == 'female']
male_births = [int(line['year']['value'])+30 for line in birthdate_results["results"]["bindings"] if line['genderLabel']['value'] == 'male']
male_nums = [line['count']['value'] for line in birthdate_results["results"]["bindings"] if line['genderLabel']['value'] == 'male']

graph.line(female_births, female_nums, color="#e8cdda", legend_label='female')  
graph.line(male_births, male_nums, color="#c4ddda", legend_label='male')  
graph.legend.location = "top_left"
graph.legend.orientation = "horizontal"
graph.xaxis.major_label_orientation = "vertical"
     
show(graph) 

In [52]:
ssl._create_default_https_context = ssl._create_unverified_context

wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

lifespan_query = """
SELECT DISTINCT * WHERE {

{ SELECT distinct (year(xsd:dateTime(?dateOfBirth)) as ?yearOfBirth) (year(xsd:dateTime(?dateOfDeath)) as ?yearOfDeath) ?genderLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?historian wdt:P31 wd:Q5;
             wdt:P21 ?gender;
             wdt:P106/wdt:P279* wd:Q1792450;
             wdt:P569 ?dateOfBirth;
             wdt:P570 ?dateOfDeath. FILTER (?dateOfDeath > "1825-01-01"^^xsd:dateTime)
} } UNION 

{ SELECT distinct (year(xsd:dateTime(?dateOfBirth)) as ?yearOfBirth) ?genderLabel WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
  ?historian wdt:P31 wd:Q5;
                    wdt:P21 ?gender;
                    wdt:P106/wdt:P279* wd:Q1792450;
             wdt:P569 ?dateOfBirth. FILTER (?dateOfBirth > "1930-01-01"^^xsd:dateTime)
} }
}
"""

sparql_wd = SPARQLWrapper(wikidata_endpoint)
sparql_wd.setQuery(lifespan_query)
sparql_wd.setReturnFormat(JSON)
lifespan_results = sparql_wd.query().convert()

scores = [(i, i+19) for i in range(1820, 2040, 20)]
score_data = [{'range': i, 'male': 0, 'female': 0, 'non-binary': 0} for i in scores]

for idx, span in enumerate(scores):
    for historian in lifespan_results["results"]["bindings"]:
        if int(historian['yearOfBirth']['value'])+30 <= span[1] and ('yearOfDeath' not in historian or int(historian['yearOfDeath']['value']) > span[0]):
            score_data[idx][historian['genderLabel']['value']] += 1
            
for line in score_data:
    print('Number of male art historians active in the timespan', line['range'], ':', line['male'])
    print('Number of female art historians active in the timespan', line['range'], ':', line['female'])


Number of male art historians active in the timespan (1820, 1839) : 212
Number of female art historians active in the timespan (1820, 1839) : 6
Number of male art historians active in the timespan (1840, 1859) : 353
Number of female art historians active in the timespan (1840, 1859) : 7
Number of male art historians active in the timespan (1860, 1879) : 560
Number of female art historians active in the timespan (1860, 1879) : 14
Number of male art historians active in the timespan (1880, 1899) : 882
Number of female art historians active in the timespan (1880, 1899) : 44
Number of male art historians active in the timespan (1900, 1919) : 1317
Number of female art historians active in the timespan (1900, 1919) : 124
Number of male art historians active in the timespan (1920, 1939) : 1705
Number of female art historians active in the timespan (1920, 1939) : 363
Number of male art historians active in the timespan (1940, 1959) : 1940
Number of female art historians active in the timespan 

In [53]:
json_out = list()
for score in score_data:
    json_out.append({
        'range': str(score['range'][0]) + ' - ' + str(score['range'][1]),
        'male': score['male'],
        'female': score['female']
    })
with open('data2.json', 'w') as f:
    json.dump(json_out, f, indent=4)

In [55]:
data = {'20-year-periods' : [str(i) for i in scores],
        'male'   : [i['male'] for i in score_data],
        'female' : [i['female'] for i in score_data]}

p = figure(x_range=data['20-year-periods'],  plot_width=800, plot_height=500, title="Active Span of Art Historians",
           toolbar_location=None, tools="hover")

p.vbar_stack(gender, x='20-year-periods', width=0.9, color=colors, source=data,
             legend_label=gender)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
p.xaxis.major_label_orientation = "vertical"

show(p)

In [57]:
true_f = [i['female'] for i in score_data]
true_m = [i['male'] for i in score_data]
true_len = [f+m for (f, m) in zip(true_f, true_m)]

female_ratio = [np.true_divide(i['female'], l)*100 for (i, l) in zip(score_data, true_len)]
male_ratio = [np.true_divide(i['male'], l)*100 for (i, l) in zip(score_data, true_len)]

data = {'20-year-periods' : [str(i) for i in scores],
        'male'   : male_ratio,
        'female' : female_ratio}

p = figure(x_range=data['20-year-periods'],  plot_width=800, plot_height=500, title="Active Span of Art Historians",
           toolbar_location=None, tools="hover")

p.vbar_stack(gender, x='20-year-periods', width=0.9, color=colors, source=data,
             legend_label=gender)

p.y_range.start = 0
p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.axis.minor_tick_line_color = None
p.outline_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"
p.xaxis.major_label_orientation = "vertical"

show(p)

In [58]:
json_out = list()
for score, male, female in zip(scores, male_ratio, female_ratio):
    json_out.append({
        'range': str(score[0]) + ' - ' + str(score[1]),
        'male': str(round(male, 2)),
        'female': str(round(female, 2))
    })
with open('data3.json', 'w') as f:
    json.dump(json_out, f, indent=4)

## 5 ARTchives integration

In [None]:
import pprint as pp
import csv
from pathlib import Path

from SPARQLWrapper import SPARQLWrapper, JSON
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
from rdflib import Namespace , Literal , URIRef
from rdflib.namespace import RDF , RDFS


artchives_endpoint = "http://artchives.fondazionezeri.unibo.it/sparql"


count_individuals_by_class_query = """
SELECT ?class (COUNT(?individual) AS ?tot)
WHERE { ?individual a ?class .}
GROUP BY ?class ?tot
"""

sparql_wd = SPARQLWrapper(artchives_endpoint)
sparql_wd.setQuery(count_individuals_by_class_query)
sparql_wd.setReturnFormat(JSON)
results = sparql_wd.query().convert()

artchives_tot = dict()
for result in results["results"]["bindings"]:
    artc_class = result["class"]["value"]
    total = result["tot"]["value"]
    artchives_tot[artc_class] = total
    
pp.pprint(artchives_tot)   

In [None]:
artc_query = """
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX art:<https://w3id.org/artchives/>
SELECT ?person ?collector
WHERE {
?collection art:hasSubjectPeople ?person .
?collector a wd:Q5 .
}
"""

sparql_wd = SPARQLWrapper(artchives_endpoint)
sparql_wd.setQuery(artc_query)
sparql_wd.setReturnFormat(JSON)
results = sparql_wd.query().convert()

subjects = list()
collectors = list()

for result in results["results"]["bindings"]:
    person_code = result["person"]["value"]
    collector_code = result["collector"]["value"]
    subjects.append(person_code)
    collectors.append(collector_code)
    
pp.pprint(subjects)
pp.pprint(collectors)


In [None]:
wiki_subjects = set()
wiki_collectors = set()

for person in subjects:
    if "wikidata.org/entity/" in person:
                uri = "<"+ person + ">"
                wiki_subjects.add(uri)
for person in collectors:
    if "wikidata.org/entity/" in person:
                uri = "<"+ person + ">"
                wiki_collectors.add(uri)
                

pp.pprint("There are " + str(len(wiki_collectors)) + " collectors in artchives.")                
pp.pprint(wiki_subjects)
pp.pprint(wiki_collectors)

In [None]:
wd = Namespace("http://www.wikidata.org/entity/") 
wdt = Namespace("http://www.wikidata.org/prop/direct/")

historians = ' '.join(wiki_subjects)

wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

wikidata_query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT DISTINCT ?historian ?historianname ?gendername
WHERE {
    VALUES ?historian {"""+historians+"""} . 
    ?historian wdt:P21 ?gender . 
    ?historian rdfs:label ?historianname .
    ?gender rdfs:label ?gendername .
    FILTER (langMatches(lang(?gendername), "EN"))
    FILTER (langMatches(lang(?historianname), "EN"))
    
    } 
"""

sparql_wd = SPARQLWrapper(wikidata_endpoint)
sparql_wd.setQuery(wikidata_query)
sparql_wd.setReturnFormat(JSON)
gender_results = sparql_wd.query().convert()

womenuris = set()
subjectsDict = dict()
femalehistoriansDict = dict()

for result in gender_results["results"]["bindings"]:
    historian = result["historian"]["value"]
    gender = result["gendername"]["value"]
    name = result["historianname"]["value"]
    subjectsDict[historian] = (name,gender)
    if gender == "female":
        womenuris.add("<" + historian + ">")
        femalehistoriansDict[historian] = (name,gender)

        
        
    
pp.pprint(subjectsDict)

In [None]:
wd = Namespace("http://www.wikidata.org/entity/") 
wdt = Namespace("http://www.wikidata.org/prop/direct/")

collectors = ' '.join(wiki_collectors)

wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

collectors_query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT DISTINCT ?collector ?collectorname ?gendername
WHERE {
    VALUES ?collector {"""+collectors+"""} . 
    ?collector wdt:P21 ?gender . 
    ?collector rdfs:label ?collectorname .
    ?gender rdfs:label ?gendername .
    FILTER (langMatches(lang(?gendername), "EN"))
    FILTER (langMatches(lang(?collectorname), "EN"))
    
    } 
"""

sparql_coll = SPARQLWrapper(wikidata_endpoint)
sparql_coll.setQuery(collectors_query)
sparql_coll.setReturnFormat(JSON)
gender_c_results = sparql_coll.query().convert()

female_collectors_uris = set()
collectorsDict = dict()
femalecollectorsDict = dict()

for result in gender_c_results["results"]["bindings"]:
    collector = result["collector"]["value"]
    gender = result["gendername"]["value"]
    name = result["collectorname"]["value"]
    collectorsDict[collector] = (name,gender)
    if gender == "female":
        female_collectors_uris.add("<" + collector + ">")
        femalecollectorsDict[collector] = (name,gender)

        
        
    
pp.pprint(collectorsDict)
pp.pprint(len(collectorsDict))

In [None]:
pp.pprint(femalehistoriansDict)
pp.pprint(femalecollectorsDict)
print("There are "+str(len(femalecollectorsDict))+ " female collectors inside ARTchives.")

In [None]:
womenurisList = list(womenuris)
pp.pprint(womenurisList)

women_collectors_urisList = list(female_collectors_uris)
pp.pprint(women_collectors_urisList)

women_in_artchives = womenurisList + women_collectors_urisList
pp.pprint(women_in_artchives)
pp.pprint(len(women_in_artchives))

In [None]:
wd = Namespace("http://www.wikidata.org/entity/") 
wdt = Namespace("http://www.wikidata.org/prop/direct/")

women = ' '.join(women_in_artchives)

wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

women_query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?woman ?name ?genderLabel ?countryLabel ?memberofLabel ?occupationLabel 
WHERE { 
      VALUES ?woman {"""+women+"""} . 
      ?woman rdfs:label ?name;
              wdt:P21 ?gender;
              wdt:P27 ?country;
              wdt:P106 ?occupation;
              wdt:P463 ?memberof. 
  FILTER (langMatches(lang(?name), "EN"))
  SERVICE wikibase:label {bd:serviceParam wikibase:language "en" }              
}

"""

sparql_women = SPARQLWrapper(wikidata_endpoint)
sparql_women.setQuery(women_query)
sparql_women.setReturnFormat(JSON)
women_results = sparql_women.query().convert()

final_dict = {}

for result in women_results["results"]["bindings"]:
    woman = result["woman"]["value"]
    empty_set = set()
    empty_set_two = set()
    empty_set_three = set()
    final_dict[woman]= {"name":name, "gender":gender,"occupation":empty_set,"country":empty_set_two, "memberof":empty_set_three}


for result in women_results["results"]["bindings"]:
    woman = result["woman"]["value"]
    name = result["name"]["value"]
    gender = result["genderLabel"]["value"]
    country = result["countryLabel"]["value"]
    occupation = result["occupationLabel"]["value"]
    memberof = result["memberofLabel"]["value"]
    final_dict[woman]["name"] = name
    final_dict[woman]["gender"] = gender
    final_dict[woman]["occupation"].add(occupation)
    final_dict[woman]["country"].add(country)
    final_dict[woman]["memberof"].add(memberof)
    

         
pp.pprint(final_dict)

In [None]:
women_works_query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT ?woman ?scholarly_workLabel ?memberofLabel ?occupationLabel 
WHERE { 
      VALUES ?woman {"""+women+"""} . 
      ?woman rdfs:label ?name.
      ?scholarly_work wdt:P50 ?woman.
      ?scholarly_work wdt:P31/wdt:P279* wd:Q55915575.
  FILTER (langMatches(lang(?name), "EN"))
  SERVICE wikibase:label {bd:serviceParam wikibase:language "en" }              
}

"""


sparql_women_works = SPARQLWrapper(wikidata_endpoint)
sparql_women_works.setQuery(women_works_query)
sparql_women_works.setReturnFormat(JSON)
women_works_results = sparql_women_works.query().convert()

for result in women_works_results["results"]["bindings"]:
    if result['woman']['value'] in final_dict.keys():
        if 'works' not in final_dict[result['woman']['value']]:
            final_dict[result['woman']['value']]['works'] = [result['scholarly_workLabel']['value']]
        else:
            final_dict[result['woman']['value']]['works'].append(result['scholarly_workLabel']['value'])
            
pp.pprint(final_dict)
