This notebook creates statistics and visualisations for the Muziekopnamen Zendgemachtigden (MOZ) concert collection belonging to Sound & Vision.
This notebook queries the public linked data endpoint with SPARQL to obtain aggregations from the MOZ. These are visualised in the notebook using the [Plotly](https://plotly.com/) library, with the exception of the network graph, which can be visualised by copying the data produced in the notebook to [Flourish](https://flourish.studio/). 

For any of the visualisations, you can delete the '#' in front of the lines that print the data, and copy the data to your own visualisation tool if you wish.

This notebook was used to generate the data for the [MOZ blog](https://labs.beeldengeluid.nl/blogs/moz-dataset-blog) on the LABS website.

In [None]:
import json
import locale
import matplotlib.pyplot as plt
import os
import plotly.graph_objects as go
import plotly.io as pio
import requests

from collections import OrderedDict
from sys import path
from wordcloud import WordCloud

# SPARQL endpoint that the MOZ queries in this notebook are sent to
sparql_endpoint = "https://cat.apis.beeldengeluid.nl/sparql"
# SPARQL endpoint of Muziekweb, used for the album look-ups per artist
muziekweb_sparql_endpoint = "https://api.data.muziekweb.nl/datasets/MuziekwebOrganization/Muziekweb/services/Muziekweb/sparql"
# URI of the MOZ series, already wrapped in <> so that it can be interpolated directly into a query
moz_series_id = "<http://data.beeldengeluid.nl/id/series/2101608030025711131>"

# Namespace declarations that are put in front of every query sent to sparql_endpoint.
# Note that schema: (http) and sdo: (https) are two different namespaces.
prefixes = """PREFIX schema: <http://schema.org/>
              PREFIX sdo: <https://schema.org/>
              PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
              PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"""

# Namespace declarations that are put in front of every query sent to muziekweb_sparql_endpoint
muziekweb_prefixes = """PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
                        PREFIX vocab: <https://data.muziekweb.nl/vocab/>
                        PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n"""

In [None]:
def get_count_from_sparql(sparql_endpoint, query):
    """Carries out the query on the endpoint and returns the value of the count
    :param - sparql_endpoint - the endpoint containing the RDF data to be queried
    :param - query - the query to be carried out. This must select a count variable called ?count
    :returns - the value of ?count in the first result row, exactly as it appears in the JSON response
    (it is not converted to a number here)
    :raises - requests.HTTPError if the endpoint answers with an error status; KeyError or IndexError
    if the response does not contain a ?count binding"""
    response = requests.get(
                sparql_endpoint,
                params={"query": query},
                headers = {"Accept": "application/sparql-results+json"})
    # report a failed request with its HTTP status instead of an unrelated JSON or key error below
    response.raise_for_status()
    return response.json()["results"]["bindings"][0]["count"]["value"]

In [None]:
def get_category_count_from_sparql(sparql_endpoint, query):
    """Carries out the query on the endpoint and returns the value of the counts per category
    :param - sparql_endpoint - the endpoint containing the RDF data to be queried
    :param - query - the query to be carried out. This must select a count variable called ?count and
    a variable ?category per value of which the count has been calculated
    :returns - an ordered dict of the categories with their counts (as int), in the order in which
    the endpoint returned them. If a category occurs more than once, the last count is kept
    :raises - requests.HTTPError if the endpoint answers with an error status; KeyError if a result
    row has no ?category or ?count binding"""
    response = requests.get(
                sparql_endpoint,
                params={"query": query},
                headers = {"Accept": "application/sparql-results+json"})
    # report a failed request with its HTTP status instead of an unrelated JSON or key error below
    response.raise_for_status()

    category_counts = OrderedDict()
    for result in response.json()["results"]["bindings"]:
        category_counts[result["category"]["value"]] = int(result["count"]["value"])

    return category_counts

In [None]:
def get_category_count_and_category_uri_from_sparql(sparql_endpoint, query):
    """Carries out the query on the endpoint and returns the value of the counts per category, and also the category uri
    :param - sparql_endpoint - the endpoint containing the RDF data to be queried
    :param - query - the query to be carried out. This must select a count variable called ?count and
    a variable ?category per value of which the count has been calculated, and a variable ?category_uri for the URI
    of the category
    :returns - an ordered dict of the categories with a field "count" for the count (as int) and a field "uri"
    for the uri, in the order in which the endpoint returned them. If a category name occurs more than once,
    the last row is kept
    :raises - requests.HTTPError if the endpoint answers with an error status; KeyError if a result
    row has no ?category, ?count or ?category_uri binding"""
    response = requests.get(
                sparql_endpoint,
                params={"query": query},
                headers = {"Accept": "application/sparql-results+json"})
    # report a failed request with its HTTP status instead of an unrelated JSON or key error below
    response.raise_for_status()

    category_counts = OrderedDict()
    for result in response.json()["results"]["bindings"]:
        category_counts[result["category"]["value"]] = {
            "count": int(result["count"]["value"]),
            "uri": result["category_uri"]["value"],
        }

    return category_counts

In [None]:
def fill_gaps_in_timeline(timeline):
    """For a timeline in years, fills in gaps where there are no values with the value 0.
    The years do not have to be sorted: the range runs from the lowest to the highest year present.
    :param timeline - a dict with the years (as strings) as keys and the counts as values
    :returns an ordered dict, sorted by year, with complete data between the first and final year.
    An empty timeline gives an empty ordered dict
    :raises ValueError if a key cannot be converted to an int"""
    if not timeline:
        return OrderedDict()

    # do not rely on the order of the keys; the endpoint does not have to return the years sorted
    years = [int(year) for year in timeline]

    complete_timeline = OrderedDict()
    for year in range(min(years), max(years) + 1):
        complete_timeline[str(year)] = int(timeline.get(str(year), 0))
    return complete_timeline

In [None]:
def get_person_connections(person_id):
    """Gets the persons connected to this person via programmes or scenes, with the properties creator or byArtist.
    Only programmes of the series in moz_series_id are considered, and only the 10 most frequent
    connections are returned.
    :param person_id - the GTAA id of the person. It is inserted into the query as-is, so it has to be
    written as a SPARQL term (presumably the GTAA URI wrapped in <>, like moz_series_id - confirm against callers)
    :returns a dictionary with the name of the person as key, then the fields 'count' and 'uri' for the count
    of how often the person co-occurs with the input person, and the GTAA uri of the person"""
    # the series comes from the shared moz_series_id constant, like in all other queries of this notebook
    query=prefixes + f"""
    SELECT ?count ?category ?category_uri
    WHERE
    {{
     {{
      SELECT (COUNT(DISTINCT ?program) as ?count) ?category ?category_uri
      WHERE
      {{

        ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id} .
        {{
             ?program (sdo:creator/sdo:creator)|(sdo:byArtist/sdo:byArtist) {person_id} ;
                      (sdo:creator/sdo:creator)|(sdo:byArtist/sdo:byArtist) ?category_uri .
        }}
          UNION
        {{
            ?program sdo:hasPart ?scene .
            ?scene (sdo:creator/sdo:creator)|(sdo:byArtist/sdo:byArtist) {person_id};
                   (sdo:creator/sdo:creator)|(sdo:byArtist/sdo:byArtist) ?category_uri .
        }}
            ?category_uri skos:prefLabel ?category
      }} GROUP BY ?category_uri ?category
     }}
    }} ORDER BY DESC(?count) LIMIT 10
    """

    return get_category_count_and_category_uri_from_sparql(sparql_endpoint, query)

In [None]:
def get_top_x_entities_by_property(selected_property, number_of_entities):
    """Gets the top x entities, ranked by number of occurrences, that are linked
    to a concert with the selected property. The property is looked for on the concert itself,
    on its season, on the MOZ series and on the scenes of the concert.
    :param selected_property - the property linking the concert to the entity, written as it should
                               appear in the query (e.g. a prefixed name)
    :param number_of_entities - how many entities to return
    :returns a dictionary with the top x entity names and their counts"""
    # the season branch uses sdo:partOfSeason, the property that links a concert to its season
    # in the first pattern as well; sdo:isPartOfSeason is not a schema.org property
    query=prefixes + f"""

    SELECT ?count ?category
    WHERE
    {{
      {{
        SELECT (COUNT(DISTINCT ?program) as ?count) ?category
        WHERE 
        {{
            ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id} .
              {{
                ?program {selected_property}/skos:prefLabel ?category
              }}
              UNION
              {{
                ?program sdo:partOfSeason ?season .
                ?season  {selected_property}/skos:prefLabel ?category
              }}
              UNION
              {{
                {moz_series_id} {selected_property}/skos:prefLabel ?category
              }}
              UNION
              {{
                ?program sdo:hasPart ?scene.
                ?scene  {selected_property}/skos:prefLabel ?category 
              }}
        }} GROUP BY ?category 
      }}
    }} ORDER BY DESC(?count) LIMIT {number_of_entities}
    """

    return get_category_count_from_sparql(sparql_endpoint, query)

In [None]:
def get_top_x_entities_by_role_property(role_property, number_of_entities):
    """Gets the top x entities, ranked by number of occurrences, that are linked
    to a concert via a role with the role property. The role is looked for on the concert itself,
    on its season, on the MOZ series and on the scenes of the concert.
    :param role_property - the property linking the concert to the role and
                            the role to the entity, written as it should appear in the query
                            (e.g. "sdo:creator")
    :param number_of_entities - how many entities to return
    :returns a dictionary with the top x entity names and their counts"""
    # the season branch uses sdo:partOfSeason, the property that links a concert to its season
    # in the first pattern as well; sdo:isPartOfSeason is not a schema.org property
    query=prefixes + f"""

    SELECT ?count ?category
    WHERE
    {{
      {{
        SELECT (COUNT(DISTINCT ?program) as ?count) ?category
        WHERE 
        {{
            ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id} .
              {{
                ?program {role_property}/{role_property} ?entity  .
                ?entity skos:prefLabel ?category
              }}
              UNION
              {{
                ?program sdo:partOfSeason ?season .
                ?season  {role_property}/{role_property} ?entity  .
                ?entity skos:prefLabel ?category
              }}
              UNION
              {{
                {moz_series_id} {role_property}/{role_property} ?entity  .
                ?entity skos:prefLabel ?category
              }}
              UNION
              {{
                ?program sdo:hasPart ?scene.
                ?scene  {role_property}/{role_property} ?entity  .
                ?entity skos:prefLabel ?category 
              }}
        }} GROUP BY ?category 
      }}
    }} ORDER BY DESC(?count) LIMIT {number_of_entities}
    """

    return get_category_count_from_sparql(sparql_endpoint, query)

In [None]:
def uninvert_name(name):
    """Turns a name written as 'surname, first name' into 'first name surname'.
    A name without a comma is returned untouched. Only the text before the first comma and the text
    between the first and second comma are used; surrounding whitespace of both parts is removed.
    :param name - inverted person name
    :returns uninverted name"""
    if "," not in name:
        return name

    pieces = name.split(",")
    surname = pieces[0].strip()
    first_name = pieces[1].strip()
    return first_name + " " + surname

In [None]:
def format_overlay_hover_info(keys, values, name):
    """Builds the hover texts for the points of a graph, one text per key. Each text has the format
    'key, (value) name', where the value is written without decimals and grouped according to the
    locale of the user.
    :param keys - the keys for the points on the graph
    :param values - the values of the points on the graph, looked up by the position of the key
    :param name - the name of the data
    :returns a list with the hover text for each key
    """
    # switch to the locale of the user's environment, so the number grouping follows it
    locale.setlocale(locale.LC_ALL, '')
    whole_number = "%.0f"
    return [
        f"{str(key)}, ({locale.format_string(whole_number, values[position], grouping=True)}) {name}"
        for position, key in enumerate(keys)
    ]

In [None]:
def get_separators():
    """Looks up the number separators of the locale that is currently active
    :returns a single string: the decimal point immediately followed by the thousands separator"""
    conventions = locale.localeconv()
    return "".join((conventions['decimal_point'], conventions['thousands_sep']))

In [None]:
def plot_Y_against_X_as_bar_chart(x_axis, y_axis, plot_title, x_axis_title, y_axis_title, margin, filename, 
                            colour="#0028be", width=600, height=500):
    """Plots the Y axis values against the X axis values as a bar chart, using the specified titles in the
    plot and on the axes. The x values are treated as categories.
    Optionally, you can enter a dict as the margin, to set the size of the graph margins (useful if text is
    overlapping). See plotly documentation for more information
    :param x_axis - the x_axis values
    :param y_axis - the y_axis values
    :param plot_title - the title displayed above the plot
    :param x_axis_title - the title of the x axis
    :param y_axis_title - the title of the y axis
    :param margin - space to leave around the plot
    :param filename - filename of the plot; handed to pio.show as an extra option
                      (NOTE(review): verify that the renderer in use does anything with it)
    :param colour - optional, colour to use for the bars
    :param width - optional, width of the plot
    :param height - optional, height of the plot
    :raises ValueError if one of the value lists is empty or if they differ in length
    """
    if not x_axis:
        raise ValueError("x_axis values list is empty")

    if not y_axis:
        raise ValueError("y_axis values list is empty")

    if len(x_axis) != len(y_axis):
        raise ValueError("The x and y axis values do not have the same number of values (%d and %d)"%(len(x_axis), len(y_axis)))

    # the hover texts are built first: this also activates the user's locale, which
    # get_separators() reads when the layout is created below
    data = [go.Bar(
                x=x_axis,
                y=y_axis,
                text=format_overlay_hover_info(x_axis, y_axis, ""),
                hoverinfo='text',
                marker=dict(
                    color=colour,
                    line=dict(
                        color=colour,
                        width=2,
                    )
                )
                )]

    # the font is set via title.font; the older 'titlefont' axis attribute is deprecated
    # and no longer accepted by recent plotly releases
    axis_title_font = dict(family='Arial, monospace', size=18)

    layout = go.Layout(
        title=plot_title,
        width=width,
        height=height,
        margin=margin,
        xaxis=dict(
            title=dict(text=x_axis_title, font=axis_title_font),
            type="category"
        ),
        yaxis=dict(
            title=dict(text=y_axis_title, font=axis_title_font)
        ),
        separators=get_separators()
    )
    fig = go.Figure(data=data, layout=layout)
    pio.show(fig, filename=filename, config={})

In [None]:
def plot_pie_chart(labels, values, title, margin, filename, colours=["#009fda", "#e00034"], width=950, height=600):
    """Draws one pie chart (with a hole in the middle) for the given labels and values. The segments
    keep the order in which they are passed in, and are labelled on the outside with label and percentage.
    If "colours" is empty, then default colours are used.
    Optionally, you can enter a dict as the margin, to set the size of the graph margins (useful if text is
    overlapping). See plotly documentation for more information
    :param labels - the labels of the chart segments
    :param values - the values of the chart segments
    :param title - the title displayed above the chart
    :param margin - the space to leave around the chart
    :param filename - the filename of the chart, handed to pio.show
    :param colours - optional, the colours to use for the segments
    :param width - optional, width of the plot
    :param height - optional, height of the plot
    :raises ValueError if labels or values is empty, or if they differ in length
    """
    # both lists must contain something, and they must pair up one to one
    for sequence, complaint in ((labels, "Labels list is empty"), (values, "Values list is empty")):
        if not sequence:
            raise ValueError(complaint)
    if len(labels) != len(values):
        raise ValueError("Must have equal number of items in labels and values")

    # segment colours plus a black outline around every segment
    segment_style = dict(colors=colours, line=dict(color='#000000', width=2))
    donut = go.Pie(
        labels=labels,
        values=values,
        sort=False,
        textinfo='label+percent',
        textposition="outside",
        hoverinfo='value',
        hole=.4,
        showlegend=False,
        marker=segment_style,
    )

    # the number separators are taken from the locale that is active at this moment
    figure = go.Figure(
        data=[donut],
        layout=go.Layout(title=title, width=width, height=height, margin=margin, separators=get_separators()),
    )
    pio.show(figure, filename=filename, config={})

In [None]:
def create_word_cloud(word_counts, width, height, background_colour = "black"):
    """Creates a word cloud from a dictionary of words and their counts, and draws it in a new
    matplotlib figure without axes.
    The figure gets the given width and height (in inches), each of which may be at most 20
    :param word_counts - a dictionary of the words and their counts
    :param width - the width of the cloud, an integer
    :param height - the height of the cloud, an integer
    :param background_colour - the background colour to use
    :returns the generated word cloud
    :raises ValueError if width or height is not an integer, or is larger than 20
    """
    width_is_integer = isinstance(width, int)
    if not width_is_integer:
        raise ValueError("Width must be an integer")

    height_is_integer = isinstance(height, int)
    if not height_is_integer:
        raise ValueError("Height must be an integer")

    # guard against sizes that were accidentally given in pixels
    if max(width, height) > 20:
        raise ValueError("Too big. Please note, height and width are in inches!!")

    # build the cloud image from the counts
    cloud_builder = WordCloud(scale=5, relative_scaling=0.5, background_color=background_colour)
    wordcloud = cloud_builder.generate_from_frequencies(word_counts)

    # show the image with matplotlib, hiding the axes
    plt.figure(figsize=(width, height))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    return wordcloud

## Number of concerts
The number of concerts in the MOZ concert collection

In [None]:
# Count the distinct programmes that belong to a season of the MOZ series
query = prefixes + f"""
            SELECT (COUNT(DISTINCT(?program)) as ?count)
            WHERE 
            {{
                ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id}
            }}"""

# the count is converted to int here; total is used again further down, for the pie chart
total = int(get_count_from_sparql(sparql_endpoint, query))
print(f"{total} concerts in the collection")

## Number of concerts over time
A graph of the number of concerts in the collection over time

In [None]:
# Count the publication dates per year; the year is the first four characters of the date.
# The result is sorted by year, because the gap filling below takes the first and the
# last year of the result as the start and the end of the timeline.
query = prefixes + f"""
            SELECT (COUNT(?category) as ?count) ?category
            WHERE 
            {{
                ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id}.  
                ?program <https://schema.org/datePublished> ?date .
                BIND(substr(?date, 1, 4) as ?category)
            }} GROUP BY ?category ORDER BY ?category"""

timeline = get_category_count_from_sparql(sparql_endpoint, query)
complete_timeline = fill_gaps_in_timeline(timeline)  # fill in the gaps where there are no values

# for year in complete_timeline:
#     print(f"{year}\t{complete_timeline[year]}")

plot_Y_against_X_as_bar_chart(list(complete_timeline.keys()),
                            list(complete_timeline.values()),
                            "Number of concerts over time",
                            "Year",
                            "Number of concerts",
                            dict(t=50),
                            "items-over-time" )


## Distribution over audio/video
Most of the collection consists of audio recordings, but some video recordings are available

In [None]:
# Count the associated media of the concerts per encoding format, most frequent format first
query = prefixes + f"""
            SELECT ?count ?category
            WHERE
            {{
                {{
                SELECT (COUNT(?category) as ?count) ?category
                    WHERE 
                    {{
                        ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id}.  
                        ?program sdo:associatedMedia/sdo:encodingFormat ?category .
                    }} GROUP BY ?category
                }}
            }} ORDER BY DESC(?count) """

distribution = get_category_count_from_sparql(sparql_endpoint, query)

# for encoding_format in distribution:
#     print(f"{encoding_format}\t{distribution[encoding_format]}")

# Only the part of the encoding format in front of the "/" is shown. Several formats can share
# that part, so their counts are added up; otherwise the same label would appear more than once
# on the x axis.
media_type_counts = OrderedDict()
for encoding_format, count in distribution.items():
    media_type = encoding_format.split("/")[0]
    media_type_counts[media_type] = media_type_counts.get(media_type, 0) + count

labels = list(media_type_counts.keys())
plot_Y_against_X_as_bar_chart(labels,
                            list(media_type_counts.values()),
                            "Number of concerts with audio/video",
                            "Audio/video",
                            "Number of concerts",
                            dict(t=50),
                            "items-per-encoding-format" )

## Distribution over musical genres
The number of concerts per musical genre. Note: a concert may have more than one genre

Change the 'top_number_to_plot' value to see more genres

In [None]:
# Count per genre label how often it is attached to a concert of the MOZ series, most frequent genre first
query = prefixes + f"""
            SELECT ?count ?category
            WHERE
            {{
                {{
                SELECT (COUNT(?category) as ?count) ?category
                    WHERE 
                    {{
                        ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id}.  
                        ?program sdo:genre/skos:prefLabel ?category .
                    }} GROUP BY ?category
                }}
            }} ORDER BY DESC(?count) """

distribution = get_category_count_from_sparql(sparql_endpoint, query)

# for genre in distribution:
#     print(f"{genre}\t{distribution[genre]}")

# the result is sorted by count, so the first entries are the most frequent genres
top_number_to_plot = 10
plot_Y_against_X_as_bar_chart(list(distribution.keys())[:top_number_to_plot],
                            list(distribution.values())[:top_number_to_plot], 
                            f"Number of concerts per genre<br>for top {top_number_to_plot} musical genres", 
                            "Genre",
                            "Number of concerts", 
                            dict(t=50), 
                            "items-per-genre" )

## Distribution over locations
The number of concerts per location. Note: locations are not standardised, so the same location may occur multiple times with different spellings etc.
Change the 'top_number_to_plot' value to see more locations

In [None]:
# Count per location label how often it occurs, most frequent location first.
# A location can be attached to the concert itself or to one of its scenes (sdo:hasPart).
query = prefixes + f"""
            SELECT ?count ?category
            WHERE
            {{
                {{

                    SELECT (COUNT(?category) as ?count) ?category
                    WHERE 
                    {{
                        ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id}.
                        {{  
                            ?program <https://schema.org/locationCreated>/rdfs:label ?category .
                        }}
                        UNION                
                        {{  
                            ?program sdo:hasPart ?scene.
                            ?scene <https://schema.org/locationCreated>/rdfs:label ?category .
                        }}
                    }} GROUP BY ?category
                }}
            }} ORDER BY DESC(?count) """

distribution = get_category_count_from_sparql(sparql_endpoint, query)

# for location in distribution:
#     print(f"{location}\t{distribution[location]}")
    
# the result is sorted by count, so the first entries are the most frequent locations
top_number_to_plot = 10
plot_Y_against_X_as_bar_chart(list(distribution.keys())[:top_number_to_plot],
                            list(distribution.values())[:top_number_to_plot], 
                            f"Number of concerts per location<br>for top {top_number_to_plot} locations", 
                            "Location",
                            "Number of concerts", 
                            dict(t=50), 
                            "items-per-location" )

## Distribution over events
The number of concerts per event. An event can be a festival, for example.
Change the 'top_number_to_plot' value to see more events

### As a bar chart

In [None]:
# Count per event label how often it occurs, most frequent event first.
# An event can be attached to the concert itself or to one of its scenes (sdo:hasPart).
query = prefixes + f"""
            SELECT ?count ?category
            WHERE
            {{
                {{

                    SELECT (COUNT(?category) as ?count) ?category
                    WHERE 
                    {{
                        ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id}.
                        {{  
                            ?program <https://schema.org/recordedAt>/rdfs:label ?category .
                        }}
                        UNION                
                        {{  
                            ?program sdo:hasPart ?scene.
                            ?scene <https://schema.org/recordedAt>/rdfs:label ?category .
                        }}
                    }} GROUP BY ?category
                }}
            }} ORDER BY DESC(?count) """

distribution = get_category_count_from_sparql(sparql_endpoint, query)

# for event in distribution:
#     print(f"{event}\t{distribution[event]}")


top_number_to_plot = 10
# long event names are cut off after 20 characters to keep the x axis readable;
# the "..." is only added to names that were actually shortened
event_labels = [name if len(name) <= 20 else f"{name[:20]}..."
                for name in list(distribution.keys())[:top_number_to_plot]]
plot_Y_against_X_as_bar_chart(event_labels,
                            list(distribution.values())[:top_number_to_plot],
                            f"Number of concerts per event<br>for top {top_number_to_plot} events",
                            "Event",
                            "Number of concerts",
                            dict(t=50),
                            "items-per-event" )

### As a word cloud

In [None]:
# draw the cloud only if the preceding query returned events; distribution is copied into a plain dict
if distribution:
    create_word_cloud(dict(distribution), 15, 10, background_colour="white")

## Percentage of concerts with artists
The percentage of all concerts that have artist information. 

In [None]:
# Count the concerts that have at least one artist-like property (byArtist, creator, mentions or
# contributor), either on the concert itself, on its season, on the MOZ series or on one of its scenes.
# The season is reached with sdo:partOfSeason, the same property as in the first pattern;
# sdo:isPartOfSeason is not a schema.org property.
query = prefixes + f"""
SELECT (COUNT(DISTINCT ?program) as ?count)
WHERE 
{{
        ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id}
          {{
            ?program 
              (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor) ?x
          }}
          UNION
          {{
            ?program sdo:partOfSeason ?season .

            ?season (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor) ?x
          }}
          UNION
          {{
            {moz_series_id} (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor) ?x
          }}
          UNION
          {{
            ?program sdo:hasPart ?scene.

            ?scene (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor) ?x
          }}
      }}
"""
with_person_count = int(get_count_from_sparql(sparql_endpoint, query))
# print(f"With artist information\t{with_person_count}\t{((with_person_count)/total)*100:.2f}")
# print(f"Without artist information\t{total-with_person_count}\t{100-((with_person_count)/total)*100:.2f}")

# total is the number of concerts that was calculated at the start of the notebook
plot_pie_chart(["With artist information", "Without artist information"],
                 [with_person_count, (total-with_person_count)],
                 "Percentage of concerts with artist information",
                 {},
                 "artist-dist",
                 width=700)


## Number of artists
Artists are essential to a musical concert. They may be people who perform in a concert, are involved in creating it, are mentioned (e.g. the concert is in their honour) or contribute in some other way. 

NB: This count includes groups, such as orchestras, as well as individual persons

In [None]:
# Count the distinct names of the entities that are linked via a role (byArtist, creator, mentions
# or contributor) to a concert, its season, the MOZ series or one of the scenes of a concert.
# The season is reached with sdo:partOfSeason, the same property as in the first pattern;
# sdo:isPartOfSeason is not a schema.org property.
query = prefixes + f"""
SELECT (COUNT(DISTINCT ?entityName) as ?count)
WHERE 
{{
        ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id}
          {{
            ?program 
              (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor)/
              (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor) ?entity  .
            ?entity skos:prefLabel ?entityName
          }}
          UNION
          {{
            ?program sdo:partOfSeason ?season .

            ?season  (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor)/
            (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor) ?entity  .
            ?entity skos:prefLabel ?entityName
          }}
          UNION
          {{
            {moz_series_id}  (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor)/
            (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor) ?entity  .
            ?entity skos:prefLabel ?entityName
          }}
          UNION
          {{
            ?program sdo:hasPart ?scene.

            ?scene  (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor)/
            (sdo:byArtist|sdo:creator|sdo:mentions|sdo:contributor) ?entity  .
            ?entity skos:prefLabel ?entityName 
          }}
      }}
"""
print(f"{get_count_from_sparql(sparql_endpoint, query)} artists in the collection")

## Distribution of artists over roles
Artists may be associated with a concert in various roles

In [None]:
# Count per role the distinct names of the persons (entities in the GTAA Persoonsnamen scheme)
# that are linked in that role to a concert, its season, the MOZ series or a scene of a concert.
# The role is the property that leads from the item to the role entity and on to the person;
# its rdfs:label is used as the category.
query=prefixes + f"""

SELECT ?count ?category
WHERE
{{
  {{
    SELECT (COUNT(DISTINCT ?entityName) as ?count) ?role
    WHERE 
    {{
        {{
            ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id} .
            ?program ?role ?roleEntity . 
            ?roleEntity ?role ?entity  .
            ?entity skos:inScheme <http://data.beeldengeluid.nl/gtaa/Persoonsnamen> .
            ?entity skos:prefLabel ?entityName
        }}
        UNION
        {{
            ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id} .
            ?program sdo:partOfSeason ?season.
            ?season ?role ?roleEntity . 
            ?roleEntity ?role ?entity  .
            ?entity skos:inScheme <http://data.beeldengeluid.nl/gtaa/Persoonsnamen> .
            ?entity skos:prefLabel ?entityName
        }}
        UNION
        {{
            ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id} .
            {moz_series_id} ?role ?roleEntity . 
            ?roleEntity ?role ?entity  .
            ?entity skos:inScheme <http://data.beeldengeluid.nl/gtaa/Persoonsnamen> .
            ?entity skos:prefLabel ?entityName
        }}
        UNION
        {{
            ?program sdo:partOfSeason/sdo:partOfSeries {moz_series_id} .
            ?program sdo:hasPart ?scene.
            ?scene ?role ?roleEntity . 
            ?roleEntity ?role ?entity  .
            ?entity skos:inScheme <http://data.beeldengeluid.nl/gtaa/Persoonsnamen> .
            ?entity skos:prefLabel ?entityName
        }}
        
    }} GROUP BY ?role
  }}
    ?role rdfs:label ?category
}} ORDER BY DESC(?count)"""

distribution = get_category_count_from_sparql(sparql_endpoint, query)

for role in distribution:
    print(f"{role}\t{distribution[role]}")

# remove the erroneous roles; pop with a default, so that a role that is
# absent from the result does not stop the notebook with a KeyError
distribution.pop("mentions", None)
distribution.pop("contributor", None)

plot_Y_against_X_as_bar_chart(list(distribution.keys()),
                            list(distribution.values()),
                            "Number of artists in role",
                            "Role",
                            "Number of artists",
                            dict(t=50),
                            "artists-per-role" )

## Top x creators
The top x most frequently occurring artists in the role of creator. 

Change the 'top_number_to_plot' value to see more creators

In [None]:
# fetch the creators that are linked to the most concerts; the names come back inverted
# ('surname, first name') and are uninverted for display
top_number_to_plot = 100
distribution = get_top_x_entities_by_role_property("sdo:creator", top_number_to_plot)

# for person in distribution:
#     print(f"{uninvert_name(person)}\t{distribution[person]}")
    
plot_Y_against_X_as_bar_chart([uninvert_name(person) for person in distribution],
                            list(distribution.values()), 
                            f"Number of concerts per creator for top {top_number_to_plot} creators", 
                            "Creator",
                            "Number of concerts", 
                            dict(t=50), 
                            "concerts-per-creator" )

Creators also feature in other data sources. For example, we can look them up in [Muziekweb](https://www.muziekweb.nl/) to see how many albums they are associated with.

In [None]:
# number of albums in Muziekweb per creator
album_distribution = {}
for person in distribution:
    person_name = uninvert_name(person)
    query = muziekweb_prefixes + f"""
            SELECT (COUNT(DISTINCT(?album)) AS ?count) 
            WHERE {{
              ?album vocab:performer/skos:prefLabel \"{person_name}\" ; a vocab:Album
            }}"""

    #print(f"{uninvert_name(person)}\t{get_count_from_sparql(muziekweb_sparql_endpoint, query)}")
    album_distribution[uninvert_name(person)] = int(get_count_from_sparql(muziekweb_sparql_endpoint, query))
 
plot_Y_against_X_as_bar_chart(list(album_distribution.keys()),
                            list(album_distribution.values()), 
                            f"Number of Muziekweb albums per creator for top {top_number_to_plot} creators", 
                            "Creator",
                            "Number of Muziekweb albums", 
                            dict(t=50), 
                            "albums-per-creator" )

## Top x performers
The top x most frequently occurring artists in the role of performer. 

Change the 'top_number_to_plot' value to see more performers

In [None]:
# fetch the performers that are linked to the most concerts; the names come back inverted
# ('surname, first name') and are uninverted for display
top_number_to_plot = 10
distribution = get_top_x_entities_by_role_property("sdo:byArtist", top_number_to_plot)

# for artist in distribution:
#     print(f"{uninvert_name(artist)}\t{distribution[artist]}")

plot_Y_against_X_as_bar_chart([uninvert_name(artist) for artist in distribution],
                            list(distribution.values()),
                            f"Number of concerts per performer for top {top_number_to_plot} performers",
                            "Performer",
                            "Number of concerts",
                            dict(t=50),
                            "concerts-per-performer" )

Performers also feature in other data sources. For example, we can look them up in [Muziekweb](https://www.muziekweb.nl/) to see how many albums they are associated with.

In [None]:
# Number of albums in Muziekweb per performer.
# Expects `distribution` and `top_number_to_plot` to have been set by an earlier cell.
album_distribution = {}
for artist in distribution:
    artist_name = uninvert_name(artist)
    # Escape backslashes and double quotes so that a name containing them
    # cannot terminate the SPARQL string literal early and invalidate the query.
    escaped_name = artist_name.replace("\\", "\\\\").replace('"', '\\"')
    query = muziekweb_prefixes + f"""
            SELECT (COUNT(DISTINCT(?album)) AS ?count)
            WHERE {{
              ?album vocab:performer/skos:prefLabel \"{escaped_name}\" ; a vocab:Album
            }}"""

    # Query the endpoint once and reuse the result for the optional print and the chart.
    album_count = int(get_count_from_sparql(muziekweb_sparql_endpoint, query))
    #print(f"{artist_name}\t{album_count}")
    album_distribution[artist_name] = album_count

plot_Y_against_X_as_bar_chart(list(album_distribution.keys()),
                            list(album_distribution.values()),
                            f"Number of Muziekweb albums per performer for top {top_number_to_plot} performers",
                            "Performer",
                            "Number of Muziekweb albums",
                            dict(t=50),
                            "albums-per-performer" )

## Network of a person
Persons may be related to each other by being associated with the same concert. We can plot networks of these relationships. These rapidly become large, so here we limit the network to a chosen person's top 10 associated persons and, for each of those persons, their own top 10 associated persons.

Here, we show the network of Mozart, the most frequently occurring creator. You can choose any other person if you know their GTAA thesaurus identifier. You can find persons in the GTAA by searching in the [Termennetwerk](https://termennetwerk.netwerkdigitaalerfgoed.nl/) and choosing the source 'GTAA: persoonsnamen'

The network can be seen below in the form of lists of pairs of artists, and the number of times that they are associated with the same concert. 

It can also be visualised using the [Flourish](https://flourish.studio/) tool. Follow the steps below:
- Create a free account on Flourish
- Go to your Projects page
- Click the button '+ New Visualisation'
- Scroll down to the 'Network Graph' section
- Under 'Starting Points' click on the image labelled 'Default'
- In the visualisation that appears, click on the 'Data' tab
- In the 'Links' tab, delete all the example data (e.g. CTRL+A and Delete)
- Do the same for the 'Points' tab
- In this notebook, run the two cells below
- Copy the output from the first cell and paste it into the 'Links' tab in your Flourish visualisation
- Copy the output from the second cell and paste it into the 'Points' tab in your Flourish visualisation
- Click on the 'Preview' tab to see the network
- Hover over the points to see the names

In [None]:

# Central person of the network; replace with another GTAA identifier to explore someone else
person_id = "<http://data.beeldengeluid.nl/gtaa/134672>"  # the thesaurus identifier for Mozart
person_label = "Mozart, Wolfgang Amadeus"

# First level: the connections of the chosen person
connections = get_person_connections(person_id)

# Records, for every second-level person, the group number of the first
# first-level person under which they were encountered
person_group = {}

# Second level: look up the connections of each first-level person and print
# one tab-separated link per pair. Group numbers start at 2.
print("Person 1\tPerson 2\tNumber of concerts together")
i = 2
for person, details in connections.items():
    person_connections = get_person_connections("<" + details['uri'] + ">")
    for new_person, new_details in person_connections.items():
        if person == new_person:
            # a person is never linked to themselves
            continue
        print(f"{uninvert_name(person)}\t{uninvert_name(new_person)}\t{new_details['count']}")
        # keep the group of the first occurrence only
        person_group.setdefault(new_person, i)
    i += 1

# Paste output from below in the Links tab of the Flourish visualisation

In [None]:
# One tab-separated row per person with the group number stored in person_group
print("Person\tGroup")
for name, group in person_group.items():
    print(f"{uninvert_name(name)}\t{group}")

# Paste output from below in the Points tab of the Flourish visualisation

## Top x production companies
The top x most frequently occurring production companies. 

Change the 'top_number_to_plot' value to see more companies

In [None]:
# Number of concerts for the most frequently occurring production companies
top_number_to_plot = 10
distribution = get_top_x_entities_by_role_property("sdo:productionCompany", top_number_to_plot)

# Uncomment to print the data for use in another visualisation tool:
# for company in distribution:
#     print(f"{company}\t{distribution[company]}")

company_names = list(distribution)
concert_counts = list(distribution.values())
chart_title = f"Number of concerts per production company <br>for top {top_number_to_plot} production companies"

plot_Y_against_X_as_bar_chart(company_names,
                            concert_counts,
                            chart_title,
                            "Production company",
                            "Number of concerts",
                            {"t": 50},
                            "concerts-per-production-company")

## Top x broadcasters
The top x most frequently occurring broadcasters. 

Change the 'top_number_to_plot' value to see more broadcasters

In [None]:
# Number of concerts for the most frequently occurring broadcasters
top_number_to_plot = 10
distribution = get_top_x_entities_by_property("sdo:provider", top_number_to_plot)

# Uncomment to print the data for use in another visualisation tool:
# for broadcaster in distribution:
#     print(f"{broadcaster}\t{distribution[broadcaster]}")

broadcaster_names = list(distribution)
concert_counts = list(distribution.values())
chart_title = f"Number of concerts per broadcaster <br>for top {top_number_to_plot} broadcasters"

plot_Y_against_X_as_bar_chart(broadcaster_names,
                            concert_counts,
                            chart_title,
                            "Broadcaster",
                            "Number of concerts",
                            {"t": 50},
                            "concerts-per-broadcaster")