In [None]:
import csv
import locale
from ast import literal_eval
from collections import OrderedDict

import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

In [None]:
def fill_gaps_in_timeline(timeline):
    """For a timeline in years, fills in gaps where there are no values with the value 0.
    :param timeline - a dict with the years (as strings) as keys and the counts as values
    :returns an OrderedDict with an integer count for every year from the earliest to the latest
             year in the timeline (both inclusive); an empty OrderedDict if the timeline is empty"""
    complete_timeline = OrderedDict()
    if not timeline:
        # nothing to fill in; also avoids looking up the first/last key of an empty timeline
        return complete_timeline

    # use the smallest and largest year so that the result does not depend on the key order
    years = [int(year) for year in timeline]
    for year in range(min(years), max(years) + 1):
        # keys are looked up by their string form; years without an entry get a count of 0
        complete_timeline[str(year)] = int(timeline.get(str(year), 0))
    return complete_timeline

In [None]:
def format_overlay_hover_info(keys, values, name):
    """Builds the hover texts for one series of the overlay graph.
    Each hover text has the format 'key, (value) name', where the value is written without decimals
    and with the digit grouping of the locale.
    :param keys - the keys for the points on the graph
    :param values - the values of the points on the graph, indexable in the same order as the keys
    :param name - the name of the data
    :returns a list with one hover text per key
    """
    # switch to the user's default locale so that the grouping follows the local conventions
    locale.setlocale(locale.LC_ALL, '')
    whole_number_format = "%.0f"
    hover_texts = []
    for position, key in enumerate(keys):
        grouped_value = locale.format_string(whole_number_format, values[position], grouping=True)
        label = str(key)
        hover_texts.append(f"{label}, ({grouped_value}) {name}")
    return hover_texts

In [None]:
def get_separators():
    """Gets the number separators of the currently active locale.
    Only reads the locale conventions; it does not change the locale itself.
    :returns a string made of the decimal point followed by the thousands separator"""
    conventions = locale.localeconv()
    return "".join((conventions['decimal_point'], conventions['thousands_sep']))

In [None]:
def plot_Y_against_X_as_bar_chart(x_axis, y_axis, plot_title, x_axis_title, y_axis_title, margin, filename, 
                            colour="#0028be", width=600, height=500):
    """Plots the Y axis values against the X axis values as a bar chart, using the specified titles in the
    plot and on the axes, and shows it under the given filename.
    You can enter a dict as the margin, to set the size of the graph margins (useful if text is
    overlapping). See plotly documentation for more information
    :param x_axis - the x_axis values
    :param y_axis - the y_axis values
    :param plot_title - the title displayed above the plot
    :param x_axis_title - the title displayed below the x axis
    :param y_axis_title - the title displayed next to the y axis
    :param margin - space to leave around the plot
    :param filename - filename of the plot
    :param colour - optional, colour to use for the bars
    :param width - optional, width of the plot
    :param height - optional, height of the plot
    :raises ValueError if either list of values is empty, or if they differ in length
    """
    if not x_axis:
        raise ValueError("x_axis values list is empty")

    if not y_axis:
        raise ValueError("y_axis values list is empty")

    if len(x_axis) != len(y_axis):
        raise ValueError("The x and y axis values do not have the same number of values (%d and %d)"%(len(x_axis), len(y_axis)))

    data = [go.Bar(
                x=x_axis,
                y=y_axis,
                # show only the locale-formatted text when hovering over a bar
                text=format_overlay_hover_info(x_axis, y_axis, ""),
                hoverinfo='text',
                marker=dict(
                    color=colour,
                    line=dict(
                        color=colour,
                        width=2,
                    )
                )
                )]

    # both axis titles share the same font
    axis_title_font = dict(family='Arial, monospace', size=18)

    layout = go.Layout(
        title=plot_title,
        width=width,
        height=height,
        margin=margin,
        xaxis=dict(
            # title text and font are nested under `title`; the flat `titlefont` attribute is deprecated
            title=dict(text=x_axis_title, font=axis_title_font),
            # treat the x values as categories rather than placing them on a numeric scale
            type="category"
        ),
        yaxis=dict(
            title=dict(text=y_axis_title, font=axis_title_font)
        ),
        separators=get_separators()
    )
    fig = go.Figure(data=data, layout=layout)
    # NOTE(review): filename and config are handed to pio.show as extra options; whether the active
    # renderer makes use of the filename should be confirmed
    pio.show(fig, filename=filename, config={})

In [None]:
def plot_pie_chart(labels, values, title, margin, filename, colours=["#009fda", "#e00034"], width=950, height=600):
    """Shows a single pie chart (with a hole in the middle) for the given labels and values.
    The colours are handed to plotly as the segment colours, in the order of the segments.
    You can enter a dict as the margin, to set the size of the graph margins (useful if text is
    overlapping). See plotly documentation for more information
    :param labels - the labels of the chart segments
    :param values - the values of the chart segments
    :param title - the title displayed above the chart
    :param margin - the space to leave around the chart
    :param filename - the filename of the chart
    :param colours - optional, the colours to use for the segments
    :param width - optional, width of the plot
    :param height - optional, height of the plot
    :raises ValueError if the labels or values are empty, or if they differ in length
    """
    # reject empty input first, labels before values
    for items, complaint in ((labels, "Labels list is empty"), (values, "Values list is empty")):
        if not items:
            raise ValueError(complaint)

    if len(values) != len(labels):
        raise ValueError("Must have equal number of items in labels and values")

    # every segment gets a black outline
    segment_outline = dict(color='#000000', width=2)
    segment_style = dict(colors=colours, line=segment_outline)

    pie = go.Pie(
        labels=labels,
        values=values,
        sort=False,
        textinfo='label+percent',
        textposition="outside",
        hoverinfo='value',
        hole=.4,
        showlegend=False,
        marker=segment_style,
    )

    chart_layout = go.Layout(
        title=title,
        width=width,
        height=height,
        margin=margin,
        separators=get_separators(),
    )

    pio.show(go.Figure(data=[pie], layout=chart_layout), filename=filename, config={})

In [None]:
# Load the pipe-separated programme list. Three columns are stored as Python literals and are parsed
# with literal_eval (presumably into lists, given that later cells explode them - confirm against the CSV).
programmes_dataframe = pd.read_csv(
    "joop.csv",
    sep='|',
    parse_dates=['Uitzenddatum'],
    converters={
        column: literal_eval
        for column in ("Zendgemachtigde", "Persons behind camera", "Persons in front of camera")
    },
)
print(f"{len(programmes_dataframe)} programmes in total\n")
print(programmes_dataframe)

## Unique titles

In [None]:
# column holding the programme length in minutes (the header is taken over literally from the CSV)
length_column = "lengte programma in minuten (indien niet bij ons aanwezig aanwezig lengte bij benadering handmatig vermeld)"
print(programmes_dataframe[length_column])

In [None]:
titles = programmes_dataframe["Titel"]
print(f"{len(titles.unique())} unique programme titles\n")

# number of programmes per title
title_counts = titles.value_counts()
print(title_counts)

## Total number of hours

In [None]:
# column holding the programme length in minutes (the header is taken over literally from the CSV)
length_column = 'lengte programma in minuten (indien niet bij ons aanwezig aanwezig lengte bij benadering handmatig vermeld)'
programme_lengths = programmes_dataframe[length_column]

length_in_minutes = programme_lengths.sum()
print(f"Total number of hours {length_in_minutes/60}")

# count() only counts the programmes for which a length is filled in
print(f"\nLength known for {programme_lengths.count()} of the {len(programmes_dataframe)} programmes")

## Digital content

In [None]:
digital_flags = programmes_dataframe["digitaal ja/nee"]

# number of programmes per distinct value of the digital flag
digital_type_counts = digital_flags.value_counts()
print(digital_type_counts)

# count() only counts the programmes for which the flag is filled in
print(f"\nDigital known for {digital_flags.count()} of the {len(programmes_dataframe)} programmes")

## Distribution over time

In [None]:
# number of programmes per start year, ordered by year rather than by frequency
date_counts = programmes_dataframe["Jaartal (start)"].value_counts().sort_index()
programmes_per_year = list(date_counts)
print(programmes_per_year)
plot_Y_against_X_as_bar_chart(
    x_axis=list(date_counts.index),
    y_axis=list(date_counts),
    plot_title="Number of programmes over time",
    x_axis_title="Year",
    y_axis_title="Number of programmes",
    margin=dict(t=50),
    filename="programmes-over-time",
)

## Oldest/newest programmes

In [None]:
oldest_year = int(programmes_dataframe["Jaartal (start)"].min())
newest_year = int(programmes_dataframe["Jaartal (start)"].max())

# now get the min of the dates within the oldest year
oldest_dates = programmes_dataframe[programmes_dataframe["Jaartal (start)"] == oldest_year]
oldest_broadcast = oldest_dates['Uitzenddatum'].min()

# fall back to the bare year when none of the programmes of that year has a known broadcast date,
# regardless of how many programmes there are in that year
print(f"Oldest programme: {oldest_year if pd.isna(oldest_broadcast) else oldest_broadcast}")

# now get the max of the dates within the newest year
newest_dates = programmes_dataframe[programmes_dataframe["Jaartal (start)"] == newest_year]
newest_broadcast = newest_dates['Uitzenddatum'].max()

# same fallback as for the oldest programme
print(f"Newest programme: {newest_year if pd.isna(newest_broadcast) else newest_broadcast}")

## Longest running programmes

In [None]:
# Length of the run in years. The difference is taken BEFORE missing values are replaced, so that a
# programme with only one of the two years known gets a length of 0 instead of a bogus value
# (e.g. 1990 - 0 = 1990), which would otherwise end up at the top of the chart.
first_year = programmes_dataframe["First year"].astype(float)
last_year = programmes_dataframe["Last year"].astype(float)
programmes_dataframe["Length of run"] = (last_year - first_year).fillna(0).astype(int)

run_length_dataframe = programmes_dataframe[['Titel', 'Length of run']].copy()

# take the maximum length of run for each programme
run_length_dataframe = run_length_dataframe.groupby(['Titel']).max().reset_index()
longest_running_programmes = run_length_dataframe.sort_values(by=['Length of run'], ascending=False)
print(longest_running_programmes)

# only the programmes with the longest runs are plotted
number_to_show = 10
plot_Y_against_X_as_bar_chart(list(longest_running_programmes['Titel'])[:number_to_show],
                            list(longest_running_programmes['Length of run'])[:number_to_show], 
                            f"Longest running programmes (top {number_to_show})", 
                            "Programme",
                            "Number of years programme ran for", 
                            dict(t=50), 
                            "longest-running-programmes" )


## Distribution over broadcaster

In [None]:
number_to_show = 10

# a programme can list several broadcasters; explode() gives each of them a row of its own before counting
broadcaster_counts = programmes_dataframe['Zendgemachtigde'].explode().value_counts()
print(broadcaster_counts)

top_broadcasters = list(broadcaster_counts.index)[:number_to_show]
top_broadcaster_totals = list(broadcaster_counts)[:number_to_show]
plot_Y_against_X_as_bar_chart(
    x_axis=top_broadcasters,
    y_axis=top_broadcaster_totals,
    plot_title=f"Number of programmes per broadcaster (top {number_to_show})",
    x_axis_title="Broadcaster",
    y_axis_title="Number of programmes",
    margin=dict(t=50),
    filename="programmes-per-broadcaster",
)

## Distribution commercial/public

In [None]:
public_commercial_counts = programmes_dataframe["Commerciele of publieke omroep?"].value_counts()
print(public_commercial_counts)

# value_counts() orders the categories by frequency, so the labels are taken from the counts themselves;
# a fixed label order could attach a label to the wrong segment
plot_pie_chart([str(label) for label in public_commercial_counts.index], 
                 list(public_commercial_counts),
                 "Public/commercial", 
                 {}, 
                 "public-commercial-dist", 
                 width=700)

## Appeared together with...
### Behind the camera
Crew or Creator

In [None]:
number_to_show = 10

# a programme can list several persons; explode() gives each of them a row of its own before counting
person_counts = programmes_dataframe['Persons behind camera'].explode().value_counts()
print(person_counts)

top_persons = list(person_counts.index)[:number_to_show]
top_person_totals = list(person_counts)[:number_to_show]
plot_Y_against_X_as_bar_chart(
    x_axis=top_persons,
    y_axis=top_person_totals,
    plot_title=f"Number of programmes with person behind the camera (top {number_to_show})",
    x_axis_title="Person",
    y_axis_title="Number of programmes",
    margin=dict(t=50),
    filename="programmes-per-person",
)

### In front of the camera
Cast, Guest, mentioned person, recognised face or voice

In [None]:
number_to_show = 10

# a programme can list several persons; explode() gives each of them a row of its own before counting
person_counts = programmes_dataframe['Persons in front of camera'].explode().value_counts()
print(person_counts)

top_persons = list(person_counts.index)[:number_to_show]
top_person_totals = list(person_counts)[:number_to_show]
plot_Y_against_X_as_bar_chart(
    x_axis=top_persons,
    y_axis=top_person_totals,
    plot_title=f"Number of programmes with person in front of the camera (top {number_to_show})",
    x_axis_title="Person",
    y_axis_title="Number of programmes",
    margin=dict(t=50),
    filename="programmes-per-person",
)

### Specific person

In [None]:
person = "Hans van Eijck"

# keep the row labels in a regular column, so that they can serve as the merge key below
programmes_dataframe['index'] = programmes_dataframe.index

# programmes in which the person is listed behind the camera
mask = programmes_dataframe['Persons behind camera'].apply(lambda names: person in names)
programmes_with_person_behind_camera = programmes_dataframe[mask]

# programmes in which the person is listed in front of the camera
mask = programmes_dataframe['Persons in front of camera'].apply(lambda names: person in names)
programmes_with_person_in_front_of_camera = programmes_dataframe[mask]

# the outer merge on the row label keeps a programme that occurs in both selections only once
all_programmes_with_person = programmes_with_person_behind_camera.merge(
    programmes_with_person_in_front_of_camera,
    how="outer",
    on="index",
)

print(f"There are {len(all_programmes_with_person)} programmes with {person} in total\n")

# the set removes titles that occur more than once
series_behind_camera = set(list(programmes_with_person_behind_camera["Titel"]))
print(f"There are {len(programmes_with_person_behind_camera)} programmes with {person} behind the camera, from the series:\n")
print('\n'.join(series_behind_camera))

series_in_front_of_camera = set(list(programmes_with_person_in_front_of_camera["Titel"]))
print(f"\nThere are {len(programmes_with_person_in_front_of_camera)} programmes with {person} in front of the camera, from the series:\n")
print('\n'.join(series_in_front_of_camera))