In [59]:
# Imports

# Data curation
import numpy as np
import pandas as pd
import datetime as dt

# Plotting
from bokeh.plotting import figure 
from bokeh.io import output_notebook, show
from bokeh.models import LabelSet, ColumnDataSource

# Configure Bokeh so that show() renders figures inline in the notebook output
output_notebook()

In [60]:
# Read the activity log: the first CSV column is used as the row index and the
# 'Date' column is parsed into datetime values
df = pd.read_csv('rwc.csv', parse_dates=['Date'], index_col=0)
df

Unnamed: 0,Date,Type,Distance_km,Hours,Minutes,Seconds,Time_h,Calories,ElevGain_m,AvgSpeed_km/h,Year,Month
0,2015-07-14,Walking,2.10,0,26,40,0.444444,89.0,28.0,4.725000,2015,7
1,2015-07-20,Cycling,21.18,1,8,13,1.136944,332.0,270.0,18.628879,2015,7
2,2015-07-25,Cycling,23.52,1,9,32,1.158889,390.0,327.0,20.295302,2015,7
3,2015-07-27,Running,6.94,0,42,4,0.701111,389.0,97.0,9.898574,2015,7
4,2015-07-29,Walking,1.73,0,20,36,0.343333,69.0,32.0,5.038835,2015,7
...,...,...,...,...,...,...,...,...,...,...,...,...
292,2021-01-05,Running,10.18,0,55,43,0.928611,691.0,213.0,10.962608,2021,1
293,2021-01-08,Cycling,23.50,0,45,0,0.750000,520.0,,31.333333,2021,1
294,2021-01-10,Running,10.18,0,55,9,0.919167,675.0,210.0,11.075249,2021,1
295,2021-01-12,Running,10.15,0,56,3,0.934167,689.0,211.0,10.865299,2021,1


In [142]:
def yearly_statistics(activity, statistic):
    '''
    Show a per-year bar chart of one statistic for one type of activity.

    The function reads the module-level ``df`` dataframe, keeps the rows whose
    ``Type`` equals ``activity``, aggregates them by ``Year`` and displays a Bokeh
    bar chart. The bar(s) with the minimum value are green, the bar(s) with the
    maximum value are red and every other bar is blue. Hovering over a bar shows
    that year's distance, time, calories, elevation gain, average speed and
    number of sessions.

    Parameters
    ----------
    activity : str
        One of 'Running', 'Walking' or 'Cycling'. For any other value the
        function prints a warning and prompts with ``input`` until a valid
        activity is typed.
    statistic : str
        One of 'Distance', 'Time' or 'Counter'. For any other value the
        function prints a warning and prompts with ``input`` until a valid
        statistic is typed.

    Returns
    -------
    None
        The figure is displayed with ``show``; nothing is returned.
    '''

    #####################################################################################################
    # Error handling of wrong parameter input
    #####################################################################################################

    # Accepted values for each parameter
    activity_options = ['Running', 'Walking', 'Cycling']
    statistic_options = ['Distance', 'Time', 'Counter']

    # Keep asking until one of Running, Walking or Cycling is given
    while activity not in activity_options:
        print('That is not a valid activity.')  # Warning message
        activity = input('Please choose one of the following activities [Running, Walking, Cycling]:\n')

    # Keep asking until one of Distance, Time or Counter is given
    while statistic not in statistic_options:
        print('That is not a valid statistic.')  # Warning message
        statistic = input('Please choose one of the following statistics [Distance, Time, Counter]:\n')

    #####################################################################################################
    # Data selection and curation
    #####################################################################################################

    # Rows of the chosen activity, grouped by year (filtered once and reused below)
    by_year = df.loc[df.Type == activity].groupby('Year')

    # Yearly totals. numeric_only=True keeps the non-numeric columns (Date, Type) out of the
    # aggregation; without it recent pandas versions raise a TypeError when summing datetimes
    activity_df = by_year.sum(numeric_only=True)

    # Distances are shown with 2 decimal cases, except for cycling where whole kilometers are used
    distance_decimals = 0 if activity == 'Cycling' else 2
    activity_df['Distance_km'] = activity_df['Distance_km'].round(distance_decimals)

    # The average speed is the yearly mean (not the sum), always rounded to 2 decimal cases
    activity_df['avg_speed'] = by_year['AvgSpeed_km/h'].mean().round(2)

    # Number of sessions per year (rows with a non-null Date)
    activity_df['count'] = by_year['Date'].count()

    # Column that drives both the bar colors and the bar heights for the chosen statistic
    height_choice = {'Distance': 'Distance_km', 'Time': 'Time_h', 'Counter': 'count'}[statistic]

    # Bar colors: green for the smallest value, red for the biggest, blue for the others.
    # The extremes are computed once instead of once per year
    values = activity_df[height_choice]
    lowest, highest = values.min(), values.max()
    color = ['green' if value == lowest else 'red' if value == highest else 'blue'
             for value in values]

    # Time labels such as '12h 5min'. The total is rounded to whole minutes *before* it is split
    # into hours and minutes, so a value like 1.9999 h becomes '2h 0min' rather than '1h 60min'
    time_spent = []
    for total_hours in activity_df['Time_h']:
        hour, minutes = divmod(int(round(total_hours * 60)), 60)
        time_spent.append(str(hour) + 'h ' + str(minutes) + 'min')

    # Add the columns to the dataframe
    activity_df['color'] = color
    activity_df['time_spent'] = time_spent

    #####################################################################################################
    # Plotting
    #####################################################################################################

    # Set the source as the curated dataframe (the 'Year' index becomes a column of the source)
    source = ColumnDataSource(activity_df)

    # Activity-specific wording used in the titles, axis labels and tooltips
    name = {'Running': 'Runs', 'Walking': 'Walks', 'Cycling': 'Bike Rides'}[activity]
    verb = {'Running': 'Run', 'Walking': 'Walked', 'Cycling': 'Cycled'}[activity]

    # Information when the mouse is hovered over the bars. The last entry is named after the
    # chosen activity instead of always reading "runs"
    tooltips = [('Distance', "@Distance_km{0,0.00} km"), ('Time', "@time_spent"),
                ("Calories burned", "@Calories{0,0}"), ("Cumulative Elevation Gain", "@ElevGain_m{0,0} m"),
                ("Average Speed", "@avg_speed{0.00} km/h"), ("Number of " + name.lower(), "@count")]

    # Title, y-axis label and the column used for the data labels, per statistic
    if statistic == 'Time':
        title = 'Amount of Time Spent ' + activity
        label = 'Hours'
        label_choice = 'time_spent'  # Show '12h 5min' rather than the decimal hours
    elif statistic == 'Distance':
        title = 'Number of Kilometers ' + verb + ' per Year'
        label = 'Kilometers'
        label_choice = height_choice
    else:
        title = 'Number of ' + name + ' per Year'
        label = 'Number of ' + name
        label_choice = height_choice

    # Instantiate the figure
    # NOTE(review): plot_width/plot_height (and render_mode below) are believed to be
    # deprecated/removed in newer Bokeh releases - confirm against the installed version
    sports_fig = figure(title=title, x_axis_label='Year', y_axis_label=label, tooltips=tooltips,
                        plot_width=700, plot_height=500, tools='save')

    # Tweak the title
    sports_fig.title.align = 'center'
    sports_fig.title.text_font_size = "20px"

    # Remove unnecessary graph elements: gridlines, x axis minor ticks and the outline
    sports_fig.xgrid.grid_line_color, sports_fig.ygrid.grid_line_color = None, None
    sports_fig.xaxis.minor_tick_line_color = None
    sports_fig.outline_line_color = None

    # Vertical bars, with the height taken from the chosen statistic
    sports_fig.vbar(x='Year', top=height_choice, width=0.9, source=source, color='color')

    # Data labels placed just above each bar
    labels = LabelSet(x='Year', y=height_choice, text=label_choice, level='glyph', text_align='center',
                      source=source, render_mode='canvas', y_offset=3)

    # Add the labels to the figure
    sports_fig.add_layout(labels)

    # Show the figure
    show(sports_fig)