In [1]:
# Get rid of code on export
from IPython.display import HTML
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# Welcome!

Welcome to my 2019 summary.  
This is a detailed analysis of what I've done so far during 2019. The data is collected from Toogl (the time tracking tool I use).

The first sections are quite tech stuff so feel free to jump to [section 1, overview](#1.-Overview). You can get back to here anytime clicking the [[Index]](#Index) link in every section. Alternatively you can consider installing [Table of Contents](https://github.com/jupyterlab/jupyterlab-toc) pluggin you won't regret.

# Index
* [Settings](#Settings)
* [Reload cache](#Reload-cache)
* [Load & clean data](#Load-and-clean-data)
* [1. Overview](#1.-Overview)
    * [Areas](#Areas:)
    * [Tags](#Tags:)
    * [A. Sleep over time](#A.-Sleep-over-time)
    * [B. Project continuity](#B.-Project-continuity)
    * [C. Time tracked throughoutt the year](#C.-Time-tracked-throughout-the-year)
* [2 BuildUp analysis](#2.-Buildup-analysis)
    * [2a. Buildup dedication per week](#2a.-BuildUp-dedication-per-week)
    * [2b. Buildup dedication per project](#2b.-BuildUp-dedication-per-project)
* [3 Billable analysis](#3.-Billable-analysis)
* [4. Core vs billable](#4.-Core-&-Billable-evolution-throughout-year)

In [157]:
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde

from datetime import date, time, timedelta
from calendar import month_abbr

from pyToggl import CliExpress
from my_plots import boxplot

# Plotting express
import matplotlib.pyplot as plt
import seaborn as sns

# Visualizations
from bokeh.plotting import figure, show, output_notebook, output_file
from bokeh.models import (
    ColumnDataSource, Range1d, Label, Span, FactorRange, LabelSet, )
from bokeh.layouts import row, column
from bokeh.transform import factor_cmap, linear_cmap, dodge
from bokeh.palettes import Category20, Accent3, viridis, Viridis256, Greens5

output_notebook()

# Settings
Define some global values.

[[Index]](#Index)

In [3]:
# Reload the info from the server
# otherwise use the baked csv (older_entries.csv)
RELOAD_CACHE = False

# Hard_code the list of tag columns.
# After cleaning we'll rebuilt it from the data
TAG_COLS = ['tag_0', 'tag_1', 'tag_2', 'tag_3', 'tag_4', 'tag_5', 'tag_6', 'tag_7']

# Reload cache
Fetch again the data from the server using `cli_express()`. (activate from [Settings](#Settings))

[[Index]](#Index)

In [4]:
if RELOAD_CACHE:
    # Creates/overwrites 'older entries.csv' that is data only from 2019
    CliExpress(reload=True)  

# Load and clean data
Import the data from the cache file (older entries.csv)

**Cleaning actions:**
* Get rid of Unnamed & date cols
* Convert start & stop to datetimes
* Make name a category called project
* Ensure entries are from 2019 only
* Ensure that tags appear only once in each entry
* Get durations in hours
* Add general areas to projects so we can assign a fixed color to them
* Chop midnight-crossing entries so they fit in natural days 
* Ensure task names are unique
* Get Week names
* Get a normalized dataframe (without sleep)



[[Index]](#Index)

In [5]:
df_raw = pd.read_csv('older entries.csv')

# Get rid of Unnamed & date cols
f = (pd.DatetimeIndex(df_raw.date).date != pd.DatetimeIndex(df_raw.start).date)
assert df_raw[f].empty  # Ensure date and start point to the same date

df = df_raw.drop(columns=['Unnamed: 0', 'date'])

# Convert start & stop cols to datetime
df.loc[:, 'start'] = pd.to_datetime(df.start)
df.loc[:, 'stop'] = pd.to_datetime(df.start)
# Ensure everyone has times
assert (df[df.stop.isna()].empty & df[df.start.isna()].empty)

# rename to project
df = df.rename(columns={'name': 'project'})

# older_entries should be 2019 data but ensure only that year data is used
f = pd.DatetimeIndex(df.start).year == 2019
d0 = df.copy()  # prevoius data
df = df[f]
assert d0.shape == df.shape  # And raise error if not

# Ensure that tags appear only once
TAGS = df[TAG_COLS].stack().unique()
for tag in TAGS:
    assert ~((df[TAG_COLS] == tag).sum(axis=1) > 1).any()

# Add main areas (for 2020 clients were used)
df['area'] = np.nan
c0 = {
    'shift_sleep': ['ShiftSleep', ],
    'personal': [
        'Shared.Time', 'healthyLife', 'Feng Shui', 'newCPU', 'redaction',],
    'kic': ['KiC',],
    'buildup': [
        'Superintelligence', 'Khan', 'wEssay', 'Les Mis', 'Jupyter HandBook',
        'mathStuffBox', 'Guggenheim', 'FlashCS', 'PyToggl', 'WhiteBoard',
        'csStuffBox', 'SW Hawking', 'buStuffBox', 'Infinite Powers', 
        'typing course', 'Japanese', 'Networks, crowds and markets', 'BCAM ML',
        'BCAM Decisions', 'Ultralearning', '2018 Summary'],
    'sport': ['Running', 'Bike', 'Swimming',],
    'collaborations': [
        'TZ Management', 'Urgoiti Lounges', 'Urban Adventures', 'Landing',
        'TZ-IT', 'Tz.stats', 'Tz web'],
    'billable': ['Reception', 'Tourne Tours', ]
    }
for k, v in c0.items():
    idx = df[df.project.isin(v)].index
    df.loc[idx, 'area'] = k


assert df[df.area.isna()].empty

# Have times durations in hours
df.loc[:, 'duration'] = (df.duration / 3600).round(2)


### Chop entries that cross 0am  ###
d0 = df.copy()

# Convert time zone
d0.loc[:, 'start'] = pd.DatetimeIndex(d0.start).tz_convert('Europe/Madrid')

# Forecast end for all entries (They had the same date as start)
d0.loc[:, 'stop'] = d0.start + pd.to_timedelta(d0.duration, unit='hours')

# Select the entries that start and finish on diff days
f = pd.DatetimeIndex(d0.start).date != pd.DatetimeIndex(d0.stop)

# We're about to end the entries that were already in the df at 23.59 and add
# a copy of them starting at 0am
s0, s1 = d0[f].copy(), d0[f].copy()

## End days ###
# Build the localized end date within the day
k0 = pd.DataFrame({
    'year': s0.start.dt.year,
    'month': s0.start.dt.month,
    'day': s0.start.dt.day,
    'hour': 23,
    'minute': 59,
    'second': 59,
})

k0 = pd.to_datetime(k0)
k0 = pd.DatetimeIndex(k0).tz_localize('Europe/Madrid')

# And replace in the slice
s0.loc[s0.index, 'stop'] =  k0

# Finally recalculate the duration
h0 = pd.TimedeltaIndex(s0.stop - s0.start).seconds / 3600
s0.loc[s0.index, 'duration'] = h0.values.round(2)

## Starting days ##
# Build the localized end date within the day
k1 = pd.DataFrame({
    'year': s1.stop.dt.year,
    'month': s1.stop.dt.month,
    'day': s1.stop.dt.day,
    'hour': 0,
    'minute': 0,
    'second': 0,
})

k1 = pd.to_datetime(k1)
k1 = pd.DatetimeIndex(k1).tz_localize('Europe/Madrid')

# And replace in the slice
s1.loc[s1.index, 'start'] =  k1

# Finally recalculate the duration
h1 = pd.TimedeltaIndex(s1.stop - s1.start).seconds / 3600
s1.loc[s1.index, 'duration'] = h1.values.round(2)


# Now replace crossing midnight entries by their chopped counterparts
d0.loc[s0.index, :] = s0

# And add the new day starting ones
d0 = pd.concat((d0, s1))

# Rearrange by start, reindex and get rid of the last row (it was cloned on 
# chopping)
d0 = d0.sort_values('start')
df = d0.reset_index(drop=True)
df = df.drop(index=df.index.max())

# Asign default to nan tasks
df.task.fillna('default', inplace=True)

# Ensure that task names are have unique names by groupping them along with 
# projects and then counting how many projects appear for each task. Should be
# 1 because of default task
k0 = df.groupby(['task', 'project']).count()
k0 = k0.reset_index().groupby('task').project.count()
assert (k0 > 1).sum() == 1  # the default project


# Get week names df
d0 = df[df.task == 'Week plan'].copy()
idx = d0[d0.description.duplicated()].index
d0 = d0.drop(index=idx).reset_index(drop=True).description
WEEK_NAMES = d0.str.split(' week| Week', expand=True)[0]

# Get the normalized dataframe (that without sleep)
f = df.area == 'shift_sleep'
dfn = df.drop(index=df[f].index)

## Plot settings
Once cleaned the data, add some settings related to plots since they rely on the data.

* **CAT_COLOR:** Define a consistent color across plots
* **TOOLS:** Define the tools that appear in the plots
* **MONTHS_DICT:** month dict to map axes.

## Sortcuts
Some useful shortcuts.
* **TAG_COLS:** the columns that carry info about the tags. Useful to search them.
* **UNIQUE_TAGS:** the unique tags across the df.

In [6]:
# A consistent project color map across plots
AREA_COLORS_DICT = {
    'kic': 'darkorchid',
    'billable': 'coral',
    'buildup': 'lightseagreen',
    'collaborations': 'darkseagreen',
    'personal': 'olive', 
    'sport': 'slategrey',
}

# Tools for the plots
TOOLS='hover,crosshair,pan,wheel_zoom,box_zoom,reset,tap'

# Month list for axes
MONTHS_DICT = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Abr', 5: 'May', 6: 'Jun', 
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}

# Month list for axes when they are weekly labeled
MONTHS_WEEKS_DICT = {
    0: 'Jan', 4: 'Feb', 8: 'Mar', 13: 'Abr', 17: 'May', 21: 'Jun', 26: 'Jul', 
    30: 'Aug', 34: 'Sep', 39: 'Oct', 43: 'Nov', 47: 'Dec', 52: 'Jan'}


# Get unique tags
TAG_COLS = dfn.columns[dfn.columns.str.contains('tag_')]
k0 = dfn[TAG_COLS].astype(str).values.ravel()
UNIQUE_TAGS = np.unique(k0)

## Methods

Useful methods

In [7]:
def IQR(df, col='duration', freq='SM'):
    """Make a dataframe of the interquartile range of the given column.
    
    returns interquartile df & outliers df
    """
    # Get > Q3 + 1.5 * IQR peaks
    Q1, Q3 = (d0[col].describe()['25%'], d0[col].describe()['75%'])
    iqr0 = Q3 - Q1
    lower = (Q1 - 1.5 * iqr0).astype(int)  # Lower bound for the box
    upper = (Q3 + 1.5 * iqr0).astype(int)  # Upper bound for the box
    
    # Resample column excluding outliers
    f = (df[col] <= upper) & (df[col] >= lower)
    d1, d2 = df[f], df[~f]
    a0 = [lambda x: np.percentile(x, 25), lambda x: np.percentile(x, 75), 
          np.mean, ]
    d1 = d1[col].resample(freq).agg(a0)
    d1['Q1'] = d1.iloc[:, 0]
    d1['Q3'] = d1.iloc[:, 1]
    d1 = d1.drop(['<lambda>', '<lambda>'], axis=1).reset_index()

    return d1, d2

def week_sample(df):
    """A convenience method that samples durations weekly.
    
    Arguments:
      df: a pandas dataframe with a start and duration columns.
      
    Returns: a dataframe with two columns, start (dt) & duration (float).
    """
    d0 = df.groupby(df.start.dt.date).duration.sum()
    dr = pd.to_datetime(d0.index) 
    d0 = d0.reindex(dr).resample('W-MON').sum()
    return d0.reset_index()

# 1. Overview

On tracking time, every activity is classified uniquely in a single project. These projects are classified in this notebook in areas.

### **Areas:**
There are 7 different areas:
* **Billable:** where I get the cashflow that keeps everything running.
* **Buildup:** thought of it as the R+D area.
* **Collaborations:** what is supposed to be work but delivered for free.
* **KiC:** Self organization stuff.
* **Personal:** a kind of stuffBox where everything that doesn't fit elsewhere ends up. Mostly is shared time with the family, actually.
* **Shift_sleep:** single project area considered aside due to its size.
* **Sport:** build body & mind

[[Index]](#Index)

In [8]:
# Projects description hardcode
k0 = df.groupby(['area', 'project']).duration.sum()


In [9]:
S1b = pd.DataFrame(k0)
S1b['description'] = [
    'The work at the hotel.',
    'Guided visits in the city.',
    'The previous year outcome',
    'A short course on decision theory at BCAM.',
    'A short course on ML at BCAM',
    'A collection of short computer science projects.',
    'Visits to expositions at the Guggenheim Museum.',
    'Infinite Powers\' book reading club management.',
    'Japanese language learning.',
    'A brief introduction to jupyter notebooks, numpy and pandas.',
    'Learning maths at Khan Academy.',
    'Les misèrables reading (in french)',
    'Study Networks crowds and Markets book by David Easley and Jon Kleinberg',
    'The program that let me take custom and personal insights of the time\
    tracked. Also used to check consistency in the records.',
    'A review of one of my all-time most beloved books: A Brief history of time.',
    'Book by Nick Bostrom',
    'A book by Scott Young to improve learning methods.',
    'A contest of math white/blackboards',
    'Asorted learning stuff (shallow) :' + 
    ', '.join(df[df.project == 'buStuffBox'].description.unique().tolist()),
    'Asorted cs learning stuff (shallow)',
    'Some math projects (shallow & deep) outside khan Academy',
    'Learn typing techniques',
    'A short essay on work',
    'The preparation to join Irontec (an IT company)',
    'Run the numbers in Trapuzarrak (a local business focused on tailor made basque outfits.)',
    'An app to manage production Trapuzarrak',
    'Create a web page for Trapuzarrak.',
    'The data analysis extracted from TZ app',
    'The new branch of guided visits',
    'A brief visual guide of services for the hotel.',
    'Self organization time',
    'Realocate stuff at home in deep',
    'Family time',
    'Take care about nutrition',
    'Set up new computer',
    '-',
    'Tracking sleep time (sometimes shifted by the shifttimes at the hotel)',
    'Ride',
    'Mostly trail running',
    'Indoor pool',]
print('Showing projects that took more than the 1% of the time tracked')
S1b[S1b.duration > dfn.duration.sum() * .01]
S1b

Showing projects that took more than the 1% of the time tracked


Unnamed: 0_level_0,Unnamed: 1_level_0,duration,description
area,project,Unnamed: 2_level_1,Unnamed: 3_level_1
billable,Reception,960.89,The work at the hotel.
billable,Tourne Tours,350.88,Guided visits in the city.
buildup,2018 Summary,102.85,The previous year outcome
buildup,BCAM Decisions,30.74,A short course on decision theory at BCAM.
buildup,BCAM ML,94.86,A short course on ML at BCAM
buildup,FlashCS,118.34,A collection of short computer science projects.
buildup,Guggenheim,6.47,Visits to expositions at the Guggenheim Museum.
buildup,Infinite Powers,67.57,Infinite Powers' book reading club management.
buildup,Japanese,31.41,Japanese language learning.
buildup,Jupyter HandBook,1.73,"A brief introduction to jupyter notebooks, num..."


These are rigid areas that enclose rigid projects but often areas and projects overlap one another. And there is where tags come into scene spanning tracked activities through projects and areas:

### **Tags:**
* **BuildUp:** the tag that binds all the buildup acivities (regardless they are in a buildup project)
    * **Core:** the key activities in buildup projects, that is, math & computer science topics.
        * **Python:** all the python programming stuff.
            * **Jupyter:** numpy, pandas and jupyter notebooks skill development.
        * **Math:** all the math activity.
            * **Calculus:** math activities related to calculus.
            * **Linear Algebra:** math activities related to linear algebra.
        * **ML:** Machine learning topics usually involving above tags.
        * **Web tech:** web skills (html, css, js)
    * **Lang:** language learning bundle.
    * **Reading:** recreational reading or shallow learning.
* **Graph:** graphical design works.
* **Iratxe:** time spent with my partner.
* **Family:** family time.
* **Home:** Activities performed at home.

**Caveats:**  
Shift sleep was imported from swipetimes (the old time tracking app) so it didn't carry info about sleep at home or, for instance, a short nap in the bus. 


**Extra data:**  
**S1a:** The hours by project not in buildup area but having buildup tag   
**S1b:** The description of all projects carried out.

[[Index]](#Index)

In [10]:
# Get the projects with BuildUp tag but outside buildup area
f0 = (df.area != 'buildup')
f1 = ((df[TAG_COLS] == 'BuildUp').sum(axis=1) == 1)
S1a = df[f0 & f1].groupby('project').duration.sum()

# Create areas' data
d0 = dfn.groupby('area').sum()
new_order = [
    'billable', 'collaborations', 'buildup', 
    'personal', 'sport', 'kic']
d0 = d0.reindex(new_order).reset_index()

# Create tags' data
k0 = [df.groupby(col).duration.sum() for col in TAG_COLS]
k0 = pd.concat(k0).groupby(level=0).sum()
k0 = (k0).round(1)
k0 = k0.reset_index().rename(columns={'index': 'tag'})

f = (
    (k0.tag.isin(np.linspace(1.5, 5.0, 8).astype(str))) |
    (k0.duration < 100) |
    (k0.tag == 'tip')
)
k0 = k0.drop(k0[f].index)
k1 = (df[df.billable == True].duration.sum()).round(1)
k0.loc[24, :] = ['billable', k1]
k0 = k0.sort_values('duration', ascending=False)


## Plot ##
s0 = ColumnDataSource(d0)
s1 = ColumnDataSource(k0)

# Create the category colors
colors =  [
    AREA_COLORS_DICT[area] for area in d0.area.tolist()]
m1 = factor_cmap('area', colors, factors=d0.area)

p1 = figure(
    x_range=d0.area, height=400, width=500, toolbar_location=None,
    title='Time Distribution between project\'s areas',
    x_axis_label='Area', y_axis_label='Hours', tooltips='@area: @duration h')

p1.vbar(x='area', top='duration', width=.7, fill_alpha=.7, color=m1, source=s0)

p2 = figure(
    x_range=k0.tag, height=400, width=500, toolbar_location=None, 
    title='Time Distribution between tags', x_axis_label='Tag', 
    tooltips='@tag: @duration h')

m2 = factor_cmap('tag', Category20[12], factors=k0.tag)
p2.vbar(x='tag', top='duration', width=.7, fill_alpha=.7, color=m2, source=s1)

p2.xaxis.major_label_orientation = np.pi/4

show(row(p1, p2))

In [11]:
k0 = dfn.copy()
k0['month'] = k0.start.dt.month
k0 = k0.pivot_table(
    index='month', columns='area', values='duration', aggfunc=np.sum)
k0['month_name'] = [month_abbr[n] for n in range(1, 13)]


from bokeh.models import FactorRange
s0 = ColumnDataSource(k0.reset_index())
cols = k0.columns.tolist()[:-1]
x_range = FactorRange(factors=k0.month_name.tolist())
p1 = figure(
    width=900, height=400, y_range=(0, 500), x_range=x_range,
    title='Time distribution per area and per month',
    x_axis_label='Months', y_axis_label='Hours')
p1.varea_stack(
    cols, x='month_name', source=s0, fill_alpha=.7,
    color=[AREA_COLORS_DICT[area] for area in cols],
    legend_label=cols)

p1.legend.orientation = 'horizontal'
p1.legend.location = 'top_left'
    
show(p1)

## A. Sleep over time

Sleep is a fundamental part in everyone's life. In my case it's vital to keep an eye on it due to the time spent at the hotel --nights-- so it's one of the parameters I control from pyToggl script. All the time slept is tracked whether regular nigth sleep, morning sleep after hotel or even a bus nap. Notice also that many days I went from the hotel right to a tour delaying the sleep to the afternoon and even to the nigth.

Below graphs show the daily sleep through the year per month and its probability distribution. 

**Keys:**
* June is the most intense month because of the hotel and the tour high season, whereas I' say that january is the easiest.
* On July there's a day with no sleep at all that corresponds to a 168km trail running race I took part in.

As shown in the distribution plot, most likely I'm going to be sleeping 7.49h if randomly chosen, but on average I ended up sleeping 6.9h.

[[Index]](#Index)

In [12]:
f = (df.area == 'shift_sleep')
d0 = df[f].copy()

d0 = d0.set_index(d0.start)
d0 = d0.resample('D').sum()
d1 = d0.duration

# Plot the boxplot
h1 = 350
p1 = boxplot(d0.index.month.values, d1.values,)
p1.title.text = 'Daily sleep per month'
p1.xaxis.axis_label = 'months'
p1.yaxis.axis_label = 'hours'
p1.xaxis.ticker = [n for n in range(1, 13)]
p1.xaxis.major_label_overrides = MONTHS_DICT
p1.width, p1.height = 600, h1

# Plot the kernel density estimation
kernel = gaussian_kde(d1)
x=np.linspace(d1.min(), d1.max(), d1.count())
kde = pd.DataFrame({'x': x, 'kernel': kernel(x)})

s0 = ColumnDataSource(kde)

p2 = figure(
    title='Sleep hours per day kde', width=300, height=h1,
    tools=TOOLS, x_axis_label='Hours (std)')
p2.line(x='x', y='kernel', source=s0)
mean = Span(
    location=d1.mean(), dimension='height',
    line_dash='dashed', line_width=2, line_color='red')


ticks = [d1.mean() - n*d1.std() for n in (4, 3, 2, 1, 0, -1, -2)]
p2.xaxis.ticker = ticks
p2.add_layout(mean)


show(row(p1, p2))

## B. Project continuity
The following plot shows the continuity of projects throghout the year, a view of simultaneously open projects. That is, represents the projects where there's some kind of progress in a given week. 

**Keys:**  
* B the end of July there were several projects that suddenly stopped
* Also, July's 27, 28, 29 were the weeks --aside of the first and last-- with less projects (<13) due to a couple of long races that took place that likely produced a lack of concentration.
* The most productive weeks (in terms of open projects) were November's 43, 44, 46, 48, 49 with more than 20 projects open.
* Buildup area projects, by far, are majority.
* The dashed line represents the long runs week that acted like a switch point



[[Index]](#Index)

In [13]:
d0 = dfn.copy()
d0['week'] = pd.DatetimeIndex(dfn.start).week
d0 = d0.pivot_table(index='project', columns='week', values='duration')

# Order by dedication
d0['counts'] = d0.count(axis=1)
d0 = d0.sort_values('counts', ascending=False)
c0 = d0.counts.astype(str)  # to show in the plot
d0 = d0.drop(columns='counts')

# Select only the projects that have more than threshold weeks dedication
t0 = 10
f = (~d0.isna()).sum(axis=1) > t0
d0 = d0[f.values]

# Fill the weeks with dedication with a whole number depending the project
rows = d0.shape[0]
y0 = (
    np.full(shape=d0.shape, fill_value=1) + 
    (np.arange(rows).reshape(1, rows) + .5).T)
f = d0.isna()
d0[~f] = y0

# Transpose the df so weeks lay in the index
d0 = d0.T

# Set some colors to match areas
colors = [
    AREA_COLORS_DICT[dfn[dfn.project == project].sample(1).area.values[0]] 
    for project in d0.columns]

### PLOT ###
s0 = ColumnDataSource(d0)
r0 = Range1d(-7, 54)
p0 = viridis(d0.shape[1])
t1 = 'Project dedication (>{} weeks of dedication this year)'.format(t0)

p1 = figure(
    plot_height=600, plot_width=950, x_range=r0, toolbar_location='above',
    title=t1, y_axis_label='Projects', x_axis_label='Week (ticked by month)')

for n, col in enumerate(d0.columns):
    p1.line(x='week', y=col, source=s0, line_width=10, color=colors[n], 
            line_cap='round')
    l1 = Label(
        x=-6, y=n+1.5, text='{}, {}'.format(c0[n], col),
        text_font_size='12px', text_baseline='middle')
    p1.add_layout(l1)

# Add the 28th week marker
twenty8 = Span(
    location=28, dimension='height', line_color='coral', line_dash='dashed',
    line_width=3)
p1.add_layout(twenty8)

p1.yaxis.visible = False
p1.yaxis.ticker = np.arange(2, d0.shape[1] + 1)
p1.xaxis.bounds = (0, 52)
p1.xaxis.ticker = np.linspace(0, 52, num=13).astype(int)
p1.xaxis.major_label_overrides = MONTHS_WEEKS_DICT

show(p1)

## C. Time tracked throughout the year

Finally, the daily time tracked per month.

**Keys:**  
* December is the month with more time tracked on average per day, maybe because of the christmas when there are a bunch of hours spent with the family.
* The outliers appearing in july account for the two long runs I did.


[[Index]](#Index)

In [14]:
d0 = dfn.copy()
d0 = d0[['start', 'duration']].set_index('start')
d0 = d0.resample('D').sum()
p = boxplot(d0.index.month.values, d0.duration.values,)
p.title.text = 'Daily time tracked per month'
p.xaxis.axis_label = 'months'
p.yaxis.axis_label = 'hours'
p.width, p.height = 900, 400
p.xaxis.ticker = [n for n in range(1, 13)]
p.xaxis.major_label_overrides = MONTHS_DICT
show(p)

## D. Year worksheet

A github style --work-- contributions during the year.

Black days represent days with no dedication at all, those are:

| Date | Position | Description |
| ---- | -------- | ----------- |
| Jan 1 | (1, 1) | Family day |
| March 28 | (13, 4) | Zegama aizkorri long run |
| July 13 | (28, 6) | Ehun milak |
| July 19-21-22 | (29, (5, 6, 7)) | Utra Valnord |

Also there is workload per week plot with a reference to a regular 40h week to get a feel how intense weeks are.

**Most intense day:** April 11th, (15, 4) 15.86h. All the night at the hotel, then some coding in the morning (pyToggl). Tour, some more coding and black holes conference in the afternoon.

In [160]:
def x():
    """Come up with the x coordinate.

    The x coordinate are the weeks in the year (53), the first one having 6
    days and the last one (officially belonging to jan 2020) having two days.
    """
    first_week = np.ones(6)
    last_week = np.full(2, 53)
    inner_weeks = np.concatenate([np.full(7, n) for n in range(2, 53)])
    x = np.concatenate((first_week, inner_weeks, last_week))
    assert x.size == 365
    return x


def y():
    """Come up with the y coordinate.

    The y coordinate are the days within a week. Also, the first one has 6
    days and the last one (officially belonging to jan 2020) has two days.
    """
    first_week = np.arange(1, 7)
    last_week = np.arange(1, 3)
    inner_weeks = np.concatenate([np.arange(1, 8) for _ in range(2, 53)])
    y = np.concatenate((first_week, inner_weeks, last_week))
    assert y.size == 365
    return y

# Get durations per day ensuring that every day in the year has a value, even 0
work = df[df.area.isin(['billable', 'collaborations', 'buildup', ])]
work = work.groupby(df.start.dt.date).duration.sum()
work = work.reindex(pd.date_range('2019-1-1', '2019-12-31'))
assert work.size == 365
work = work.reset_index()
work['x'] = x().astype(int)
work['y'] = y()
work_load = work.groupby('x').duration.sum().reset_index()

## Plot ##
# Uncomment the following line to export the plot to a html file. Notice that 
# this will have the side effect to export every other plot that calls show()
# afterwards, so use once and restart the kernel to avoid that behavior.
# output_file('Outputs/worksheet.html')

# The first plot is a github style contribution sheet intended to show the
# intensity per day (and days with no activity at all)
s0 = ColumnDataSource(work)
cmap = linear_cmap(
    'duration', palette=Greens5[::-1], low=0, high=work.duration.max(), 
    nan_color='black')
p1 = figure(
    plot_height=200, plot_width=950, y_range=(0, 8), x_range=(0, 55),
    title='2019 WorkSheet', tools=TOOLS, y_axis_label='days', 
    tooltips='@duration h', toolbar_location=None)
p1.square('x', 'y', source=s0, size=10, color=cmap, line_color=Greens5[0])

# Now plot the work load per week to get a feel of what means in terms of a
# regular 40h work week.
s1 = ColumnDataSource(work_load)
p2 = figure(
    plot_height=200, plot_width=950, y_range=(0, 80), x_range=(0, 55), 
    y_axis_label='workload', x_axis_label='weeks', tooltips='@duration h',
    toolbar_location=None)
p2.vbar(x='x', top='duration', source=s1, width=.7, fill_alpha=.5,
        color=Greens5[1])
forty_hours_limit = Span(location=40, dimension='width', line_color='red')
p2.add_layout(forty_hours_limit)

p1.xgrid.grid_line_color = None
p1.ygrid.grid_line_color = None
p1.xaxis.ticker = np.append(1, np.arange(1, 11) * 5)
p2.xaxis.ticker = np.append(1, np.arange(1, 11) * 5)
p1.yaxis.ticker = np.arange(1, 8)
p2.yaxis.ticker = 10 * np.arange(9)
show(column(p1, p2))

# 2. Buildup analysis

## 2a. BuildUp dedication per week
The first visualization of this section shows the dedication in hours per week. Also the buildup tag dedication is shown because, as said in the begining, it spans over other areas (mostly in Collaborations)

**Keys:**
* Visually, it's clear that by 29th week (after the long runs) there was a break point after which buildup dedication was increased, steeper in the case of buildup tag --uncomment to show--
* By the end of the year buildup activity was intense but mainly because of the new features designed for TZ-IT (part of collaborations)


[[Index]](#Index)

In [159]:
bu_entries = dfn[dfn.area == 'buildup']

# Ensure all entries are tagged as BuildUp
f2 = ((df[TAG_COLS] == 'BuildUp').sum(axis=1) == 1)
f3 = df.area == 'buildup'  # entries classified as Buildup Area
assert df[f3 & ~f2].empty

# The list of projects in buildup area (remove if not used finally)
S1b.loc['buildup', :].sort_values('duration', ascending=False)  

# Also display BuildUp Tag
f = ((dfn[TAG_COLS] == 'BuildUp').sum(axis=1) == 1)
bu_tag = dfn[f]

# get the data
d0, d1 = [week_sample(x)[:-1] for x in (bu_entries, bu_tag,)]
d0 = d0.join(d1.duration, rsuffix='_tag')
d0['start'] = d0.start - timedelta(days=6)
d0['date'] = d0.start.astype(str)
d0['week'] = 1 + d0.index
# No area durations over tag durations
assert d0[d0.duration > d0.duration_tag].empty

# Calculate the mean after and before the 28th week break
breakpoint = d0[d0.duration == d0.duration.min()].index
before = d0[d0.index <= breakpoint[0]]
after = d0[d0.index > breakpoint[0]]
d0.loc[before.index, 'mean'] = before.duration.mean()
d0.loc[after.index, 'mean'] = after.duration.mean()

### Plot ###
s0 = ColumnDataSource(d0)

p1 = figure(
    plot_height=400, plot_width=950,
    title='BuildUp dedication per week', x_axis_type='datetime',
    tools=TOOLS, y_axis_label='hours per week', x_axis_label='Date',
    tooltips='@week) @date: @duration h', toolbar_location='above')

p1.vbar(
    x='start', top='duration', width=timedelta(days=5), source=s0,
    fill_alpha=.6, color=AREA_COLORS_DICT['buildup'],legend_label='Area')

p1.vbar(
    x='start', top='duration_tag', width=timedelta(days=5), source=s0, 
    fill_alpha=.1, color=AREA_COLORS_DICT['buildup'], legend_label='Tag',)

p1.line(x='start', y='mean', source=s0, legend_label='mean', line_width=3, 
        line_alpha=.7, color='coral')
try:
    d0['mean_tag']
except KeyError:
    pass
else:
    p1.line(
        x='start', y='mean_tag', source=s0,
        legend_label='mean_tag', line_width=3, line_alpha=.7, color='blue')

p1.legend.location = 'top_left'


show(p1)

## 2b. BuildUp dedication per project

The following plot shows the dedication per project as well as the main areas of interest tracked by tags.

The main areas of interest are: math, calculus, ml, python, 

**Keys:**
* Only projects with more than 15h in the year are considered.
* Khan --Academy-- concentrates a lot of time  by itself.
* As expected FlashCS, a collection of short asorted projects related to computer science, have the widest range of areas of interest.
* Conversely, wEssay, a short essay about work, has the least interaction with the main areas of interest, although the project was quite interesting.


[[Index]](#Index)

In [16]:
k0 = dfn[dfn.area == 'buildup']

# First extract the time invested per tag (of interest)
tags = [
    'Math', 'Calculus',
    'ML',  'Python', 'Jupyter',]

series = list()
for tag in tags:
    f0 = ((k0[TAG_COLS] == tag).sum(axis=1) == 1)
    f0 = k0[f0].groupby('project').duration.sum()
    f0.name = tag
    series.append(f0)
k0 = pd.concat(series, axis=1)

# Now get the total durations for those projects
k1 = dfn[dfn.project.isin(k0.index)]
k1 = k1.groupby('project').duration.sum().sort_values(ascending=False)

# Finally join the global durations to the tag durations
k0 = k0.reindex(k1.index)
k0['duration'] = k1.values
k0 = k0[k0.duration > 20]

### Plot ###
s0 = ColumnDataSource(k0)

tooltips = [
    ('Dedication', '@duration h'),
    ('Math', '@Math h'),   
    ('Calculus', '@Calculus h'),   
    ('ML', '@ML h'),
    ('Python', '@Python h'),
    ('Jupyter', '@Jupyter h'),]

p = figure(
    x_range=k0.index.values, plot_height=500, plot_width=900,
    title='BuildUp dedication per project',
    tools=TOOLS, y_axis_label='Year hours', x_axis_label='Project',
    tooltips=tooltips)

p.vbar(
    x='project', top='duration', width=.83, source=s0, fill_alpha=.4,
    color=AREA_COLORS_DICT['buildup'], legend_label='Dedication')

offset = np.linspace(-.35, .35, 5)
colors = viridis(8)
for n, col in enumerate(k0.columns[:-1]):
    p.vbar(
        x=dodge('project', offset[n], range=p.x_range), top=col, width=.1,
        color=colors[n+1], legend_label='%' + col, source=s0, fill_alpha=.7)

p.xaxis.major_label_orientation = np.pi/4

show(p)

## 2c. Subproject analysis --Core only--

Certain core projects --that is, projects involving one of the areas of interest-- have subprojects inside, well because of their asorted nature, like FlashCS, or because of the logical subsection splits. The following set of plots show these areas. Hovering on the cols, a short description of the area is shown.

Depending the area of interest we set three categories:
* **Math:** projects that track in the math tag but few ML tag
    * Khan
    * mathStuffBox
* **CS:** projects that track in the python tag but few ML tag
    * 2018 Summary
    * pyToggl
* **ML:** projects that track mostly on the ML tag 
    * FlashCS
    * BCAM ML
    * BCAM Decisions

Notice that only projects that track over 80% on the Core tag are considered
    

### 2c_1: Math projects

As seen on figure 2b, dedication largely goes in Khan academy project where I was learning multivariable calculus, topic that greatly improved the understanding of neural networks fancy terms like gradient descent. Aside, mathStuffBox are mostly collections of short puzzles I came across during the year. Specially nice that of [Galperin](https://www.maths.tcd.ie/~lebed/Galperin.%20Playing%20pool%20with%20pi.pdf).

In [17]:
descriptions = pd.read_csv('task_descriptions.csv', index_col=0)
math_dfs = list()
projects = ('Khan', 'mathStuffBox')
for project in projects:
    k0 = df[df.project == project]
    k0 = k0.groupby('task').duration.sum().reset_index()
    k0 = pd.merge(k0, descriptions, on='task', how='left')
    # Ensure every task has description
    assert k0.description.isna().sum() == 0
    math_dfs.append(k0)

# Plot
plots = list()
for n, s in enumerate(math_dfs):
    source = ColumnDataSource(s)
    colors = viridis(math_dfs[n].index.size +2)
    c_map = factor_cmap('task', colors, math_dfs[n].task)
    title = '{} subprojects time distribution'.format(projects[n])
    y_range = Range1d(0, 100)
    p = figure(
        x_range=math_dfs[n].task, width=450, height=400, title=title,
        toolbar_location=None, tooltips='@description', y_range=y_range)

    p.vbar(
        x='task', top='duration', width=.7, source=source, fill_alpha=.5, 
        color=c_map)
    p.xaxis.major_label_orientation = np.pi/4
    if n == 0:
        p.yaxis.axis_label = 'Dedication in hours'
    plots.append(p)

show(row(plots))

### 2c_2: CS projects
Noticeable that as the analysis goes on the time dedicated on each section is decreasing, these could be the causes:
* Because in the begining one has to figure out the design and come up with the code. 
* Buildup and OpK were intense areas of work and so they have a decent chunk of data to analyze
* Also, can be caused because late in the year I was running out of time and I had to finish it.

PyToggl has no subsections so it's not plotted.

In [18]:
k0 = df[df.project == '2018 Summary']
k0 = k0.groupby('task').duration.sum().reset_index()
k0 = pd.merge(k0, descriptions, on='task', how='left')
assert k0.description.isna().sum() == 0  # Ensure every task has description

# Order by duration and send Sheet to the pole position afterwards
k0 = k0.sort_values('duration')
k0.reset_index(drop=True, inplace=True)
k0 = k0.reindex([0, 1, 2, 4, 5, 3])


# Plot cs_dfs
source = ColumnDataSource(k0)
colors = viridis(k0.index.size +2)
c_map = factor_cmap('task', colors, k0.task)
p = figure(
    y_range=k0.task, width=700, height=400, x_axis_label='Dedication in hours',
    title='2018 Summary subsections time distribution',
    y_axis_label='Subsection', tools=TOOLS, tooltips='@description')

p.hbar(
    y='task', right='duration', height=.7, source=source,
    fill_alpha=.5, color=c_map)


show(p)

### 2c_3: ML projects

Finally Machine learning related projects.

**Keys**
* By far Neural networks took the most of the time
* EuroScipy included not only the daily tutorials but also some introductory work on wavelets and Astronomical data processing (topic that I liked a lot)
* BCAM decision theory course was fairly beyond my skills (although I could understand more than expected) so I didn't dedicate much time overall. Suprisingly, early works on minimax algorithm were useful.

In [19]:
descriptions = pd.read_csv('task_descriptions.csv', index_col=0)
ml_dfs = list()
projects = ('FlashCS', 'BCAM ML', 'BCAM Decisions')
for project in projects:
    k0 = df[df.project == project]
    k0 = k0.groupby('task').duration.sum().reset_index()
    k0 = pd.merge(k0, descriptions, on='task', how='left')
    # Ensure every task has description
    assert k0.description.isna().sum() == 0
    ml_dfs.append(k0)

# Plot
plots = list()
for n, s in enumerate(ml_dfs):
    source = ColumnDataSource(s)
    colors = viridis(ml_dfs[n].index.size +2)
    c_map = factor_cmap('task', colors, ml_dfs[n].task)
    title = '{} subprojects time distribution'.format(projects[n])
    y_range = Range1d(0, 85)
    p = figure(
        x_range=ml_dfs[n].task, width=320, height=400, title=title,
        tooltips='@description', toolbar_location=None, y_range=y_range)

    p.vbar(
        x='task', top='duration', width=.7, source=source, fill_alpha=.5, 
        color=c_map)
    p.xaxis.major_label_orientation = np.pi/4
    if n == 0:
        p.yaxis.axis_label = 'Dedication in hours'
    
    plots.append(p)

show(row(plots))

## 2d. The Buildup-Collaboration overlap

Buildup activities span over different areas, mainly on collaborations in a sort of learn by doing, reaching more than 1500h per year.

In [20]:
bu_area = df[df.area == 'buildup']
collab_area = df[(df.area == 'collaborations')]
f = (collab_area[TAG_COLS] == 'BuildUp').sum(axis=1) == 1
collab, intersection = collab_area[~f], collab_area[f]

values = [sample.duration.sum() for sample in (bu_area, intersection, collab)]
s0 = ColumnDataSource(data={
    'y': [1, ],
    'bu': [values[0], ],
    'it': [values[1], ],
    'co': [values[2], ], })

s1 = ColumnDataSource(data={
    'x': [500, 1290, 1650,],
    'y': [.9, .9, .9,],
    'text': ['{} h'.format(v) for v in values],
})

# Plot
r0 = Range1d(0, 2)
p = figure(
    width=900, height=200, x_axis_label='Dedication in hours', y_range=r0,
    title='BuildUp collaboration overlap', toolbar_location=None)

colors = viridis(10)
p.hbar_stack(
    ['bu', 'it', 'co'], y='y', source=s0, height=.3, fill_alpha=.5,
    legend_label=['BuildUp', 'BuildUp ∩ Collab', 'Collaborations'],
    color=colors[3:6])

p.legend.orientation = 'horizontal'
labels = LabelSet(
    x='x', y='y', text='text', source=s1,
    text_align='center', text_font_size='11px')
p.add_layout(labels)



show(p)

### 2d_a Tz app analysis
389 hours (~82%) of above overlap are devoted to tz application. If including tz.stats, the project that analyzes the data from the app, both cover almost the 95% of the buildup time in collaborations.  
Below are plotted the different features shipped, some other minor improvements and the time debbuging code. Hovering over the bars appears a short description.    
**Keys:**
* The dashed lines represent the 25% & 75% quartiles and the mean of major feature implementation.  
* The most active months are the first (rushing to have the new set of features ready for the new year) and december where new features are being built for the following year

In [21]:
# Assing descriptions to tasks
k0 = df[df.project == 'TZ-IT']
k0 = k0.groupby('task').duration.sum().reset_index()
k0 = pd.merge(k0, descriptions, on='task', how='left')
assert k0.description.isna().sum() == 0  # Ensure every task has description

# Order by duration and send Sheet to the pole position afterwards
k0 = k0.sort_values('duration')
k0.reset_index(drop=True, inplace=True)

# Get the mean and the IQR for features
f = k0.task.str.contains('Bug|Minor')
features = k0[~f]

# Plot cs_dfs
source = ColumnDataSource(k0)
colors = list(viridis(k0.index.size +2))
colors[17], colors[20] = 'firebrick', 'darkorange'
c_map = factor_cmap('task', colors, k0.task)
p = figure(
    y_range=k0.task, width=700, height=500, x_axis_label='Dedication in hours',
    title='Features, improvements & bug tracker',
    y_axis_label='Features', tools=TOOLS, tooltips='@description')

p.hbar(
    y='task', right='duration', height=.7, source=source,
    fill_alpha=.5, color=c_map)
q1 = Span(
    location=features.duration.describe()['25%'], dimension='height',
    line_color='darkslategray', line_dash='dashed', line_width=2)
mean = Span(
    location=features.duration.describe()['mean'], dimension='height',
    line_color='darkslategray', line_dash='dashed', line_width=1)
q3 = Span(
    location=features.duration.describe()['75%'], dimension='height',
    line_color='darkslategray', line_dash='dashed', line_width=2,)
p.add_layout(q1)
p.add_layout(mean)
p.add_layout(q3)

show(p)

In [22]:
tz = df[df.project == 'TZ-IT'].copy()
f = tz.task.str.contains('Bug|Minor')
idx = tz[~f].index
tz.loc[idx, 'task'] = 'Major features'
tz = tz.pivot_table(
    index=tz.start.dt.month, columns='task', values='duration', aggfunc='sum')
tz['month'] = month_abbr[1:13]
tz = tz.set_index('month').fillna(0)
tz = tz[['Major features', 'Minor improvements', 'Bug Tracker']]

# Plot
s0 = ColumnDataSource(tz)

tooltips = [
    ('Major features', '@{Major features}h'),
    ('Minor improvements', '@{Minor improvements}h'),
    ('Bug Tracker', '@{Bug Tracker}h')
]

p = figure(
    x_range=tz.index.tolist(), width=900, height=400, x_axis_label='Month',
    y_axis_label='Dedication in hours', tools=TOOLS, tooltips=tooltips,
    title='Features, improvements & bugs during the year',)

p.vbar_stack(
    tz.columns.to_list(), x='month', source=s0, width=.8, fill_alpha=.5,
    legend_label=tz.columns.to_list(),
    color=['teal', 'darkorange', 'firebrick'])

p.legend.location = 'top_left'


show(p)

🏗️ Work in progress down here

# 3. Billable analysis

**Extra data**  

[[Index]](#Index)

# 4. Core & Billable evolution throughout year

**Extra data**  
**S4a:** days without activity

[[Index]](#Index)

In [23]:
# Select Core entries
f = ((dfn[TAG_COLS] == 'Core').sum(axis=1) == 1)
d0 = dfn[f]
d0 = d0.groupby(pd.DatetimeIndex(d0.start).date).duration.sum()

# Select Billable entries
f = dfn.billable == True
d1 = dfn[f]
d1 = d1.groupby(pd.DatetimeIndex(d1.start).date).duration.sum()

# Join both dfs
cols = {'duration_x': 'core', 'duration_y': 'billable'}
d0 = pd.merge(d0, d1, left_index=True, right_index=True, how='outer').rename(columns=cols)

# Add the days without activity
r0 = pd.date_range('2019-01-01', '2019-12-31')
d0 = d0.reindex(r0).fillna(0)
S4a = d0[(d0 == 0).all(axis=1)]
print('Days without activity:', S4a.shape[0])

d0 = d0.resample('W').sum()

d0['diff'] = d0.core.cumsum() - d0.billable.cumsum()
d0['prt_date'] = d0.index.astype(str)


### Plot ###
s0 = ColumnDataSource(d0)

pdd = timedelta(days=10)  # padding
r0 = Range1d(d0.index.min()-pdd, d0.index.max()+pdd)
r1 = Range1d(0, 55)
t0 = [
    ('Date', '@prt_date'),
    ('Core', '@core'),
    ('Billable', '@billable'),
    ('Diff', '@diff'),
]
p1 = figure(plot_height=350, plot_width=1200, x_range=r0, y_range=r1,
            title='Core vs Billable', x_axis_type='datetime', tooltips=t0,
            tools=TOOLS, y_axis_label='hours per week', x_axis_label='Date')

p1.vbar(x='index', top='core', width=timedelta(days=5), source=s0, fill_alpha=.6,
       color=AREA_COLORS_DICT['buildup'], legend_label="Core")

p1.vbar(x='index', top='billable', width=timedelta(days=5), source=s0, fill_alpha=.6,
       color=AREA_COLORS_DICT['billable'], legend_label="Billable")

p1.legend.orientation = 'horizontal'
p1.legend.location = 'top_left'
p1.xaxis.visible = False


# Plot the difference
areas = (AREA_COLORS_DICT['billable'], AREA_COLORS_DICT['buildup'])
m0 = linear_cmap(field_name='diff', palette=areas, low=-110 ,high=110)
p2 = figure(plot_height=150, plot_width=1200, x_range=r0, 
            x_axis_type='datetime',
            tools=TOOLS, y_axis_label='Difference in h', x_axis_label='Date', tooltips=t0)
p2.vbar(x='index', top='diff', width=timedelta(days=5), source=s0, fill_alpha=.5,
       color=m0)

p2.toolbar.autohide = True

show(column(p1, p2))

Days without activity: 10
