This Jupyter Notebook is used for exploring different problems, and attempts at solving them

# groupby -> resample -> agg

Examining the strange result of using `resample` + `agg` on `groupby`,
stubled upon in [`02-contributors_graph.py`](./02-contributors_graph.py)

In [1]:
import datetime
import json
import logging
import math
import pprint
from collections import Counter, defaultdict
from pathlib import Path
from typing import Optional

# data analysis
import numpy as np
import pandas as pd

# dashboard
import panel as pn

# plotting
from bokeh.models.formatters import DatetimeTickFormatter
import holoviews as hv
import hvplot.pandas
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.figure import Figure
from matplotlib import cm

In [2]:
with open('../../data/examples/stats/qtile.timeline.purpose-to-type.json', mode='r') as json_fp:
    data = json.load(json_fp)

In [3]:
data.keys()

dict_keys(['qtile'])

In [4]:
df = pd.DataFrame.from_records(data['qtile'])
df.shape

(5350, 97)

In [5]:
df = df[df['n_parents'] == 1]
df.shape

(5347, 97)

In [6]:
df = df.dropna(subset=['author.timestamp', 'committer.timestamp'], how='any')
df.shape

(5347, 97)

In [7]:
df = df.assign(
            n_commits =  1,
            author_date    = lambda x: pd.to_datetime(x['author.timestamp'],    unit='s', utc=True),
            committer_date = lambda x: pd.to_datetime(x['committer.timestamp'], unit='s', utc=True),
        )
df.shape

(5347, 100)

In [8]:
df.head(3)

Unnamed: 0,bug_id,patch_id,file_names,language:Python,type:programming,purpose:programming,+:count,+:type.code,+:purpose.programming,+:type.documentation,...,language:JSON,language:TOML,language:Git Revision List,language:SVG,language:desktop,language:CSS,language:Nix,n_commits,author_date,committer_date
0,all_authors-no_merges,47474a8375ae785b91992355be1678565eba9d23.v2.json,1.0,1.0,1.0,1.0,34.0,30.0,34.0,4.0,...,,,,,,,,1,2011-04-11 21:24:27+00:00,2011-04-11 21:24:27+00:00
1,all_authors-no_merges,88197e8de01d79af7858d60acb9bd57bc06b5e73.v2.json,1.0,1.0,1.0,1.0,4.0,4.0,4.0,,...,,,,,,,,1,2015-10-20 11:18:55+00:00,2015-10-28 19:36:58+00:00
2,all_authors-no_merges,9f411bf9bcf74c7e28d7eaa0d1fd03c382458ba4.v2.json,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,...,,,,,,,,1,2016-08-12 04:39:33+00:00,2016-08-12 04:39:33+00:00


In [9]:
df.columns

Index(['bug_id', 'patch_id', 'file_names', 'language:Python',
       'type:programming', 'purpose:programming', '+:count', '+:type.code',
       '+:purpose.programming', '+:type.documentation', '-:count',
       '-:type.code', '-:purpose.programming', 'diff.n_files',
       'diff.hunk_span_src', 'diff.hunk_span_dst', 'diff.n_hunks',
       'diff.n_lines_added', 'diff.n_lines_removed', 'diff.n_lines_all',
       'diff.n_rem', 'diff.n_mod', 'diff.n_groups', 'diff.patch_size',
       'diff.n_add', 'diff.hunk_spread_src', 'diff.hunk_spread_dst',
       'diff.groups_spread', 'author.timestamp', 'author.tz_info',
       'author.name', 'author.email', 'committer.timestamp',
       'committer.tz_info', 'committer.name', 'committer.email', 'n_parents',
       '-:type.documentation', 'diff.spread_inner', 'purpose:test',
       '+:type.test', '+:purpose.test', '-:type.test', '-:purpose.test',
       'language:Text', 'type:prose', 'purpose:documentation',
       '+:purpose.documentation', 'languag

In [10]:
df_x = df[['author.email', 'author_date', 'n_commits', '+:count', '-:count']]
df_x

Unnamed: 0,author.email,author_date,n_commits,+:count,-:count
0,pc@gafol.net,2011-04-11 21:24:27+00:00,1,34.0,4.0
1,nxnfufunezn@gmail.com,2015-10-20 11:18:55+00:00,1,4.0,1.0
2,frostidaho@users.noreply.github.com,2016-08-12 04:39:33+00:00,1,1.0,1.0
3,dev@dariogiovannetti.net,2017-03-18 19:22:58+00:00,1,5.0,2.0
4,mcol@posteo.net,2020-06-10 18:14:23+00:00,1,2.0,1.0
...,...,...,...,...,...
5345,jdulaney@fedoraproject.org,2016-02-02 18:11:46+00:00,1,1.0,1.0
5346,uberj@onid.orst.edu,2012-05-09 06:47:52+00:00,1,1.0,1.0
5347,tycho@tycho.ws,2014-10-08 18:38:30+00:00,1,3.0,3.0
5348,sean.v.775@gmail.com,2015-01-30 01:10:39+00:00,1,52.0,73.0


## Resample only

In [11]:
resample_rate='QE'

In [12]:
agg_func_map={'n_commits': 'sum', '+:count': 'sum', '-:count': 'sum'}
agg_func_map

{'n_commits': 'sum', '+:count': 'sum', '-:count': 'sum'}

In [13]:
columns_agg = list(agg_func_map.keys())
columns_agg

['n_commits', '+:count', '-:count']

In [14]:
df_r = df_x.resample(
        resample_rate,
        on='author_date'
    )[columns_agg].agg(
        agg_func_map,
        numeric_only=True
    )
df_r.shape

(66, 3)

In [15]:
df_r

Unnamed: 0_level_0,n_commits,+:count,-:count
author_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2008-09-30 00:00:00+00:00,207,8111.0,3487.0
2008-12-31 00:00:00+00:00,73,3730.0,2403.0
2009-03-31 00:00:00+00:00,164,3330.0,1181.0
2009-06-30 00:00:00+00:00,1,1.0,2.0
2009-09-30 00:00:00+00:00,6,323.0,632.0
...,...,...,...
2023-12-31 00:00:00+00:00,107,2004.0,1202.0
2024-03-31 00:00:00+00:00,66,2385.0,540.0
2024-06-30 00:00:00+00:00,128,5381.0,1753.0
2024-09-30 00:00:00+00:00,54,2434.0,860.0


## Groupby, then resample

In [16]:
df_g = df_x.groupby('author.email')
df_g

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f47ecaa1010>

In [17]:
df_g[columns_agg].agg(
        agg_func_map,
        numeric_only=True
    )

Unnamed: 0_level_0,n_commits,+:count,-:count
author.email,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0002e7@gmail.com,1,25.0,13.0
1024369+holocronweaver@users.noreply.github.com,1,125.0,2.0
104843199+jlcoulter@users.noreply.github.com,2,2.0,2.0
107062289+shyguyCreate@users.noreply.github.com,10,690.0,81.0
110528300+c0rydoras@users.noreply.github.com,1,27.0,12.0
...,...,...,...
yonnji@kitsune.one,1,82.0,0.0
yurilxc@gmail.com,1,13.0,4.0
zaheen.jamil@gmail.com,2,3.0,1.0
zordsdavini@gmail.com,29,855.0,149.0


In [18]:
df_x.groupby('author.email').resample(
        resample_rate,
        on='author_date'
    )[columns_agg].agg(
        agg_func_map,
        numeric_only=True
    )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_commits,n_commits,n_commits,+:count,+:count,+:count,-:count,-:count,-:count
Unnamed: 0_level_1,Unnamed: 1_level_1,n_commits,+:count,-:count,n_commits,+:count,-:count,n_commits,+:count,-:count
author.email,author_date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0002e7@gmail.com,2023-09-30 00:00:00+00:00,1,25.0,13.0,1,25.0,13.0,1,25.0,13.0
1024369+holocronweaver@users.noreply.github.com,2024-12-31 00:00:00+00:00,1,125.0,2.0,1,125.0,2.0,1,125.0,2.0
104843199+jlcoulter@users.noreply.github.com,2022-09-30 00:00:00+00:00,2,2.0,2.0,2,2.0,2.0,2,2.0,2.0
107062289+shyguyCreate@users.noreply.github.com,2024-06-30 00:00:00+00:00,5,93.0,41.0,5,93.0,41.0,5,93.0,41.0
107062289+shyguyCreate@users.noreply.github.com,2024-09-30 00:00:00+00:00,3,563.0,25.0,3,563.0,25.0,3,563.0,25.0
...,...,...,...,...,...,...,...,...,...,...
zordsdavini@gmail.com,2021-03-31 00:00:00+00:00,1,1.0,1.0,1,1.0,1.0,1,1.0,1.0
zordsdavini@gmail.com,2021-06-30 00:00:00+00:00,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
zordsdavini@gmail.com,2021-09-30 00:00:00+00:00,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
zordsdavini@gmail.com,2021-12-31 00:00:00+00:00,1,14.0,0.0,1,14.0,0.0,1,14.0,0.0


In [19]:
df_g_r = df_g.resample(
        resample_rate,
        on='author_date'
    )[columns_agg].agg(
        agg_func_map,
        numeric_only=True
    )
df_g_r.shape

(1165, 9)

In [20]:
df_g_r

Unnamed: 0_level_0,Unnamed: 1_level_0,n_commits,n_commits,n_commits,+:count,+:count,+:count,-:count,-:count,-:count
Unnamed: 0_level_1,Unnamed: 1_level_1,n_commits,+:count,-:count,n_commits,+:count,-:count,n_commits,+:count,-:count
author.email,author_date,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0002e7@gmail.com,2023-09-30 00:00:00+00:00,1,25.0,13.0,1,25.0,13.0,1,25.0,13.0
1024369+holocronweaver@users.noreply.github.com,2024-12-31 00:00:00+00:00,1,125.0,2.0,1,125.0,2.0,1,125.0,2.0
104843199+jlcoulter@users.noreply.github.com,2022-09-30 00:00:00+00:00,2,2.0,2.0,2,2.0,2.0,2,2.0,2.0
107062289+shyguyCreate@users.noreply.github.com,2024-06-30 00:00:00+00:00,5,93.0,41.0,5,93.0,41.0,5,93.0,41.0
107062289+shyguyCreate@users.noreply.github.com,2024-09-30 00:00:00+00:00,3,563.0,25.0,3,563.0,25.0,3,563.0,25.0
...,...,...,...,...,...,...,...,...,...,...
zordsdavini@gmail.com,2021-03-31 00:00:00+00:00,1,1.0,1.0,1,1.0,1.0,1,1.0,1.0
zordsdavini@gmail.com,2021-06-30 00:00:00+00:00,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
zordsdavini@gmail.com,2021-09-30 00:00:00+00:00,0,0.0,0.0,0,0.0,0.0,0,0.0,0.0
zordsdavini@gmail.com,2021-12-31 00:00:00+00:00,1,14.0,0.0,1,14.0,0.0,1,14.0,0.0


In [21]:
df_gr = df_x.groupby(['author.email', pd.Grouper(key='author_date', freq=resample_rate)])
df_gr

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f47eca4dcd0>

In [22]:
df_gr[columns_agg].agg(
        agg_func_map,
        numeric_only=True
    )

Unnamed: 0_level_0,Unnamed: 1_level_0,n_commits,+:count,-:count
author.email,author_date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0002e7@gmail.com,2023-09-30 00:00:00+00:00,1,25.0,13.0
1024369+holocronweaver@users.noreply.github.com,2024-12-31 00:00:00+00:00,1,125.0,2.0
104843199+jlcoulter@users.noreply.github.com,2022-09-30 00:00:00+00:00,2,2.0,2.0
107062289+shyguyCreate@users.noreply.github.com,2024-06-30 00:00:00+00:00,5,93.0,41.0
107062289+shyguyCreate@users.noreply.github.com,2024-09-30 00:00:00+00:00,3,563.0,25.0
...,...,...,...,...
zordsdavini@gmail.com,2020-06-30 00:00:00+00:00,12,715.0,95.0
zordsdavini@gmail.com,2020-09-30 00:00:00+00:00,1,31.0,22.0
zordsdavini@gmail.com,2021-03-31 00:00:00+00:00,1,1.0,1.0
zordsdavini@gmail.com,2021-12-31 00:00:00+00:00,1,14.0,0.0


# -/+ plots

In [23]:
DATASET_DIR = 'data/examples/stats'
JSON_FILE = 'tensorflow.timeline.purpose-to-type.json'
JSON_PATH = None

for TOP_DIR in ['', '..', '../..']:
    full_dir  = Path(TOP_DIR).joinpath(DATASET_DIR)
    full_path = full_dir.joinpath(JSON_FILE)

    if full_dir.is_dir() and full_path.is_file():
        JSON_PATH = full_path
        break

In [24]:
JSON_PATH

PosixPath('../../data/examples/stats/tensorflow.timeline.purpose-to-type.json')

We’ll speed up our application by caching (`@pn.cache`) the data across users

In [25]:
@pn.cache
def get_timeline_data():
    #logger.debug(f"[@pn.cache] get_timeline_data() for {JSON_PATH=}")
    with open(JSON_PATH, mode='r') as json_fp:
        return json.load(json_fp)

In [26]:
timeline_data = get_timeline_data()
type(timeline_data)

dict

In [27]:
timeline_data.keys()

dict_keys(['tensorflow'])

Extract data

In [28]:
tf_timeline_data = timeline_data['tensorflow']

Create `DataFrame` out of timeline data

In [29]:
tf_timeline_df = pd.DataFrame.from_records(tf_timeline_data)
tf_timeline_df.head(5)

Unnamed: 0,bug_id,patch_id,file_names,language:Python,type:programming,purpose:programming,+:count,+:type.code,+:purpose.programming,+:type.documentation,...,language:INI,language:Limbo,language:Swift,language:JSON,language:OpenStep Property List,language:SVG,language:CSV,language:Dockerfile,diff.n_binary_files,language:MLIR
0,yong.tang,ab0a5278d81ef34096775d5d56f11694cca2a785.v2.json,1.0,1.0,1.0,1.0,38.0,37.0,38.0,1.0,...,,,,,,,,,,
1,yong.tang,2d67d32e587e773811da4577587dc07cf2922641.v2.json,3.0,,1.0,1.0,9.0,1.0,1.0,,...,,,,,,,,,,
2,yong.tang,6346745f18ded325cdd476d1e521b301b2f38db5.v2.json,1.0,1.0,1.0,1.0,12.0,12.0,12.0,,...,,,,,,,,,,
3,yong.tang,c5121973a96665c5e1420f73e571287f157fa8e3.v2.json,1.0,,1.0,1.0,17.0,15.0,17.0,2.0,...,,,,,,,,,,
4,yong.tang,21a9efc4cddbce661073544db31a63639686310a.v2.json,1.0,,1.0,1.0,2.0,2.0,2.0,,...,,,,,,,,,,


Drop merges and root commits (they have oversized \[first parent] diffs)

In [30]:
tf_timeline_df = tf_timeline_df[tf_timeline_df['n_parents'] == 1]

Add **'n_commits'** column

Before resampling, each row correspons to a single commit

In [31]:
tf_timeline_df['n_commits'] = 1

In [32]:
# Just in case
tf_timeline_df = tf_timeline_df.dropna(subset=['author.timestamp', 'committer.timestamp'], how='any')

In [33]:
tf_timeline_df[['bug_id', 'patch_id', 'author.timestamp', 'author.tz_info', 'committer.timestamp', 'committer.tz_info']]

Unnamed: 0,bug_id,patch_id,author.timestamp,author.tz_info,committer.timestamp,committer.tz_info
0,yong.tang,ab0a5278d81ef34096775d5d56f11694cca2a785.v2.json,1.570260e+09,+0000,1.570260e+09,+0000
1,yong.tang,2d67d32e587e773811da4577587dc07cf2922641.v2.json,1.520546e+09,+0000,1.552328e+09,+0000
2,yong.tang,6346745f18ded325cdd476d1e521b301b2f38db5.v2.json,1.505932e+09,+0000,1.505932e+09,+0000
3,yong.tang,c5121973a96665c5e1420f73e571287f157fa8e3.v2.json,1.527621e+09,+0000,1.527623e+09,+0000
4,yong.tang,21a9efc4cddbce661073544db31a63639686310a.v2.json,1.511876e+09,-0800,1.519498e+09,+0000
...,...,...,...,...,...,...
3339,ezhulenev,b67cf30e4f7985598846462896e5a1e1591b3b8f.v2.json,1.652739e+09,-0700,1.652739e+09,-0700
3340,ezhulenev,52c5795ccd2a53871bb4edbc136033caef64d3a2.v2.json,1.642784e+09,-0800,1.642784e+09,-0800
3341,ezhulenev,cc579007fec72159cd5d457b8f977a28ff169069.v2.json,1.696482e+09,-0700,1.696482e+09,-0700
3342,ezhulenev,809237b3ff80f99a610f9bc56179057bf5ef391b.v2.json,1.701981e+09,-0800,1.701982e+09,-0800


Create `pd.Timestamp` columns out of timestamp; because they all have to be in the same timezone, use UTC timezone

In [34]:
tf_timeline_df['author.date'] = tf_timeline_df[['author.timestamp', 'author.tz_info']].apply(
    lambda x: pd.Timestamp.fromtimestamp(x['author.timestamp'], tz='UTC'),
axis='columns').astype('datetime64[ns, UTC]')

tf_timeline_df['committer.date'] = tf_timeline_df[['committer.timestamp', 'committer.tz_info']].apply(
    lambda x: pd.Timestamp.fromtimestamp(x['committer.timestamp'], tz='UTC'),
axis='columns').astype('datetime64[ns, UTC]')

In [35]:
tf_timeline_df[['bug_id', 'patch_id', 'author.date', 'author.timestamp', 'author.tz_info']]

Unnamed: 0,bug_id,patch_id,author.date,author.timestamp,author.tz_info
0,yong.tang,ab0a5278d81ef34096775d5d56f11694cca2a785.v2.json,2019-10-05 07:18:42+00:00,1.570260e+09,+0000
1,yong.tang,2d67d32e587e773811da4577587dc07cf2922641.v2.json,2018-03-08 21:57:07+00:00,1.520546e+09,+0000
2,yong.tang,6346745f18ded325cdd476d1e521b301b2f38db5.v2.json,2017-09-20 18:22:35+00:00,1.505932e+09,+0000
3,yong.tang,c5121973a96665c5e1420f73e571287f157fa8e3.v2.json,2018-05-29 19:10:48+00:00,1.527621e+09,+0000
4,yong.tang,21a9efc4cddbce661073544db31a63639686310a.v2.json,2017-11-28 13:28:49+00:00,1.511876e+09,-0800
...,...,...,...,...,...
3339,ezhulenev,b67cf30e4f7985598846462896e5a1e1591b3b8f.v2.json,2022-05-16 22:11:06+00:00,1.652739e+09,-0700
3340,ezhulenev,52c5795ccd2a53871bb4edbc136033caef64d3a2.v2.json,2022-01-21 16:52:52+00:00,1.642784e+09,-0800
3341,ezhulenev,cc579007fec72159cd5d457b8f977a28ff169069.v2.json,2023-10-05 04:57:01+00:00,1.696482e+09,-0700
3342,ezhulenev,809237b3ff80f99a610f9bc56179057bf5ef391b.v2.json,2023-12-07 20:35:04+00:00,1.701981e+09,-0800


In [36]:
pm_count_cols = [col for col in tf_timeline_df.columns if col.startswith('+:') or col.startswith('-:')]
pm_count_cols.sort(key=lambda s: s[2:]+('0' if s[0] == '-' else '1'))
pm_count_cols

['-:count',
 '+:count',
 '-:purpose.data',
 '+:purpose.data',
 '-:purpose.documentation',
 '+:purpose.documentation',
 '-:purpose.markup',
 '+:purpose.markup',
 '-:purpose.other',
 '+:purpose.other',
 '-:purpose.programming',
 '+:purpose.programming',
 '-:purpose.project',
 '+:purpose.project',
 '-:purpose.test',
 '+:purpose.test',
 '-:type.code',
 '+:type.code',
 '-:type.data',
 '+:type.data',
 '-:type.documentation',
 '+:type.documentation',
 '-:type.markup',
 '+:type.markup',
 '-:type.other',
 '+:type.other',
 '-:type.project',
 '+:type.project',
 '-:type.test',
 '+:type.test']

In [37]:
diff_x_cols = [col for col in tf_timeline_df.columns if col.startswith('diff.')]
diff_x_cols

['diff.n_files',
 'diff.hunk_span_src',
 'diff.hunk_span_dst',
 'diff.n_hunks',
 'diff.n_lines_added',
 'diff.n_lines_removed',
 'diff.n_lines_all',
 'diff.spread_inner',
 'diff.n_add',
 'diff.n_groups',
 'diff.n_mod',
 'diff.patch_size',
 'diff.groups_spread',
 'diff.hunk_spread_src',
 'diff.hunk_spread_dst',
 'diff.n_file_renames',
 'diff.n_rem',
 'diff.n_added_files',
 'diff.n_removed_files',
 'diff.n_binary_files']

In [38]:
#@pn.cache
def resample_timeline(author: str = 'ezhulenev', resample_rate: str = 'ME', agg_func: str = 'sum') -> pd.DataFrame:
    ## DEBUG
    #print(f"> resampling for {author} at sample rate '{resample_rate}' and agg_func '{agg_func}'")
    df = tf_timeline_df[tf_timeline_df['bug_id'] == author].resample(
        resample_rate,
        on='author.date'
    )[['n_commits', *pm_count_cols, *diff_x_cols]].agg(
        {col: agg_func if col in [*pm_count_cols, *diff_x_cols] else 'sum'  # excludes 'n_commits'
         for col in ['n_commits', *pm_count_cols, *diff_x_cols]},
        numeric_only=True
    )

    # to be possibly used for xlabel when plotting
    df['author.date(UTC)'] = df.index
    df['author.date(Y-m)'] = df.index.strftime('%Y-%m')

    # TODO: do it with dependencies / bound functions
    # NOTE: Panel specific !!!
    #column_base_widget.disabled_options = all_possible_pm_col_perc_basenames
    #logger.debug(f"resample_timeline({author=}, {resample_rate=}, {agg_func=}) -> pd.DataFrame({hex(id(df))})")

    return df

In [39]:
tf_timeline_selected_resampled_df = resample_timeline()
tf_timeline_selected_resampled_df.head(5)

Unnamed: 0_level_0,n_commits,-:count,+:count,-:purpose.data,+:purpose.data,-:purpose.documentation,+:purpose.documentation,-:purpose.markup,+:purpose.markup,-:purpose.other,...,diff.groups_spread,diff.hunk_spread_src,diff.hunk_spread_dst,diff.n_file_renames,diff.n_rem,diff.n_added_files,diff.n_removed_files,diff.n_binary_files,author.date(UTC),author.date(Y-m)
author.date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-06-30 00:00:00+00:00,8,362.0,2067.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10883.0,10546.0,10546.0,0.0,295.0,4.0,0.0,0.0,2018-06-30 00:00:00+00:00,2018-06
2018-07-31 00:00:00+00:00,2,4.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,365.0,340.0,340.0,0.0,1.0,0.0,0.0,0.0,2018-07-31 00:00:00+00:00,2018-07
2018-08-31 00:00:00+00:00,2,14.0,543.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,228.0,204.0,204.0,0.0,7.0,2.0,0.0,0.0,2018-08-31 00:00:00+00:00,2018-08
2018-09-30 00:00:00+00:00,18,1363.0,4322.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13759.0,12491.0,12491.0,0.0,626.0,1.0,0.0,0.0,2018-09-30 00:00:00+00:00,2018-09
2018-10-31 00:00:00+00:00,16,1038.0,2633.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,21685.0,19630.0,19630.0,0.0,418.0,0.0,0.0,0.0,2018-10-31 00:00:00+00:00,2018-10


## Matplotlib

In [40]:
def round_10s(x):
    mult = 10 ** math.floor(math.log10(x))
    return math.ceil(x / mult) * mult

In [41]:
round_10s(277.0)

300

In [42]:
def plot_counts(resampled_df: pd.DataFrame,
                repo_desc: str = 'tensorflow', author_desc: str = 'ezhulenev', resample_rate: str = 'ME', agg_func: str = 'sum',
                figsize: tuple[float, float] = (5, 5),
):
    sns.set_style("whitegrid")

    ## DEBUG figsize
    #print(f"plot_counts(): {figsize=}")
    fig = Figure(figsize=figsize)
    axes = fig.subplots(nrows=2, ncols=1, sharex=True)

    max_count = resampled_df[['+:count', '-:count']].max().max()
    max_ylim = round_10s(max_count)

    for ax, column, color, invert in zip(axes.ravel(), ['+:count', '-:count'], ['green', 'red'], [False, True]):
        sns.lineplot(ax=ax, data=resampled_df, 
                     x='author.date', y=column, 
                     color=color, drawstyle='steps-post')
    
        ax.fill_between(resampled_df.index, resampled_df[column],
                        alpha=0.2, color=color, step='post')
        ax.set_ylim(0, max_ylim)
        ax.set_ylabel(f"{agg_func}({column})")
    
        if invert:
            ax.invert_yaxis()
        else:
            #ax.set_title(f"author={author_desc}", fontsize=9)
            ax.axhline(0, color="k")

    fig.suptitle(f'repo={repo_desc}, author={author_desc}, lines per resample="{resample_rate}"', fontsize=10)
    fig.subplots_adjust(hspace=0)

    #plt.show()
    #plt.close(fig) # CLOSE THE FIGURE!
    return fig

In [43]:
fig = plot_counts(tf_timeline_selected_resampled_df)
pn.pane.Matplotlib(fig, tight=True)

## hvPlot

https://holoviews.org/user_guide/Plotting_with_Matplotlib.html#plot-layouts

> Another aspect that differs quite substantially between matplotlib and other extension is the layout system.
> Since plots do not have an absolute size relative to one another it depends on the aspect of each plot.
> The main options to control the layout include:
>
> - **`aspect_weight`**: Whether to weight the aspect of plots when laying out plots (default=False).
> - **`hspace`**: Horizontal spacing between subplots.
> - **`tight`**: Whether to automatically reduce space between subplots.
> - **`vspace`**: Vertical space between subplots.

> ```python
> line_contours = contours(img).opts(aspect=3)
> fill_contours = filled_contours.opts(aspect=2)
>
> opts.defaults(opts.Layout(sublabel_format='', fig_size=150))
>
> (line_contours + fill_contours).opts(tight=True)
> ```

https://holoviews.org/releases.html#version-1-2-0

> Highlights/features:
> - \[...]
> - New `aspect_weight` and `tight` layout plot options for more customizability of Layout arrangements (4b1f03d, e6a76b7)
> 
> API changes:
> - \[...]
> - Renaming of various plot and style options including:
>    - \[...]
>    - `vertical_spacing` and `horizontal_spacing` to `vspace` and `hspace` respectively

In [44]:
# 'matplotlib' backed supports `vspace` option
hv.opts.Layout(vspace=0.1, backend='matplotlib')

Options('Layout', backend='matplotlib', vspace=0.1)

In [45]:
# but 'bokeh' backend does not support it
#hv.opts.Layout(vspace=0.1, backend='bokeh')

# but there is `border` option for 'bokeh' backend... but not for Layout, but for Curve
#hv.opts.Layout(border=0, backend='bokeh')
hv.opts.Curve(border=0, backend='bokeh')

Options('Curve', backend='bokeh', border=0)

In [46]:
# Check the current backend
current_backend = hv.Store.current_backend
print(f"The current HoloViews backend is: {current_backend}")

The current HoloViews backend is: bokeh


In [47]:
def hvplot_plots(resampled_df: pd.DataFrame,
                 repo_desc: str = 'tensorflow', author_desc: str = 'ezhulenev', resample_rate: str = 'ME', agg_func: str = 'sum',
):
    max_count = resampled_df[['+:count', '-:count']].max().max()
    max_ylim = round_10s(max_count)

    #formatter = DatetimeTickFormatter(months='%Y')
    
    step_p = resampled_df.hvplot.step(
        y="+:count",
        #label="sum(+:count)",
        ylabel=f"{agg_func}(+:count)",
        color="green",
        line_width=3,
        ylim=(0, max_ylim),
        grid=True,
        padding=(0.02,0),
        #xformatter=formatter,
        # step specific
        where='post',
        # interactive
        tools=[
            'box_zoom',
            'save',
            'reset',
            'hover',
        ],
    ).opts(
        xaxis=None,
        default_tools=[], toolbar='above',
        border=1,  # Bohek specific
    )

    step_m = resampled_df.hvplot.step(
        y="-:count",
        #label="sum(-:count)",
        ylabel=f"{agg_func}(-:count)",
        color="red",
        line_width=3,
        ylim=(0, max_ylim),
        grid=True,
        padding=(0.02,0),
        #xformatter=formatter,
        # step specific
        where='post',
        # interactive
        tools=[
            'box_zoom',
            'save',
            'reset',
            'hover',
        ],
    ).opts(
        xaxis='bottom', invert_yaxis=True,
        default_tools=[], toolbar='above',
        border=1,  # Bohek specific
    )  # Invert the y-axis

    return (step_p, step_m)

In [48]:
def hvplot_counts(resampled_df: pd.DataFrame,
                 repo_desc: str = 'tensorflow', author_desc: str = 'ezhulenev', resample_rate: str = 'ME', agg_func: str = 'sum',
):
    (step_p, step_m) = hvplot_plots(
        resampled_df=resampled_df,
        repo_desc=repo_desc, author_desc=author_desc, resample_rate=resample_rate, agg_func=agg_func,
    )
    
    return (step_p + step_m).opts(
        #vspace=0,  # not for Bokeh backend
        #tight=True,  # not for Bokeh backend
        title=f'repo={repo_desc}, author={author_desc}, lines per resample="{resample_rate}"',
        fontsize={'title': '10pt'},
    ).cols(1)

In [49]:
def hvplot_overlay(resampled_df: pd.DataFrame,
                   repo_desc: str = 'tensorflow', author_desc: str = 'ezhulenev', resample_rate: str = 'ME', agg_func: str = 'sum',
):
    (step_p, step_m) = hvplot_plots(
        resampled_df=resampled_df,
        repo_desc=repo_desc, author_desc=author_desc, resample_rate=resample_rate, agg_func=agg_func,
    )
    
    return (step_p * step_m).opts(
        #vspace=0,  # not for Bokeh backend
        #tight=True,  # not for Bokeh backend
        title=f'repo={repo_desc}, author={author_desc}, lines per resample="{resample_rate}"',
        fontsize={'title': '10pt'},
    )

In [50]:
fig = hvplot_counts(tf_timeline_selected_resampled_df)
pn.pane.HoloViews(fig)

In [51]:
# Does not work as intended, both plots use the same iverted y axis

#fig = hvplot_overlay(tf_timeline_selected_resampled_df)
#pn.pane.HoloViews(fig)

In [52]:
(step_p, step_m) = hvplot_plots(tf_timeline_selected_resampled_df)
print(step_p)

:Curve   [author.date]   (+:count)


In [53]:
dir(step_p)

['_Chart__abstract',
 '_Dimensioned__abstract',
 '_Element2D__abstract',
 '_Element__abstract',
 '_ViewableElement__abstract',
 '__add__',
 '__bool__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lshift__',
 '__lt__',
 '__module__',
 '__mul__',
 '__ne__',
 '__new__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_auto_indexable_1d',
 '_auxiliary_component',
 '_binned',
 '_cached',
 '_cached_constants',
 '_conversion_interface',
 '_dataset',
 '_deep_indexable',
 '_dim_aliases',
 '_dim_groups',
 '_empty_region',
 '_get_bounds_selection',
 '_get_index_expr',
 '_get_index_selection',
 '_get_lasso_selection',
 '_get_selection',
 '_get_selection_dims',
 '_

In [54]:
step_p.opts['color']

'green'

In [55]:
step_p.dframe().iloc[:,0].values[:5]

array(['2018-06-30T00:00:00.000000000', '2018-07-31T00:00:00.000000000',
       '2018-08-31T00:00:00.000000000', '2018-09-30T00:00:00.000000000',
       '2018-10-31T00:00:00.000000000'], dtype='datetime64[ns]')

In [56]:
step_p.dimensions

<bound method Dimensioned.dimensions of :Curve   [author.date]   (+:count)>

In [57]:
step_p.dimension_values('+:count', expanded=False, flat=False)

array([2.0670e+03, 2.0000e+01, 5.4300e+02, 4.3220e+03, 2.6330e+03,
       5.4790e+03, 3.5200e+03, 3.5050e+03, 1.0430e+03, 4.5460e+03,
       5.2750e+03, 2.7790e+03, 2.9640e+03, 2.1350e+03, 4.3400e+02,
       4.7000e+01, 6.4100e+02, 2.0500e+02, 3.0000e+01, 2.9700e+02,
       1.7360e+03, 2.1080e+03, 1.2400e+02, 4.2000e+01, 2.3000e+01,
       9.5800e+02, 3.1900e+02, 5.6600e+02, 5.7000e+01, 1.7000e+01,
       2.2200e+02, 8.0000e+00, 6.4600e+02, 5.5400e+02, 6.8800e+02,
       0.0000e+00, 5.7400e+02, 1.3830e+03, 2.0100e+02, 4.6500e+02,
       6.1300e+02, 5.1200e+02, 1.4770e+03, 6.4000e+02, 1.2000e+01,
       2.5500e+02, 3.9410e+03, 1.3770e+03, 1.3000e+01, 1.5431e+04,
       8.1700e+03, 4.8050e+03, 6.6960e+03, 6.8360e+03, 3.2400e+03,
       5.1000e+01, 1.5800e+02, 6.3000e+01, 2.6500e+02, 1.6900e+02,
       6.8810e+03, 4.1860e+03, 2.1680e+03, 5.0730e+03, 8.6550e+03,
       8.5460e+03, 9.8130e+03, 7.3810e+03, 1.5460e+03, 4.5590e+03,
       7.2620e+03, 1.1271e+04, 6.4460e+03, 4.8290e+03, 9.1700e

In [58]:
xs = [1, 2, 3]
ys = [2, 0, 7]
hv.Polygons([{'x': xs, 'y': ys}]).opts(color='red', alpha=0.2)

In [59]:
step_p[:20]

In [60]:
str(step_p.dimensions()[0])

'author.date'

In [61]:
xs = step_p[:20].dimension_values('author.date').astype(np.int64)
ys = step_p[:20].dimension_values('+:count')
#hv.Polygons([{'x': xs, 'y': ys}]).opts(color='red', alpha=0.2)

In [62]:
xs_c = np.empty((xs.size + xs.size + 1,), dtype=xs.dtype)
xs_c[0::2] = np.append(xs, [xs[-1]])
xs_c[1::2] = xs

ys_c = np.zeros((ys.size + ys.size + 1,), dtype=ys.dtype)
ys_c[1::2] = ys
ys_c[2::2] = ys

ys_c[-1] = 0

In [63]:
print(f" 0:({xs_c[0]}, {ys_c[0]}")
print(f" 1:({xs_c[1]}, {ys_c[1]}")
print("...")
print(f"-2:({xs_c[-2]}, {ys_c[-2]}")
print(f"-1:({xs_c[-1]}, {ys_c[-1]}")

 0:(1530316800000000000, 0.0
 1:(1530316800000000000, 2067.0
...
-2:(1580428800000000000, 297.0
-1:(1580428800000000000, 0.0


In [64]:
poly = hv.Polygons([{'x': xs_c, 'y': ys_c}]).opts(color='green', alpha=0.2)
poly

In [65]:
#step_p[0:20] * poly

In [66]:
ts = step_p[:20].dimension_values('author.date')
ys = step_p[:20].dimension_values('+:count')

ts[:3]

array(['2018-06-30T00:00:00.000000000', '2018-07-31T00:00:00.000000000',
       '2018-08-31T00:00:00.000000000'], dtype='datetime64[ns]')

In [67]:
ts_x = np.empty((ts.size + ts.size + 1,), dtype=ts.dtype)
ts_x[0::2] = np.append(ts, [ts[-1]])
ts_x[1::2] = ts

ys_x = np.zeros((ys.size + ys.size + 1,), dtype=ys.dtype)
ys_x[1::2] = ys
ys_x[2::2] = ys

ys_x[-1] = 0

In [68]:
df_area = pd.DataFrame({'author.date': ts_x, '+:count': ys_x})
df_area.head(10)

Unnamed: 0,author.date,+:count
0,2018-06-30,0.0
1,2018-06-30,2067.0
2,2018-07-31,2067.0
3,2018-07-31,20.0
4,2018-08-31,20.0
5,2018-08-31,543.0
6,2018-09-30,543.0
7,2018-09-30,4322.0
8,2018-10-31,4322.0
9,2018-10-31,2633.0


In [69]:
plot_1 = df_area.hvplot.scatter(x='author.date', y='+:count', color='green', alpha=0.8, size=5, sort_date=False).opts(default_tools=[], tools=["hover"])
plot_2 = df_area.hvplot.area(x='author.date', y='+:count', color='green', alpha=0.2, sort_date=False).opts(default_tools=[], tools=[])

plot_1 * plot_2



In [70]:
step_p[:20] * plot_2



In [71]:
hv.extension("bokeh")
hv.Scatter([]).opts(default_tools=[])*hv.Scatter([]).opts(default_tools=[])



In [72]:
type(step_p)

holoviews.element.chart.Curve

In [73]:
hv.element.chart.Curve

holoviews.element.chart.Curve

In [74]:
def area_step_from_curve(curve: hv.element.chart.Curve,
                         color: str = 'green', alpha: float = 0.2,
                         where: str = 'post'):
    if where != 'post':
        raise ValueError(f"{where=} not supported, possible values are ['post']")

    # extract x and y values
    #ts = curve.dframe().iloc[:,0]
    #ys = curve.dframe().iloc[:,1]
    ts = curve.dimension_values(0)
    ys = curve.dimension_values(1)

    # create step function from those values
    ts_x = np.empty((ts.size + ts.size + 1,), dtype=ts.dtype)
    ts_x[0::2] = np.append(ts, [ts[-1]])
    ts_x[1::2] = ts

    ys_x = np.zeros((ys.size + ys.size + 1,), dtype=ys.dtype)
    ys_x[1::2] = ys
    ys_x[2::2] = ys

    ys_x[-1] = 0

    # create DataFrame (alternative would be to create xarray)
    df = pd.DataFrame({
        str(step_p.dimensions()[0]): ts_x,
        str(step_p.dimensions()[1]): ys_x,
    })

    plot = df.hvplot.area(
        x=str(step_p.dimensions()[0]),
        y=str(step_p.dimensions()[1]),
        # options from parameters
        color=color,
        alpha=alpha,
        # needed to work correctly
        sort_date=False,
        # hover does not work for area
        tools=[],
        hover=False,
    ).opts(default_tools=[])

    return plot

In [75]:
step_p * area_step_from_curve(step_p, color='green')



In [76]:
step_m * area_step_from_curve(step_m, color='red')



In [77]:
(step_p * area_step_from_curve(step_p, color='green') + step_m * area_step_from_curve(step_m, color='red')).cols(1)



In [78]:
def hvplot_pm_counts(
    resampled_df: pd.DataFrame,
    repo_desc: str = 'tensorflow', author_desc: str = 'ezhulenev', resample_rate: str = 'ME', agg_func: str = 'sum',
):
    (step_p, step_m) = hvplot_plots(
        resampled_df=resampled_df,
        repo_desc=repo_desc, author_desc=author_desc, resample_rate=resample_rate, agg_func=agg_func,
    )

    area_p = area_step_from_curve(step_p, color=step_p.opts['color'])
    area_m = area_step_from_curve(step_m, color=step_m.opts['color'])
    
    return (step_p*area_p + step_m*area_m).opts(
        #vspace=0,  # not for Bokeh backend
        #tight=True,  # not for Bokeh backend
        title=f'repo={repo_desc}, author={author_desc}, lines per resample="{resample_rate}"',
        fontsize={'title': '10pt'},
        # remove 'pan' and 'wheel_zoom' - not needed
        #default_tools=[],
        #tools=[
        #    'box_zoom',
        #    'save',
        #    'reset',
        #    'hover',
        #],
    ).cols(1)

In [79]:
step_p * hv.HLine(0.0).opts(color='gray', line_width=3)

In [80]:
fig = hvplot_pm_counts(tf_timeline_selected_resampled_df)
pn.pane.HoloViews(fig)



# Sankey - directory structure of changes

## Examples

Mermaid.js: <https://mermaid.js.org/syntax/sankey.html>

data

```csv
Electricity grid,Over generation / exports,104.453
Electricity grid,Heating and cooling - homes,113.726
Electricity grid,H2 conversion,27.14
```

config

```yaml
---
config:
  sankey:
    linkColor: gradient
    nodeAlignment: justify
    showValues: false
---
```

graph (sankey-beta)

```mermaid
---
config:
  sankey:
    linkColor: gradient
    nodeAlignment: justify
    showValues: false
---

sankey-beta

%% source,target,value
Electricity grid,Over generation / exports,104.453
Electricity grid,Heating and cooling - homes,113.726
Electricity grid,H2 conversion,27.14
```

In [81]:
hv.Sankey([
    ('Electricity grid','Over generation / exports',104.453),
    ('Electricity grid','Heating and cooling - homes',113.726),
    ('Electricity grid','H2 conversion',27.14),
]).opts(width=400, height=300)

In [82]:
# Prepare data
sankey_data = {
    'source': ['Electricity grid', 'Electricity grid', 'Electricity grid'],
    'target': ['Over generation / exports', 'Heating and cooling - homes', 'H2 conversion'],
    'value': [104.453, 113.726, 27.14]
}

sankey_df = pd.DataFrame(sankey_data)

# Create Sankey diagram
sankey_diagram = hv.Sankey(sankey_df, kdims=['source', 'target'], vdims='value')

# Customize appearance
sankey_diagram.opts(
    #edge_color=hv.CategoricalColorMapper(factors=df['source'].unique(), palette='Viridis'),
    edge_color='target',
    node_width=20,
    width=600,
    height=400,
    title='Sankey Diagram Example',
    show_values=False  # Hide values on links
)

# Display the diagram
sankey_diagram

In [83]:
hv.Sankey([
    ('A', 'X', 5),
    ('A', 'Y', 7),
    ('A', 'Z', 6),
    ('B', 'X', 2),
    ('B', 'Y', 9),
    ('B', 'Z', 4)]
).opts(width=600, height=400)

In [84]:
hv.Sankey([
    ('A', 'X', 5),
    ('A', 'Y', 7),
    ('A', 'Z', 6),
    ('A', 'x', 4), # intermediate
    ('x', 'X', 2), # ...
    ('B', 'X', 2),
    ('B', 'Y', 9),
    ('B', 'Z', 4)]
).opts(width=600, height=400)

## Read and process data

In [85]:
with open('../../data/examples/stats/qtile.lines-stats.json', mode='r') as json_fp:
    lines_data = json.load(json_fp)

In [86]:
lines_data.keys()

dict_keys(['data/examples/annotations/qtile'])

In [87]:
lines_data['data/examples/annotations/qtile'].keys()

dict_keys(['all_authors-no_merges'])

In [88]:
elem = next(iter(lines_data['data/examples/annotations/qtile']['all_authors-no_merges'].values()))
elem

{'libqtile/layout/tree.py': {'language': 'Python',
  'type': 'programming',
  'purpose': 'programming',
  '+': {'count': 34,
   'type.code': 30,
   'purpose.programming': 34,
   'type.documentation': 4},
  '-': {'count': 4, 'type.code': 4, 'purpose.programming': 4},
  '+/-': {'type.code': 34,
   'purpose.programming': 38,
   'type.documentation': 4}}}

In [89]:
len(elem)

1

In [90]:
{k: list(enumerate(k.split('/'))) for k in elem.keys()}

{'libqtile/layout/tree.py': [(0, 'libqtile'), (1, 'layout'), (2, 'tree.py')]}

In [91]:
all_files_info = {
    k: list(enumerate(k.split('/')))
     for elem in lines_data['data/examples/annotations/qtile']['all_authors-no_merges'].values()
     for k in elem.keys()
}

In [92]:
len(all_files_info)

963

In [93]:
'/'.join(['a','b','c'])

'a/b/c'

In [94]:
next(iter(elem.keys())).split('/')

['libqtile', 'layout', 'tree.py']

In [95]:
all_dirs_set = {
     '/'.join(k.split('/')[:-1])
     for elem in lines_data['data/examples/annotations/qtile']['all_authors-no_merges'].values()
     for k in elem.keys()
}

In [96]:
len(all_dirs_set)

110

In [97]:
max([v for v in all_files_info.values()], key=len)

[(0, 'docs'),
 (1, 'manual'),
 (2, 'commands'),
 (3, 'shell'),
 (4, 'qtile-run.rst')]

In [98]:
sorted([v for v in all_files_info.values()], key=len, reverse=True)[:5]

[[(0, 'docs'),
  (1, 'manual'),
  (2, 'commands'),
  (3, 'shell'),
  (4, 'qtile-run.rst')],
 [(0, 'docs'),
  (1, 'manual'),
  (2, 'commands'),
  (3, 'shell'),
  (4, 'qtile-top.rst')],
 [(0, 'docs'),
  (1, '_themes'),
  (2, 'qtile'),
  (3, 'static'),
  (4, 'qtile.css_t')],
 [(0, 'docs'),
  (1, 'manual'),
  (2, 'commands'),
  (3, 'shell'),
  (4, 'iqshell.rst')],
 [(0, 'docs'), (1, 'manual'), (2, 'commands'), (3, 'shell'), (4, 'index.rst')]]

In [99]:
sorted([v for v in all_files_info.values()], key=len)[:3]

[[(0, 'CHANGELOG')], [(0, '.travis.yml')], [(0, 'setup.cfg')]]

In [100]:
changes_info = {
    patch: len(elem)
    for patch, elem in lines_data['data/examples/annotations/qtile']['all_authors-no_merges'].items()
}
max([(k,v) for k,v in changes_info.items()], key=lambda t: t[1])

('c8fdbedd248173d3f415b411792a2980d7a448d2.v2.json', 238)

In [101]:
df[df['patch_id'] == 'c8fdbedd248173d3f415b411792a2980d7a448d2.v2.json'][['bug_id', 'patch_id', 'file_names', 'n_parents', '-:count', '+:count']]

Unnamed: 0,bug_id,patch_id,file_names,n_parents,-:count,+:count
4207,all_authors-no_merges,c8fdbedd248173d3f415b411792a2980d7a448d2.v2.json,238.0,1.0,8083.0,8189.0


In [102]:
#df.columns

In [103]:
df[df['file_names']==10].head()

Unnamed: 0,bug_id,patch_id,file_names,language:Python,type:programming,purpose:programming,+:count,+:type.code,+:purpose.programming,+:type.documentation,...,language:JSON,language:TOML,language:Git Revision List,language:SVG,language:desktop,language:CSS,language:Nix,n_commits,author_date,committer_date
42,all_authors-no_merges,96124f96b523139f6386b3aa5a2a32fe8c366f96.v2.json,10.0,10.0,10.0,10.0,10.0,10.0,10.0,,...,,,,,,,,1,2019-01-13 07:04:30+00:00,2019-01-13 09:55:55+00:00
711,all_authors-no_merges,de39c977ead474a1300505b10f93a7c1d275478b.v2.json,10.0,9.0,9.0,9.0,44.0,42.0,42.0,,...,,,,,,,,1,2021-10-24 07:07:02+00:00,2021-11-01 11:54:20+00:00
772,all_authors-no_merges,392ee576f644d34c7419fbd7b3ecefb98cfb44ea.v2.json,10.0,10.0,10.0,10.0,20.0,20.0,20.0,,...,,,,,,,,1,2012-12-02 21:29:51+00:00,2012-12-02 21:29:51+00:00
951,all_authors-no_merges,b079318cbae1ea5dabf5aa9e5724bafcfadc2049.v2.json,10.0,5.0,5.0,5.0,77.0,36.0,40.0,4.0,...,,,,,,,,1,2008-08-24 08:22:01+00:00,2008-08-24 08:22:01+00:00
1007,all_authors-no_merges,e76de94088ed209e7e6a2b29fec5cd88c8ec526d.v2.json,10.0,10.0,10.0,5.0,142.0,52.0,59.0,7.0,...,,,,,,,,1,2010-08-21 12:20:13+00:00,2010-08-21 12:20:13+00:00


https://github.com/qtile/qtile/commit/c8fdbedd248173d3f415b411792a2980d7a448d2

**Blackify the code**

237 files changed, +8189 -8083 lines changed

In [104]:
df.describe()

Unnamed: 0,file_names,language:Python,type:programming,purpose:programming,+:count,+:type.code,+:purpose.programming,+:type.documentation,-:count,-:type.code,...,language:HTML,language:Markdown,language:JSON,language:TOML,language:Git Revision List,language:SVG,language:desktop,language:CSS,language:Nix,n_commits
count,5347.0,4551.0,4584.0,4258.0,5122.0,4045.0,4073.0,2534.0,4463.0,3544.0,...,49.0,13.0,2.0,16.0,2.0,3.0,8.0,13.0,1.0,5347.0
mean,2.756499,2.566908,2.563045,2.217238,37.784069,25.934734,30.635649,12.932123,24.68026,18.419865,...,3.22449,1.0,1.0,1.0,1.0,1.0,1.125,1.461538,2.0,1.0
std,6.475875,6.541656,6.530844,5.13739,164.940454,117.510156,129.043983,49.586718,159.044182,113.016117,...,3.274048,0.0,0.0,0.0,0.0,0.0,0.353553,0.967418,,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0
25%,1.0,1.0,1.0,1.0,3.0,2.0,2.0,2.0,2.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0
50%,1.0,1.0,1.0,1.0,9.0,7.0,7.0,3.0,4.0,4.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0
75%,3.0,2.0,2.0,2.0,29.0,20.0,23.0,9.0,14.0,12.0,...,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0
max,238.0,234.0,234.0,138.0,8189.0,5652.0,5829.0,1629.0,8083.0,5742.0,...,14.0,1.0,1.0,1.0,1.0,1.0,2.0,4.0,2.0,1.0


In [105]:
df[['file_names', 'n_parents', *pm_count_cols, *diff_x_cols]].describe()

Unnamed: 0,file_names,n_parents,-:count,+:count,-:purpose.data,+:purpose.data,-:purpose.documentation,+:purpose.documentation,-:purpose.markup,+:purpose.markup,...,diff.n_mod,diff.patch_size,diff.groups_spread,diff.hunk_spread_src,diff.hunk_spread_dst,diff.n_file_renames,diff.n_rem,diff.n_added_files,diff.n_removed_files,diff.n_binary_files
count,5347.0,5347.0,4463.0,5122.0,242.0,284.0,405.0,742.0,62.0,70.0,...,4065.0,5334.0,3420.0,2839.0,2839.0,142.0,2151.0,369.0,107.0,89.0
mean,2.756499,1.0,24.68026,37.784069,6.157025,8.394366,19.167901,17.330189,32.032258,28.185714,...,12.477491,47.696663,311.626023,335.724903,335.724903,2.246479,28.728963,1.783198,2.523364,2.47191
std,6.475875,0.0,159.044182,164.940454,18.068109,20.454365,132.784615,55.793867,93.611679,60.937434,...,116.185883,208.931815,904.870639,834.418361,834.418361,4.546352,124.903939,2.406206,3.655819,3.344228
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,2.0,3.0,1.0,1.0,1.0,1.0,2.0,2.0,...,1.0,3.0,19.0,27.0,27.0,1.0,2.0,1.0,1.0,1.0
50%,1.0,1.0,4.0,9.0,2.0,2.0,2.0,3.0,3.5,4.0,...,3.0,10.0,78.5,97.0,97.0,1.0,5.0,1.0,1.0,1.0
75%,3.0,1.0,14.0,29.0,4.0,6.0,7.0,12.0,20.0,30.5,...,9.0,35.0,301.0,349.0,349.0,2.0,16.0,1.0,2.0,2.0
max,238.0,1.0,8083.0,8189.0,227.0,162.0,2533.0,794.0,663.0,369.0,...,6665.0,9607.0,30280.0,21556.0,21556.0,48.0,3073.0,21.0,21.0,18.0


## Single commit example

In [106]:
#xelem = lines_data['data/examples/annotations/qtile']['all_authors-no_merges']['c8fdbedd248173d3f415b411792a2980d7a448d2.v2.json']
xelem = lines_data['data/examples/annotations/qtile']['all_authors-no_merges']['b079318cbae1ea5dabf5aa9e5724bafcfadc2049.v2.json']
xelem

{'doc-src/admin.html': {'language': 'HTML',
  'type': 'markup',
  'purpose': 'markup',
  '+': {'count': 1, 'type.code': 1, 'purpose.markup': 1},
  '-': {'count': 1, 'type.code': 1, 'purpose.markup': 1},
  '+/-': {'type.code': 2, 'purpose.markup': 2}},
 'doc-src/configuration.html': {'language': 'HTML',
  'type': 'markup',
  'purpose': 'markup',
  '+': {'count': 19,
   'type.code': 16,
   'purpose.markup': 19,
   'type.documentation': 3},
  '-': {'count': 11,
   'type.documentation': 1,
   'purpose.markup': 11,
   'type.code': 10},
  '+/-': {'type.code': 26, 'purpose.markup': 30, 'type.documentation': 4}},
 '/dev/null': {'language': '/dev/null',
  'type': 'other',
  'purpose': 'other',
  '+': {},
  '-': {},
  '+/-': {}},
 'doc-src/configuration/example.html': {'language': 'HTML',
  'type': 'markup',
  'purpose': 'markup',
  '+': {'count': 3,
   'type.documentation': 2,
   'purpose.markup': 3,
   'type.code': 1},
  '-': {},
  '+/-': {'type.documentation': 2, 'purpose.markup': 3, 'type.co

In [107]:
xsankey_data_1 = [(path, line_type, val) for path, data in xelem.items() for line_type, val in data['+/-'].items() if line_type.startswith('type.')]
xsankey_data_1

[('doc-src/admin.html', 'type.code', 2),
 ('doc-src/configuration.html', 'type.code', 26),
 ('doc-src/configuration.html', 'type.documentation', 4),
 ('doc-src/configuration/example.html', 'type.documentation', 2),
 ('doc-src/configuration/example.html', 'type.code', 1),
 ('doc-src/configuration/index.py', 'type.code', 4),
 ('doc-src/configuration/index.py', 'type.documentation', 1),
 ('doc-src/index.html', 'type.code', 14),
 ('doc-src/index.html', 'type.documentation', 1),
 ('doc-src/index.py', 'type.code', 15),
 ('doc-src/index.py', 'type.documentation', 3),
 ('libqtile/command.py', 'type.code', 5),
 ('libqtile/layout.py', 'type.code', 8),
 ('libqtile/manager.py', 'type.code', 10)]

In [108]:
xsankey_1 = hv.Sankey(xsankey_data_1).opts(edge_color_index=1, width=800, height=400)
xsankey_1

In [109]:
c = Counter()
c.update({('x','y'): 1})
c.update({('x','y'): 2})
c[('x','y')] += 3
c

Counter({('x', 'y'): 6})

In [110]:
list(Path('foo/bar/baz').parents)

[PosixPath('foo/bar'), PosixPath('foo'), PosixPath('.')]

In [111]:
dir_data = Counter()
for p,_,v in xsankey_data_1:
    print(f"{p} => {v}")
    dir_data[(str(Path(p).parent), p)] += v
    for p_f, p_t in zip(Path(p).parent.parents, Path(p).parents):
        #print(f"- ({p_f}, {p_t})")
        dir_data[(str(p_f), str(p_t))] += v

dir_data

doc-src/admin.html => 2
doc-src/configuration.html => 26
doc-src/configuration.html => 4
doc-src/configuration/example.html => 2
doc-src/configuration/example.html => 1
doc-src/configuration/index.py => 4
doc-src/configuration/index.py => 1
doc-src/index.html => 14
doc-src/index.html => 1
doc-src/index.py => 15
doc-src/index.py => 3
libqtile/command.py => 5
libqtile/layout.py => 8
libqtile/manager.py => 10


Counter({('.', 'doc-src'): 73,
         ('doc-src', 'doc-src/configuration.html'): 30,
         ('.', 'libqtile'): 23,
         ('doc-src', 'doc-src/index.py'): 18,
         ('doc-src', 'doc-src/index.html'): 15,
         ('libqtile', 'libqtile/manager.py'): 10,
         ('doc-src', 'doc-src/configuration'): 8,
         ('libqtile', 'libqtile/layout.py'): 8,
         ('doc-src/configuration', 'doc-src/configuration/index.py'): 5,
         ('libqtile', 'libqtile/command.py'): 5,
         ('doc-src/configuration', 'doc-src/configuration/example.html'): 3,
         ('doc-src', 'doc-src/admin.html'): 2})

In [112]:
xsankey_data_2_dirs = [(p[0], p[1], v) for p, v in dir_data.items()]
xsankey_data_2_dirs

[('doc-src', 'doc-src/admin.html', 2),
 ('.', 'doc-src', 73),
 ('doc-src', 'doc-src/configuration.html', 30),
 ('doc-src/configuration', 'doc-src/configuration/example.html', 3),
 ('doc-src', 'doc-src/configuration', 8),
 ('doc-src/configuration', 'doc-src/configuration/index.py', 5),
 ('doc-src', 'doc-src/index.html', 15),
 ('doc-src', 'doc-src/index.py', 18),
 ('libqtile', 'libqtile/command.py', 5),
 ('.', 'libqtile', 23),
 ('libqtile', 'libqtile/layout.py', 8),
 ('libqtile', 'libqtile/manager.py', 10)]

In [113]:
xsankey_data_2 = xsankey_data_1 + xsankey_data_2_dirs
xsankey_data_2

[('doc-src/admin.html', 'type.code', 2),
 ('doc-src/configuration.html', 'type.code', 26),
 ('doc-src/configuration.html', 'type.documentation', 4),
 ('doc-src/configuration/example.html', 'type.documentation', 2),
 ('doc-src/configuration/example.html', 'type.code', 1),
 ('doc-src/configuration/index.py', 'type.code', 4),
 ('doc-src/configuration/index.py', 'type.documentation', 1),
 ('doc-src/index.html', 'type.code', 14),
 ('doc-src/index.html', 'type.documentation', 1),
 ('doc-src/index.py', 'type.code', 15),
 ('doc-src/index.py', 'type.documentation', 3),
 ('libqtile/command.py', 'type.code', 5),
 ('libqtile/layout.py', 'type.code', 8),
 ('libqtile/manager.py', 'type.code', 10),
 ('doc-src', 'doc-src/admin.html', 2),
 ('.', 'doc-src', 73),
 ('doc-src', 'doc-src/configuration.html', 30),
 ('doc-src/configuration', 'doc-src/configuration/example.html', 3),
 ('doc-src', 'doc-src/configuration', 8),
 ('doc-src/configuration', 'doc-src/configuration/index.py', 5),
 ('doc-src', 'doc-src

In [114]:
xsankey_2 = hv.Sankey(xsankey_data_2).opts(edge_color_index=1, width=800, height=400)
xsankey_2

In [115]:
reduced_data = Counter()
for p,l,v in xsankey_data_1:
    print(f"{p} ={v}=> {l}")
    reduced_data[(str(Path(p).parent), l)] += v
    for p_f, p_t in zip(Path(p).parent.parents, Path(p).parents):
        #print(f"- ({p_f}, {p_t})")
        reduced_data[(str(p_f), str(p_t))] += v

reduced_data

doc-src/admin.html =2=> type.code
doc-src/configuration.html =26=> type.code
doc-src/configuration.html =4=> type.documentation
doc-src/configuration/example.html =2=> type.documentation
doc-src/configuration/example.html =1=> type.code
doc-src/configuration/index.py =4=> type.code
doc-src/configuration/index.py =1=> type.documentation
doc-src/index.html =14=> type.code
doc-src/index.html =1=> type.documentation
doc-src/index.py =15=> type.code
doc-src/index.py =3=> type.documentation
libqtile/command.py =5=> type.code
libqtile/layout.py =8=> type.code
libqtile/manager.py =10=> type.code


Counter({('.', 'doc-src'): 73,
         ('doc-src', 'type.code'): 57,
         ('libqtile', 'type.code'): 23,
         ('.', 'libqtile'): 23,
         ('doc-src', 'type.documentation'): 8,
         ('doc-src', 'doc-src/configuration'): 8,
         ('doc-src/configuration', 'type.code'): 5,
         ('doc-src/configuration', 'type.documentation'): 3})

In [116]:
xsankey_data_3 = [(p[0], p[1], v) for p, v in reduced_data.items()]
xsankey_data_3

[('doc-src', 'type.code', 57),
 ('.', 'doc-src', 73),
 ('doc-src', 'type.documentation', 8),
 ('doc-src/configuration', 'type.documentation', 3),
 ('doc-src', 'doc-src/configuration', 8),
 ('doc-src/configuration', 'type.code', 5),
 ('libqtile', 'type.code', 23),
 ('.', 'libqtile', 23)]

In [117]:
xsankey_3 = hv.Sankey(xsankey_data_3).opts(edge_color_index=1, width=800, height=400)
xsankey_3

In [118]:
pn.Column(
    pn.panel(xsankey_1.opts(width=700, height=300, border=2)),
    pn.panel(xsankey_2.opts(width=700, height=300, border=2)),
    pn.panel(xsankey_3.opts(width=700, height=300, border=2)),
)#.save('save.png')

In [119]:
# hand-edited, based on xsankey_data_3
xsankey_data_x4 = [('__doc-src__', 'type.code', 57),
 ('.', 'doc-src', 73),
 ('doc-src', '__doc-src__', 57+8),
 ('__doc-src__', 'type.documentation', 8),
 ('doc-src/configuration', 'type.documentation', 3),
 ('doc-src', 'doc-src/configuration', 8),
 ('doc-src/configuration', 'type.code', 5),
 ('libqtile', 'type.code', 23),
 ('.', 'libqtile', 23)]
xsankey_data_x4

[('__doc-src__', 'type.code', 57),
 ('.', 'doc-src', 73),
 ('doc-src', '__doc-src__', 65),
 ('__doc-src__', 'type.documentation', 8),
 ('doc-src/configuration', 'type.documentation', 3),
 ('doc-src', 'doc-src/configuration', 8),
 ('doc-src/configuration', 'type.code', 5),
 ('libqtile', 'type.code', 23),
 ('.', 'libqtile', 23)]

In [120]:
l1 = [('a',1),('b',2)]
l2 = l1.copy()

l2[0] = ('A', 11)
print(f"{l1=}, {l2=}")

l1=[('a', 1), ('b', 2)], l2=[('A', 11), ('b', 2)]


In [121]:
s = set("abc")
s.add("d")
s

{'a', 'b', 'c', 'd'}

In [122]:
xsankey_data_4 = xsankey_data_3.copy()
xsankey_data_sets = {
    'dir-to-dir': set(),
    'dir-to-line': set(),
}
xsankey_data_cntr = Counter()
xsankey_data_line = defaultdict(set)

for p_f, p_t, v in xsankey_data_4:
    if p_t.startswith('type.'):
        xsankey_data_sets['dir-to-line'].add(p_f)
        xsankey_data_cntr[p_f] += v
        xsankey_data_line[p_f].add(p_t)
    else:
        xsankey_data_sets['dir-to-dir'].add(p_f)

xsankey_data_sets['intersection'] = xsankey_data_sets['dir-to-dir'] & xsankey_data_sets['dir-to-line']

print(f"dir-to-dir:   {xsankey_data_sets['dir-to-dir']}")
print(f"dir-to-line:  {xsankey_data_sets['dir-to-line']}")
print(f"intersection: {xsankey_data_sets['dir-to-dir'] & xsankey_data_sets['dir-to-line']}")
print( "-------------")
print(f"{xsankey_data_cntr}")
xsankey_data_extracted = {k: v for k, v in xsankey_data_cntr.items() if k in xsankey_data_sets['intersection']}
print(f"{Counter(xsankey_data_extracted)}")
print( "-------------")
print(f"{xsankey_data_line}")

dir-to-dir:   {'.', 'doc-src'}
dir-to-line:  {'libqtile', 'doc-src/configuration', 'doc-src'}
intersection: {'doc-src'}
-------------
Counter({'doc-src': 65, 'libqtile': 23, 'doc-src/configuration': 8})
Counter({'doc-src': 65})
-------------
defaultdict(<class 'set'>, {'doc-src': {'type.code', 'type.documentation'}, 'doc-src/configuration': {'type.documentation', 'type.code'}, 'libqtile': {'type.code'}})


In [123]:
xsankey_cntr_4 = reduced_data.copy()
xsankey_cntr_4

Counter({('.', 'doc-src'): 73,
         ('doc-src', 'type.code'): 57,
         ('libqtile', 'type.code'): 23,
         ('.', 'libqtile'): 23,
         ('doc-src', 'type.documentation'): 8,
         ('doc-src', 'doc-src/configuration'): 8,
         ('doc-src/configuration', 'type.code'): 5,
         ('doc-src/configuration', 'type.documentation'): 3})

In [124]:
xsankey_cntr_4 = reduced_data.copy()

for d in xsankey_data_sets['intersection']:
    print(f"{d!r}:")
    for l in xsankey_data_line[d]:
        print(f"    {l!r}")
        xsankey_cntr_4[(f"__{d}__", l)] = xsankey_cntr_4[(d, l)]
        xsankey_cntr_4[(d, f"__{d}__")] += xsankey_cntr_4[(d, l)]
        del xsankey_cntr_4[(d, l)]

xsankey_cntr_4

'doc-src':
    'type.code'
    'type.documentation'


Counter({('.', 'doc-src'): 73,
         ('doc-src', '__doc-src__'): 65,
         ('__doc-src__', 'type.code'): 57,
         ('libqtile', 'type.code'): 23,
         ('.', 'libqtile'): 23,
         ('doc-src', 'doc-src/configuration'): 8,
         ('__doc-src__', 'type.documentation'): 8,
         ('doc-src/configuration', 'type.code'): 5,
         ('doc-src/configuration', 'type.documentation'): 3})

In [125]:
xsankey_data_4 = [(p[0], p[1], v) for p, v in xsankey_cntr_4.items()]
xsankey_data_4

[('.', 'doc-src', 73),
 ('doc-src/configuration', 'type.documentation', 3),
 ('doc-src', 'doc-src/configuration', 8),
 ('doc-src/configuration', 'type.code', 5),
 ('libqtile', 'type.code', 23),
 ('.', 'libqtile', 23),
 ('__doc-src__', 'type.code', 57),
 ('doc-src', '__doc-src__', 65),
 ('__doc-src__', 'type.documentation', 8)]

In [126]:
xsankey_4 = hv.Sankey(xsankey_data_4).opts(edge_color_index=1, width=800, height=400)
xsankey_4

In [127]:
xsankey_x4 = hv.Sankey(xsankey_data_x4).opts(edge_color_index=1, width=800, height=400, title="Hand-edited data")
xsankey_x4

In [128]:
xsankey_cntr_5 = xsankey_cntr_4.copy()

#to_delete = lambda x: x == 'doc-src'  # should not pass the check
to_delete = lambda x: '/' in x
can_delete = True

xsankey_5_info = {
    'delete-contents': defaultdict(dict),
    'to-prev': {}
}

# sanity check
for k, v in xsankey_cntr_5.items():
    (p_f, p_t) = k
    if to_delete(p_f):
        if not p_t.startswith('type.'):
            print(f"{p_f!r} is not final: {p_f!r} =[{v}]=> {p_t!r}")
            can_delete = False
        else:
            xsankey_5_info['delete-contents'][p_f][p_t] = v
            
    if to_delete(p_t):
        xsankey_5_info['to-prev'][p_t] = p_f

print(f"{can_delete=}")

if can_delete:
    to_prev_dict = {}
    for p_t, p_f in xsankey_5_info['to-prev'].items():
        if (p_f, f"__{p_f}__") in xsankey_cntr_5:
            #print(f"({p_f}, __{p_f}__): {xsankey_cntr_5[(p_f, f'__{p_f}__')]}")
            to_prev_dict[f"__{p_f}__"] = p_f

    xsankey_5_info['to-prev'] |= to_prev_dict

    for k, v in xsankey_cntr_5.items():
        (p_f, p_t) = k
        if (p_f in xsankey_5_info['to-prev'] and
            p_t.startswith('type.')):
            xsankey_5_info['delete-contents'][p_f][p_t] = v

    for k, v in xsankey_cntr_4.items():  # we are changing xsankey_cntr_5
        (p_f, p_t) = k
        if p_t in xsankey_5_info['to-prev'] and p_f == xsankey_5_info['to-prev'][p_t]:
            print(f"({p_f}, {p_t}): {v})")
            for kk, vv in xsankey_5_info['delete-contents'][p_t].items():
                xsankey_cntr_5[(p_f, kk)] += vv
                print(f"  ({p_f}, {kk}) += {vv} => {xsankey_cntr_5[(p_f, kk)]}")
            del xsankey_cntr_5[(p_f, p_t)]
        if p_f in xsankey_5_info['to-prev']:
            del xsankey_cntr_5[(p_f, p_t)]

print(f"{xsankey_5_info=}")

xsankey_cntr_5

can_delete=True
(doc-src, doc-src/configuration): 8)
  (doc-src, type.documentation) += 3 => 3
  (doc-src, type.code) += 5 => 5
(doc-src, __doc-src__): 65)
  (doc-src, type.code) += 57 => 62
  (doc-src, type.documentation) += 8 => 11
xsankey_5_info={'delete-contents': defaultdict(<class 'dict'>, {'doc-src/configuration': {'type.documentation': 3, 'type.code': 5}, '__doc-src__': {'type.code': 57, 'type.documentation': 8}}), 'to-prev': {'doc-src/configuration': 'doc-src', '__doc-src__': 'doc-src'}}


Counter({('.', 'doc-src'): 73,
         ('doc-src', 'type.code'): 62,
         ('libqtile', 'type.code'): 23,
         ('.', 'libqtile'): 23,
         ('doc-src', 'type.documentation'): 11})

In [129]:
xsankey_data_5 = [(p[0], p[1], v) for p, v in xsankey_cntr_5.items()]
xsankey_data_5

[('.', 'doc-src', 73),
 ('libqtile', 'type.code', 23),
 ('.', 'libqtile', 23),
 ('doc-src', 'type.documentation', 11),
 ('doc-src', 'type.code', 62)]

In [130]:
xsankey_5 = hv.Sankey(xsankey_data_5).opts(edge_color_index=1, width=800, height=400)
xsankey_5

In [131]:
(
    xsankey_1.opts(width=700, height=300, border=2, title="Changed file to line type") +
    xsankey_2.opts(width=700, height=300, border=2, title="Dir to file to line type") +
    xsankey_3.opts(width=700, height=250, border=2, title="Directory to line type") + 
    xsankey_4.opts(width=700, height=250, border=2, title="Dir to line type, with __dir__") +
    xsankey_5.opts(width=700, height=250, border=2, title="One level less")
).cols(1)

## Larger single commit example - automatic

Largest commit with respect to the number of changed files:

In [132]:
xelem_large = lines_data['data/examples/annotations/qtile']['all_authors-no_merges']['c8fdbedd248173d3f415b411792a2980d7a448d2.v2.json']
len(xelem_large)

238

In [133]:
list(xelem_large.keys())[:10]

['libqtile/backend/__init__.py',
 'libqtile/backend/base.py',
 'libqtile/backend/wayland/core.py',
 'libqtile/backend/wayland/keyboard.py',
 'libqtile/backend/wayland/output.py',
 'libqtile/backend/wayland/window.py',
 'libqtile/backend/wayland/wlrq.py',
 'libqtile/backend/x11/core.py',
 'libqtile/backend/x11/drawer.py',
 'libqtile/backend/x11/window.py']

In [134]:
list(xelem_large.keys())[-10:]

['test/widgets/test_textbox.py',
 'test/widgets/test_thermal_zone.py',
 'test/widgets/test_volume.py',
 'test/widgets/test_widget_init_configure.py',
 'test/widgets/test_widgetbox.py',
 'test/widgets/test_window_count.py',
 'test/widgets/test_windowname.py',
 'test/widgets/test_windowtabs.py',
 'test/widgets/test_wlan.py',
 'tox.ini']

Create simple Sankey plot from file names of changed files to line types, no processing

In [135]:
def line_stats_to_sankey_triple(data: dict) -> list[tuple[str, str, int]]:
    return  [
        (path, line_type, val) 
        for path, data in data.items()
        for line_type, val in data['+/-'].items()
        if line_type.startswith('type.')
    ]

xsankey_data_large_1 = line_stats_to_sankey_triple(xelem_large)
xsankey_data_large_1[:5]

[('libqtile/backend/__init__.py', 'type.code', 4),
 ('libqtile/backend/base.py', 'type.code', 72),
 ('libqtile/backend/base.py', 'type.documentation', 4),
 ('libqtile/backend/wayland/core.py', 'type.code', 109),
 ('libqtile/backend/wayland/keyboard.py', 'type.code', 4)]

In [136]:
def sankey_plot_from_triples(sankey_data: list[tuple[str, str, int]], width: int = 800, height: int = 400) -> hv.Sankey:
    return hv.Sankey(sankey_data).opts(edge_color_index=1, width=width, height=height)

In [137]:
xsankey_large_1 = sankey_plot_from_triples(xsankey_data_large_1).opts(title="Changed file to line type - for 238 files")
xsankey_large_1

**TODO:** Either convert to `Counter()` indexed with tuple of two strings, or create such structure directly - instead of list of 3-element tuples that `line_stats_to_sankey_triple()` creates

In [138]:
def tuples_split_dirs_counter(data_list: list[tuple[str, str, int]]) -> Counter:
    dir_data = Counter()
    
    for p,_,v in data_list:
        #print(f"{p} => {v}")
        dir_data[(str(Path(p).parent), p)] += v
        for p_f, p_t in zip(Path(p).parent.parents, Path(p).parents):
            #print(f"- ({p_f}, {p_t})")
            dir_data[(str(p_f), str(p_t))] += v

    return dir_data

In [139]:
xsankey_cntr_large_2_dir = tuples_split_dirs_counter(xsankey_data_large_1)
len(xsankey_cntr_large_2_dir)

258

In [140]:
def sankey_triples_from_counter(data_counter: Counter) -> list[tuple[str, str, int]]:
    return [(p[0], p[1], v) for p, v in data_counter.items()]

xsankey_data_large_2_dirs = sankey_triples_from_counter(xsankey_cntr_large_2_dir)
xsankey_data_large_2_dirs[:5]

[('libqtile/backend', 'libqtile/backend/__init__.py', 4),
 ('libqtile', 'libqtile/backend', 5667),
 ('.', 'libqtile', 11597),
 ('libqtile/backend', 'libqtile/backend/base.py', 76),
 ('libqtile/backend/wayland', 'libqtile/backend/wayland/core.py', 109)]

In [141]:
def sankey_counter_from_triples(data_list: list[tuple[str, str, int]]) -> Counter:
    return Counter({(p_f, p_t): v for p_f, p_t, v in data_list})

sankey_counter_from_triples(xsankey_data_large_2_dirs[:5])

Counter({('.', 'libqtile'): 11597,
         ('libqtile', 'libqtile/backend'): 5667,
         ('libqtile/backend/wayland', 'libqtile/backend/wayland/core.py'): 109,
         ('libqtile/backend', 'libqtile/backend/base.py'): 76,
         ('libqtile/backend', 'libqtile/backend/__init__.py'): 4})

In [142]:
xsankey_data_large_2 = xsankey_data_large_1 + xsankey_data_large_2_dirs
xsankey_data_large_2[::40]

[('libqtile/backend/__init__.py', 'type.code', 4),
 ('libqtile/core/state.py', 'type.documentation', 1),
 ('libqtile/layout/xmonad.py', 'type.documentation', 4),
 ('libqtile/widget/canto.py', 'type.documentation', 1),
 ('libqtile/widget/keyboardkbdd.py', 'type.code', 35),
 ('libqtile/widget/statusnotifier.py', 'type.code', 124),
 ('test/helpers.py', 'type.documentation', 4),
 ('test/test_popup.py', 'type.code', 12),
 ('test/widgets/test_moc.py', 'type.code', 62),
 ('libqtile/backend/x11', 'libqtile/backend/x11/core.py', 84),
 ('libqtile/interactive', 'libqtile/interactive/iqshell_kernel.py', 65),
 ('libqtile/widget', 'libqtile/widget/base.py', 163),
 ('libqtile/widget', 'libqtile/widget/pulseaudio_ffi.py', 10),
 ('test', 'test/helpers.py', 43),
 ('test', 'test/test_window.py', 46),
 ('test/widgets', 'test/widgets/test_volume.py', 20)]

In [143]:
xsankey_large_2 = sankey_plot_from_triples(xsankey_data_large_2).opts(title="Dir to file to line type - for 238 files")
xsankey_large_2

In [144]:
def tuples_dirs_only_counter(data_list: list[tuple[str, str, int]]) -> Counter:
    reduced_data = Counter()
    
    for p,l,v in data_list:
        #print(f"{p} ={v}=> {l}")
        reduced_data[(str(Path(p).parent), l)] += v
        for p_f, p_t in zip(Path(p).parent.parents, Path(p).parents):
            #print(f"- ({p_f}, {p_t})")
            reduced_data[(str(p_f), str(p_t))] += v

    return reduced_data

In [145]:
xsankey_cntr_large_3_dir = tuples_dirs_only_counter(xsankey_data_large_1)
len(xsankey_cntr_large_3_dir)

59

In [146]:
xsankey_data_large_3 = sankey_triples_from_counter(xsankey_cntr_large_3_dir)
xsankey_data_large_3[:5]

[('libqtile/backend', 'type.code', 76),
 ('libqtile', 'libqtile/backend', 5667),
 ('.', 'libqtile', 11597),
 ('libqtile/backend', 'type.documentation', 4),
 ('libqtile/backend/wayland', 'type.code', 268)]

In [147]:
xsankey_large_3 = sankey_plot_from_triples(xsankey_data_large_3).opts(
    title=f"Directory to line type - for ≈{len(xsankey_data_large_3)} dirs",
)
xsankey_large_3

In [148]:
def add_dashdash_dirs_to_counter(data_counter: Counter) -> Counter:
    res = data_counter.copy()

    xsankey_data_sets = {
        'dir-to-dir': set(),
        'dir-to-line': set(),
    }
    #xsankey_data_cntr = Counter()
    xsankey_data_line = defaultdict(set)

    for (p_f, p_t), v in data_counter.items():
        if p_t.startswith('type.'):
            xsankey_data_sets['dir-to-line'].add(p_f)
            #xsankey_data_cntr[p_f] += v
            xsankey_data_line[p_f].add(p_t)
        else:
            xsankey_data_sets['dir-to-dir'].add(p_f)

    xsankey_data_sets['intersection'] = xsankey_data_sets['dir-to-dir'] & xsankey_data_sets['dir-to-line']

    #xsankey_data_extracted = {k: v for k, v in xsankey_data_cntr.items() if k in xsankey_data_sets['intersection']}

    for d in xsankey_data_sets['intersection']:
        #print(f"{d!r}:")
        for l in xsankey_data_line[d]:
            #print(f"    {l!r}")
            res[(f"__{d}__", l)]  = res[(d, l)]
            res[(d, f"__{d}__")] += res[(d, l)]
            del res[(d, l)]

    return res

In [149]:
len(xsankey_cntr_large_3_dir)

59

In [150]:
xsankey_cntr_large_4_dir = add_dashdash_dirs_to_counter(xsankey_cntr_large_3_dir)
len(xsankey_cntr_large_4_dir)

63

In [151]:
xsankey_data_large_4 = sankey_triples_from_counter(xsankey_cntr_large_4_dir)
xsankey_data_large_4[::5]

[('libqtile', 'libqtile/backend', 5667),
 ('libqtile/backend/x11', 'type.code', 5300),
 ('libqtile/core', 'type.code', 253),
 ('libqtile/extension', 'type.documentation', 1),
 ('libqtile', 'libqtile/layout', 630),
 ('libqtile/scripts', 'type.code', 400),
 ('libqtile/widget', 'type.documentation', 80),
 ('test/backend/x11', 'type.code', 200),
 ('test/extension', 'type.code', 49),
 ('test/layouts', 'type.documentation', 9),
 ('test', 'test/widgets', 956),
 ('__.__', 'type.code', 11),
 ('__libqtile__', 'type.documentation', 40)]

In [152]:
xsankey_large_4 = sankey_plot_from_triples(xsankey_data_large_4).opts(
    title=f"Dir to line type, with __dir__ - for ≈{len(xsankey_data_large_4)} dirs",
)
xsankey_large_4

In [153]:
def reduce_sankey_from_tail(data_counter: Counter) -> Counter:
    res = data_counter.copy()

    print("reduce_sankey_from_tail():")

    max_level = 0
    for (p_f, _) in data_counter.keys():
        n_dashes = p_f.count('/')
        if n_dashes > max_level:
            max_level = n_dashes

    print(f"  {max_level=}")
    
    to_delete = lambda x: x.count('/') == max_level
    can_delete = True

    xsankey_info = {
        'delete-contents': defaultdict(dict),
        'to-prev': {}
    }

    # sanity check
    for k, v in data_counter.items():
        (p_f, p_t) = k
        if to_delete(p_f):
            if not p_t.startswith('type.'):
                print(f"  {p_f!r} is not final: {p_f!r} =[{v}]=> {p_t!r}")
                can_delete = False
            else:
                xsankey_info['delete-contents'][p_f][p_t] = v
            
        if to_delete(p_t):
            xsankey_info['to-prev'][p_t] = p_f

    print(f"  {can_delete=}")
    
    if can_delete:
        to_prev_dict = {}
        for p_t, p_f in xsankey_info['to-prev'].items():
            if (p_f, f"__{p_f}__") in data_counter:
                #print(f"({p_f}, __{p_f}__): {xsankey_cntr_5[(p_f, f'__{p_f}__')]}")
                to_prev_dict[f"__{p_f}__"] = p_f

        print(f"  extra 'to-prev':{len(to_prev_dict)}")
        xsankey_info['to-prev'] |= to_prev_dict

        for k, v in data_counter.items():
            (p_f, p_t) = k
            if (p_f in xsankey_info['to-prev'] and
                p_t.startswith('type.')):
                xsankey_info['delete-contents'][p_f][p_t] = v

        for k, v in data_counter.items():  # we are changing res
            (p_f, p_t) = k
            if p_t in xsankey_info['to-prev'] and p_f == xsankey_info['to-prev'][p_t]:
                #print(f"({p_f}, {p_t}): {v})")
                for kk, vv in xsankey_info['delete-contents'][p_t].items():
                    res[(p_f, kk)] += vv
                    #print(f"  ({p_f}, {kk}) += {vv} => {res[(p_f, kk)]}")
                del res[(p_f, p_t)]
            if p_f in xsankey_info['to-prev']:
                del res[(p_f, p_t)]

    return res

In [154]:
len(xsankey_cntr_large_4_dir)

63

In [155]:
xsankey_cntr_large_5_dir = reduce_sankey_from_tail(xsankey_cntr_large_4_dir)
len(xsankey_cntr_large_5_dir)

reduce_sankey_from_tail():
  max_level=2
  can_delete=True
  extra 'to-prev':1


53

In [156]:
xsankey_data_large_5 = sankey_triples_from_counter(xsankey_cntr_large_5_dir)
xsankey_data_large_5[::5]

[('libqtile', 'libqtile/backend', 5667),
 ('libqtile', 'libqtile/core', 268),
 ('libqtile/interactive', 'type.code', 82),
 ('libqtile/layout', 'type.code', 617),
 ('libqtile', 'libqtile/scripts', 409),
 ('test', 'test/backend', 222),
 ('test', 'test/extension', 50),
 ('test/scripts', 'type.code', 28),
 ('test/widgets', 'type.documentation', 28),
 ('__test__', 'type.documentation', 29),
 ('libqtile/backend', 'type.documentation', 23)]

In [157]:
xsankey_large_5 = sankey_plot_from_triples(xsankey_data_large_5).opts(
    title=f"One less level - for ≈{len(xsankey_data_large_5)} dirs",
)
xsankey_large_5

In [158]:
def reduce_sankey_thin_out(data_counter: Counter, threshold_ratio = 0.005) -> Counter:
    print("reduce_sankey_thin_out():")

    total_lines = 0
    for (p_f, p_t), v in data_counter.items():
        if p_f != '.':
            continue
        total_lines += v

    print(f"  {total_lines=}")
    print(f"  threshold={threshold_ratio}*{total_lines}={threshold_ratio*total_lines}")

    data_info = {
        'to-remove': set()
    }
    
    for (p_f, p_t), v in data_counter.items():
        if v < threshold_ratio*total_lines:
            print(f"  - ({p_f}, {p_t}): {v} {'*' if p_t.startswith('type.') else ' '}")
            data_info['to-remove'].add(p_f)

    data_info |= {
        'delete-contents': defaultdict(dict),
        'to-prev': {},
        'can-remove': set(),
    }

    print("  gathering data:")
    
    for (p_f, p_t), v in data_counter.items():
        # want to remove, and can remove
        if p_f in data_info['to-remove'] and p_t.startswith('type.'):
            data_info['delete-contents'][p_f][p_t] = v

    for (p_f, p_t), v in data_counter.items():
        if p_t in data_info['to-remove'] and p_t in data_info['delete-contents']:
            data_info['to-prev'][p_t] = p_f
            
            total_width = 0
            for v in data_info['delete-contents'][p_t].values():
                total_width += v
            if total_width < threshold_ratio*total_lines:
                if f"__{p_f}__" == p_t:
                    print(f"  ! ({p_f}) -> ({p_t}) -> {data_info['delete-contents'][p_t]}")
                else:
                    print(f"  + ({p_f}) => ({p_t}) => {data_info['delete-contents'][p_t]}")
                    data_info['can-remove'].add(p_t)
            else:
                print(f"  - ({p_f}) -> ({p_t}) -> {data_info['delete-contents'][p_t]}")

    ## -------------------------------------------------------
    ## actual removal
    res = data_counter.copy()

    print("  deleting/compressing:")
    for k, v in data_counter.items():  # we are changing res
        (p_f, p_t) = k
        if p_t in data_info['can-remove']:
            if p_t in data_info['to-prev'] and p_f == data_info['to-prev'][p_t]:
                print(f"  - ({p_f}, {p_t}): {v})")
                for kk, vv in data_info['delete-contents'][p_t].items():
                    res[(p_f, kk)] += vv
                    #print(f"  ({p_f}, {kk}) += {vv} => {res[(p_f, kk)]}")
                del res[(p_f, p_t)]

        if p_f in data_info['can-remove']:
            if p_f in data_info['to-prev']:
                del res[(p_f, p_t)]

    return res

In [159]:
len(xsankey_cntr_large_5_dir)

53

In [160]:
xsankey_cntr_large_6_dir = reduce_sankey_thin_out(xsankey_cntr_large_5_dir)
len(xsankey_cntr_large_6_dir)

reduce_sankey_thin_out():
  total_lines=16272
  threshold=0.005*16272=81.36
  - (libqtile/core, type.documentation): 15 *
  - (libqtile/extension, type.code): 50 *
  - (libqtile, libqtile/extension): 51  
  - (libqtile/extension, type.documentation): 1 *
  - (libqtile/interactive, type.documentation): 1 *
  - (libqtile/layout, type.documentation): 13 *
  - (libqtile/resources, type.documentation): 21 *
  - (libqtile/scripts, type.documentation): 9 *
  - (libqtile/widget, type.documentation): 80 *
  - (test/configs, type.code): 15 *
  - (test, test/configs): 15  
  - (test/extension, type.code): 49 *
  - (test, test/extension): 50  
  - (test/extension, type.documentation): 1 *
  - (test/layouts, type.documentation): 9 *
  - (test/scripts, type.code): 28 *
  - (test, test/scripts): 30  
  - (test/scripts, type.documentation): 2 *
  - (test/widgets, type.documentation): 28 *
  - (__.__, type.code): 11 *
  - (., __.__): 11  
  - (__test__, type.documentation): 29 *
  - (__libqtile__, type

46

In [161]:
xsankey_data_large_6 = sankey_triples_from_counter(xsankey_cntr_large_6_dir)
xsankey_data_large_6[::5]

[('libqtile', 'libqtile/backend', 5667),
 ('libqtile', 'libqtile/core', 268),
 ('libqtile/layout', 'type.documentation', 13),
 ('libqtile/resources', 'type.documentation', 21),
 ('libqtile', 'libqtile/widget', 3257),
 ('test', 'test/layouts', 1357),
 ('__.__', 'type.code', 11),
 ('__libqtile__', 'type.documentation', 40),
 ('test/backend', 'type.code', 216),
 ('test', 'type.documentation', 3)]

In [162]:
xsankey_large_6 = sankey_plot_from_triples(xsankey_data_large_6).opts(
    title=f"One less level, and thinned out - for ≈{len(xsankey_data_large_6)} dirs",
)
xsankey_large_6

In [163]:
threshold = 0.05

xsankey_cntr_large_6b_dir = reduce_sankey_thin_out(xsankey_cntr_large_5_dir, threshold_ratio=threshold)
xsankey_data_large_6b = sankey_triples_from_counter(xsankey_cntr_large_6b_dir)

xsankey_large_6b = sankey_plot_from_triples(xsankey_data_large_6b).opts(
    title=f"One less level, and thinned out (thresh={100.0*threshold:.1f}%) - for ≈{len(xsankey_data_large_6b)} dirs",
)
xsankey_large_6b

reduce_sankey_thin_out():
  total_lines=16272
  threshold=0.05*16272=813.6
  - (libqtile/command, type.code): 102 *
  - (libqtile, libqtile/command): 102  
  - (libqtile/core, type.code): 253 *
  - (libqtile, libqtile/core): 268  
  - (libqtile/core, type.documentation): 15 *
  - (libqtile/extension, type.code): 50 *
  - (libqtile, libqtile/extension): 51  
  - (libqtile/extension, type.documentation): 1 *
  - (libqtile/interactive, type.code): 82 *
  - (libqtile, libqtile/interactive): 83  
  - (libqtile/interactive, type.documentation): 1 *
  - (libqtile/layout, type.documentation): 13 *
  - (libqtile, libqtile/layout): 630  
  - (libqtile/layout, type.code): 617 *
  - (libqtile/resources, type.code): 90 *
  - (libqtile, libqtile/resources): 111  
  - (libqtile/resources, type.documentation): 21 *
  - (libqtile/scripts, type.code): 400 *
  - (libqtile, libqtile/scripts): 409  
  - (libqtile/scripts, type.documentation): 9 *
  - (libqtile/widget, type.documentation): 80 *
  - (test, t

In [164]:
(
    xsankey_large_1.opts(width=700, height=300, border=2) +
    xsankey_large_2.opts(width=700, height=300, border=2) +
    xsankey_large_3.opts(width=700, height=250, border=2) + 
    xsankey_large_4.opts(width=700, height=250, border=2) +
    xsankey_large_5.opts(width=700, height=250, border=2)
).cols(1)

In [165]:
( 
    xsankey_large_4.opts(width=700, height=250, border=2) +
    xsankey_large_5.opts(width=700, height=250, border=2) +
    xsankey_large_6.opts(width=700, height=250, border=2) +
    xsankey_large_6b.opts(width=700, height=250, border=2)
).cols(1)

## Styling Sankey (single large commit, processed)

In [166]:
xsankey_data_large_6b[:5]

[('libqtile', 'libqtile/backend', 5667),
 ('.', 'libqtile', 11597),
 ('libqtile/widget', 'type.code', 3177),
 ('libqtile', 'libqtile/widget', 3257),
 ('libqtile/widget', 'type.documentation', 80)]

In [167]:
xsankey_large_6b_df = pd.DataFrame.from_records(xsankey_data_large_6b, columns=['source', 'target', 'count'])
xsankey_large_6b_df.head(5)

Unnamed: 0,source,target,count
0,libqtile,libqtile/backend,5667
1,.,libqtile,11597
2,libqtile/widget,type.code,3177
3,libqtile,libqtile/widget,3257
4,libqtile/widget,type.documentation,80


In [168]:
xsankey_large_6b_df['from'] = xsankey_large_6b_df['source']\
    .str.rsplit(pat='/', n=2)\
    .map(lambda l: l[-1], na_action='ignore')\
    .str.replace(r'^\.$', '{qtile}', regex=True)
xsankey_large_6b_df['to'] = xsankey_large_6b_df['target']\
    .str.rsplit(pat='/', n=2)\
    .map(lambda l: l[-1], na_action='ignore')\
    .str.replace(r'^type\.(.*)$', lambda m: f"[{m.group(1)}]", regex=True)

xsankey_large_6b_df.head(5)

Unnamed: 0,source,target,count,from,to
0,libqtile,libqtile/backend,5667,libqtile,backend
1,.,libqtile,11597,{qtile},libqtile
2,libqtile/widget,type.code,3177,widget,[code]
3,libqtile,libqtile/widget,3257,libqtile,widget
4,libqtile/widget,type.documentation,80,widget,[documentation]


In [169]:
def color_from_row(row: pd.Series) -> str:
    if row['target'] == 'type.code':
        return 'blue'
    if row['target'] == 'type.documentation':
        return 'green'
    if row['target'] == 'type.test':
        return 'red'

    if row['source'].startswith('__') and row['source'].endswith('__'):
        return 'brown'

    return 'gray'


xsankey_large_6b_df['color'] = xsankey_large_6b_df.apply(
    color_from_row,
    axis=1,
    raw=False,
)

xsankey_large_6b_df.head(5)

Unnamed: 0,source,target,count,from,to,color
0,libqtile,libqtile/backend,5667,libqtile,backend,gray
1,.,libqtile,11597,{qtile},libqtile,gray
2,libqtile/widget,type.code,3177,widget,[code],blue
3,libqtile,libqtile/widget,3257,libqtile,widget,gray
4,libqtile/widget,type.documentation,80,widget,[documentation],green


`hv.Sankey` lacks good documentation:
- the Reference Gallery documentation shows example with `hv.Dataset`, not with `pd.DataFrame`<br>
  https://holoviews.org/reference/elements/bokeh/Sankey.html
- the Gallery with Demos shows use of `pd.DataFrame`, but is poor in shown options<br>
  https://holoviews.org/gallery/demos/bokeh/energy_sankey.html
- reference documentation lacks details; most of parameters are inherited<br>
  https://holoviews.org/reference_manual/holoviews.plotting.bokeh.html#sankey-module <br>
  https://holoviews.org/reference_manual/holoviews.element.html#module-holoviews.element.sankey <br>
  https://holoviews.org/reference_manual/holoviews.__init__.html#holoviews.__init__.Sankey

The [Sankey Diagram](https://python-graph-gallery.com/sankey-diagram/) section in python-graph-gallery.com
has an [example from Plotly](https://python-graph-gallery.com/sankey-diagram-with-python-and-plotly/),
but not from HoloViews.

Good examples of drawing Sankey graphs / plots with Python:
- **[Setting up Holoviews for creating Sankey diagrams in 2023](https://medium.com/@johnomoluabimail/fix-setting-up-holoviews-for-creating-sankey-diagrams-in-2023-python-cbbdef47d3ed)** - using `pd.DataFrame`
- [Sankey Diagrams in Python](https://medium.com/@cbkwgl/sankey-diagrams-in-python-fc9673465ccb) - simpler example, using `pd.DataFrame`, and using indexed numbers (from, to, value) and separate node list

Other references:
- https://en.wikipedia.org/wiki/Sankey_diagram
- https://www.data-to-viz.com/graph/sankey.html
- https://www.sankey-diagrams.com/

In [170]:
hv.Sankey(
    xsankey_large_6b_df[['from','to','count','color']],
    label="One less level, and thinned out",
).opts(
    label_position='outer',  # valid options include: '[left, right, outer, inner]'
    edge_color='color',
    node_color='index',
    cmap='tab20',
)

## Extracting data for Sankey - code

Code from Piotr Przymus, used in _"Wolves in Developers’ Clothing: Analyzing the Software Engineering Practice in the XZ Utils Supply Chain Attack"_

In [171]:
def plot_sankey_lines(global_commits_files_name_line_counter, output):
    path_counter = Counter()
    for f,v in global_commits_files_name_line_counter.items():
        components = f[0].split("/")
        type_change = f[1]
    
        if "po" in f[0]:
            type_change = "translation"
        if "test" in f[0] and type_change == "code":
            type_change = "test"
    
        if len(components) == 1:
            path_counter.update({ ("project", "__/__"): v})
            path_counter.update({ ("__/__", f"[{type_change}]"): v})
            continue
        else:
            if components[0]:
                path_counter.update({("project", components[0]): v})
            
        if len(components) == 2:
            path_counter.update({ (components[0], "__" + components[0] + "__"): v})
            path_counter.update({ ("__" + components[0] + "__", f"[{type_change}]"): v})
            continue
    
    
        for i in range(len(components)-2):
            if (not components[i] and components[i+1]):
                continue
            path_counter.update({tuple(components[i:i+2]):v})
        path_counter.update({(components[-2], f"[{type_change}]"):v})
    if not path_counter:
        return
    output.append("""
# Sankey files -> lines -> annotation
mermaid
---
config:
  sankey:
    showValues: false
---

sankey-beta

""")
    for k, v in path_counter.items():
        output.append(f"{','.join(k)},{v}\n")
    output.append("\n```\n")

New code, based on the one above

In [172]:
def pre_filter(
    lines_data: dict, timeline_data: dict, 
    drop_merges: bool = True, churn_size: Optional[int] = None
) -> dict:
    res = lines_data.copy()  # shallow copy is all we need
    return res

In [173]:
dd = {'a':1,'b':2,'c':3}
ee = dd.copy()
del ee['b']

print(f"{dd=}, {ee=}")

dd={'a': 1, 'b': 2, 'c': 3}, ee={'a': 1, 'c': 3}


# hvPlot / HoloViews help

In [174]:
hvplot.help('step')


The `step` plot connects the points with piece-wise constant curves.

The `step` plot can be used pretty much anytime the `line` plot might be used, and has many
of the same options available.

Reference: https://hvplot.holoviz.org/reference/tabular/step.html

Parameters
----------
x : string, optional
    Field name(s) to draw x-positions from. If not specified, the index is
    used. Must refer to continuous data. Not categorical data.
y : string or list, optional
    Field name(s) to draw y-positions from. If not specified, all numerical
    fields are used.
by : string, optional
    A single field or list of fields to group by. All the subgroups are visualized.
groupby: string, list, optional
    A single field or list of fields to group and filter by. Adds one or more widgets to
    select the subgroup(s) to visualize.
where: string, optional
    The interpolation method. One of 'mid', 'pre', 'post'. Default is 'mid'.
color : str or array-like, optional.
    The color for each of

In [175]:
hvplot.help('area')


The `area` plot can be used to color the area under a line or to color the space between two
lines.

Reference: https://hvplot.holoviz.org/reference/tabular/area.html

Parameters
----------
x : string, optional
    Field name(s) to draw x-positions from. If not specified, the index is
    used. Can refer to continuous and categorical data.
y : string, optional
    Field name to draw the first y-position from
y2 : string, optional
    Field name to draw the second y-position from
stacked : boolean, optional
    Whether to stack multiple areas. Default is False.
**kwds : optional
    Additional keywords arguments are documented in `hvplot.help('area')`.

Returns
-------
A Holoviews object. You can `print` the object to study its composition and run

.. code-block::

    import holoviews as hv
    hv.help(the_holoviews_object)

to learn more about its parameters and options.

Example
-------

.. code-block::

    import hvplot.pandas
    import pandas as pd

    df = pd.DataFrame(
      

In [176]:
help(hv.Layout)

Help on class Layout in module holoviews.core.layout:

class Layout(Layoutable, holoviews.core.dimension.ViewableTree)
 |  Layout(items=None, identifier=None, parent=None, **kwargs)
 |
 |      A Layout is an ViewableTree with ViewableElement objects as leaf
 |      values. Unlike ViewableTree, a Layout supports a rich display,
 |      displaying leaf items in a grid style layout. In addition to the
 |      usual ViewableTree indexing, Layout supports indexing of items by
 |      their row and column index in the layout.
 |
 |      The maximum number of columns in such a layout may be controlled
 |      with the cols method.
 |
 |  [1;32mParameters of 'Layout'
 |  [0m
 |  [1;31mParameters changed from their default values are marked in red.[0m
 |  [1;36mSoft bound values are marked in cyan.[0m
 |  C/V= Constant/Variable, RO/RW = ReadOnly/ReadWrite, AN=Allow None
 |
 |  [1;34mName     Value     Type     Bounds   Mode [0m
 |
 |  group   'Layout'  String             C RW
 |  label 

In [177]:
hv.help(hv.Area)

Area

Online example: https://holoviews.org/reference/elements/bokeh/Area.html

[1;35m-------------
Style Options
-------------[0m

	alpha, color, fill_alpha, fill_color, hover_alpha, hover_color, hover_fill_alpha, hover_fill_color, hover_line_alpha, hover_line_cap, hover_line_color, hover_line_dash, hover_line_dash_offset, hover_line_join, hover_line_width, line_alpha, line_cap, line_color, line_dash, line_dash_offset, line_join, line_width, muted, muted_alpha, muted_color, muted_fill_alpha, muted_fill_color, muted_line_alpha, muted_line_cap, muted_line_color, muted_line_dash, muted_line_dash_offset, muted_line_join, muted_line_width, nonselection_alpha, nonselection_color, nonselection_fill_alpha, nonselection_fill_color, nonselection_line_alpha, nonselection_line_cap, nonselection_line_color, nonselection_line_dash, nonselection_line_dash_offset, nonselection_line_join, nonselection_line_width, selection_alpha, selection_color, selection_fill_alpha, selection_fill_color, selection

In [178]:
hv.help(hv.Curve)

Curve

Online example: https://holoviews.org/reference/elements/bokeh/Curve.html

[1;35m-------------
Style Options
-------------[0m

	alpha, color, hover_alpha, hover_color, hover_line_alpha, hover_line_cap, hover_line_color, hover_line_dash, hover_line_dash_offset, hover_line_join, hover_line_width, line_alpha, line_cap, line_color, line_dash, line_dash_offset, line_join, line_width, muted, muted_alpha, muted_color, muted_line_alpha, muted_line_cap, muted_line_color, muted_line_dash, muted_line_dash_offset, muted_line_join, muted_line_width, nonselection_alpha, nonselection_color, nonselection_line_alpha, nonselection_line_cap, nonselection_line_color, nonselection_line_dash, nonselection_line_dash_offset, nonselection_line_join, nonselection_line_width, selection_alpha, selection_color, selection_line_alpha, selection_line_cap, selection_line_color, selection_line_dash, selection_line_dash_offset, selection_line_join, selection_line_width, visible

(Consult bokeh's documentation f

In [179]:
hv.help(hv.Sankey)

Sankey

Online example: https://holoviews.org/reference/elements/bokeh/Sankey.html

[1;35m-------------
Style Options
-------------[0m

	cmap, edge_alpha, edge_cmap, edge_color, edge_fill_alpha, edge_fill_color, edge_hover_alpha, edge_hover_color, edge_hover_fill_alpha, edge_hover_fill_color, edge_hover_line_alpha, edge_hover_line_cap, edge_hover_line_color, edge_hover_line_dash, edge_hover_line_dash_offset, edge_hover_line_join, edge_hover_line_width, edge_line_alpha, edge_line_cap, edge_line_color, edge_line_dash, edge_line_dash_offset, edge_line_join, edge_line_width, edge_muted, edge_muted_alpha, edge_muted_color, edge_muted_fill_alpha, edge_muted_fill_color, edge_muted_line_alpha, edge_muted_line_cap, edge_muted_line_color, edge_muted_line_dash, edge_muted_line_dash_offset, edge_muted_line_join, edge_muted_line_width, edge_nonselection_alpha, edge_nonselection_color, edge_nonselection_fill_alpha, edge_nonselection_fill_color, edge_nonselection_line_alpha, edge_nonselection_line_

In [180]:
hv.help(hv.Graph)

Graph

Online example: https://holoviews.org/reference/elements/bokeh/Graph.html

[1;35m-------------
Style Options
-------------[0m

	cmap, edge_alpha, edge_cmap, edge_color, edge_fill_alpha, edge_fill_color, edge_hover_alpha, edge_hover_color, edge_hover_fill_alpha, edge_hover_fill_color, edge_hover_line_alpha, edge_hover_line_cap, edge_hover_line_color, edge_hover_line_dash, edge_hover_line_dash_offset, edge_hover_line_join, edge_hover_line_width, edge_line_alpha, edge_line_cap, edge_line_color, edge_line_dash, edge_line_dash_offset, edge_line_join, edge_line_width, edge_muted, edge_muted_alpha, edge_muted_color, edge_muted_fill_alpha, edge_muted_fill_color, edge_muted_line_alpha, edge_muted_line_cap, edge_muted_line_color, edge_muted_line_dash, edge_muted_line_dash_offset, edge_muted_line_join, edge_muted_line_width, edge_nonselection_alpha, edge_nonselection_color, edge_nonselection_fill_alpha, edge_nonselection_fill_color, edge_nonselection_line_alpha, edge_nonselection_line_ca