## Example for callbacks + Memory profiling of verbs and workflow

In this example we show how to use the workflow callbacks to be able profile the memory usage of every verb

In [1]:
import json
import os

from typing import Any, Dict, Optional, List
from datashaper.execution.execution_node import ExecutionNode
from datashaper.table_store import TableContainer

from typing import List

from datashaper.workflow import Workflow

### Create a Callback class
This class needs to follows the WorkflowCallbacks Protocol, we use tracemalloc to create memory snapshots and trace the memory

In [2]:
import pandas as pd
import tracemalloc
from collections import defaultdict


class MemoryProfilingCallbacks:
    def __init__(self):
        self._snapshots = defaultdict(list)
        self._peak_memory = defaultdict(list)
        self._peak_start_workflow = 0
        self._peak_start_verb = 0

    def on_workflow_start(self) -> None:
        """Called when the workflow starts."""
        tracemalloc.start()
        _, self._peak_start_workflow = tracemalloc.get_traced_memory()
        self._snapshots["all"].append(tracemalloc.take_snapshot())

    def on_step_start(self, node: ExecutionNode, inputs: Dict[str, Any]) -> None:
        """Called when a step starts."""
        # reset peak so we can get the peak during the verb execution
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        _, self._peak_start_verb = tracemalloc.get_traced_memory()

    def on_step_end(
        self, node: ExecutionNode, result: Optional[TableContainer]
    ) -> None:
        """Called when a step ends."""
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        # Get peak recorded during verb execution
        _, peak = tracemalloc.get_traced_memory()
        self._peak_memory[node.verb.name].append(peak - self._peak_start_verb)

    def on_workflow_end(self) -> None:
        """Called when the workflow ends."""
        self._snapshots["all"].append(tracemalloc.take_snapshot())
        _, peak = tracemalloc.get_traced_memory()
        self._peak_memory["all"].append(peak - self._peak_start_workflow)
        tracemalloc.stop()

    def get_snapshot_stats(self, sort_by="max"):
        stats = {}
        for verb, snapshots in memory_profiling_callbacks._snapshots.items():
            verb_stats = []
            for first, second in zip(snapshots[::2], snapshots[1::2]):
                stat_diff = second.compare_to(first, "lineno")
                diff_size = sum(stat.size_diff for stat in stat_diff)
                verb_stats.append(MemoryProfilingCallbacks.__bytes_to_mb(diff_size))
            stats[verb] = {
                "mean": sum(verb_stats) / len(verb_stats),
                "max": max(verb_stats),
                "min": min(verb_stats),
                "samples": len(verb_stats),
            }
        return pd.DataFrame(stats).transpose().sort_values(sort_by, ascending=False)

    def get_peak_stats(self, sort_by="max"):
        stats = {}
        for verb, peak in memory_profiling_callbacks._peak_memory.items():
            stats[verb] = {
                "mean": MemoryProfilingCallbacks.__bytes_to_mb(sum(peak) / len(peak)),
                "max": MemoryProfilingCallbacks.__bytes_to_mb(max(peak)),
                "min": MemoryProfilingCallbacks.__bytes_to_mb(min(peak)),
                "samples": len(peak),
            }
        return pd.DataFrame(stats).transpose().sort_values(sort_by, ascending=False)

    def get_detailed_view(self, verb: str, filter_package: Optional[str] = None):
        snapshots = self._snapshots[verb]
        diffs = []
        for first, second in zip(snapshots[::2], snapshots[1::2]):
            stat_diff = second.compare_to(first, "lineno")
            if filter_package is not None:
                stat_diff = [
                    stat
                    for stat in stat_diff
                    if filter_package in stat.traceback[0].filename
                ]
            diffs.append(stat_diff)
        return diffs

    @staticmethod
    def __bytes_to_mb(bytes):
        return bytes / 1024**2

In [3]:
FIXTURES_PATH = "../../../schema/fixtures/workflow"
TABLE_STORE_PATH = "../../../schema/fixtures/workflow_inputs"

memory_profiling_callbacks = MemoryProfilingCallbacks()


def get_verb_test_specs(root: str) -> List[str]:
    subfolders: List[str] = []
    for root, _, files in os.walk(root):
        if "workflow.json" in files:
            subfolders.append(root)
    return subfolders


def test_verbs_schema_input(fixture_path: str):
    with open(os.path.join(fixture_path, "workflow.json")) as schema:
        workflow = Workflow(
            schema=json.load(schema),
            input_path=TABLE_STORE_PATH,
        )

    workflow.run(workflow_callbacks=memory_profiling_callbacks)

In [4]:
for fixture_path in get_verb_test_specs(FIXTURES_PATH):
    test_verbs_schema_input(fixture_path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
  output[to] = output[column].str.replace(


In [5]:
memory_profiling_callbacks.get_snapshot_stats()

Unnamed: 0,mean,max,min,samples
all,0.015058,0.202339,0.004691,147.0
fold,0.027312,0.112832,0.008223,6.0
unfold,0.028426,0.08537,0.00857,4.0
union,0.040619,0.07421,0.007029,2.0
difference,0.043998,0.043998,0.043998,1.0
binarize,0.013999,0.037935,0.008766,7.0
onehot,0.020074,0.03739,0.013704,4.0
rollup,0.016019,0.036043,0.004776,3.0
merge,0.009467,0.035978,0.005861,15.0
unroll,0.034538,0.034538,0.034538,1.0


In [6]:
memory_profiling_callbacks.get_peak_stats()

Unnamed: 0,mean,max,min,samples
all,0.022202,0.207294,0.005412,147.0
fold,0.040604,0.122186,0.019953,6.0
unfold,0.024183,0.081252,0.001883,4.0
union,0.047059,0.076118,0.018,2.0
difference,0.053494,0.053494,0.053494,1.0
join,0.037332,0.051693,0.028029,7.0
onehot,0.027752,0.046329,0.018909,4.0
binarize,0.024636,0.044332,0.019788,7.0
merge,0.016154,0.043718,0.011037,15.0
unroll,0.04295,0.04295,0.04295,1.0


In [7]:
memory_profiling_callbacks.get_detailed_view("difference", filter_package="datashaper")

[[<StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/difference.py' lineno=21>,)> size=680 (+680) count=3 (+3)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/difference.py' lineno=19>,)> size=608 (+608) count=1 (+1)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/difference.py' lineno=23>,)> size=96 (+96) count=2 (+2)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=362>,)> size=48 (+48) count=1 (+1)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=280>,)> size=216 (+0) count=2 (+0)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/an

In [8]:
memory_profiling_callbacks.get_detailed_view("difference")

[[<StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.cache/pypoetry/virtualenvs/examples-gtU7jnqy-py3.10/lib/python3.10/site-packages/pandas/core/reshape/merge.py' lineno=842>,)> size=4096 (+4096) count=1 (+1)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.pyenv/versions/3.10.11/lib/python3.10/abc.py' lineno=123>,)> size=1406 (+1406) count=11 (+11)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.cache/pypoetry/virtualenvs/examples-gtU7jnqy-py3.10/lib/python3.10/site-packages/pandas/core/ops/common.py' lineno=72>,)> size=1360 (+1360) count=3 (+3)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.cache/pypoetry/virtualenvs/examples-gtU7jnqy-py3.10/lib/python3.10/site-packages/pandas/core/common.py' lineno=235>,)> size=1115 (+1115) count=2 (+2)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.cache/pypoetry/virtualenvs/examples-gtU7jnqy-py3.10/lib/python3.10/site-p