## Example for callbacks + Memory profiling of verbs and workflow

In this example we show how to use the workflow callbacks to be able profile the memory usage of every verb

In [1]:
import json
import os

from typing import Any, Dict, Optional, List
from datashaper.execution.execution_node import ExecutionNode
from datashaper.table_store import TableContainer

from typing import List

from datashaper.workflow import Workflow

### Create a Callback class
This class needs to follows the WorkflowCallbacks Protocol, we use tracemalloc to create memory snapshots and trace the memory

In [2]:
import pandas as pd
import tracemalloc
import time
from collections import defaultdict


class MemoryProfilingCallbacks:
    def __init__(self):
        self._snapshots = defaultdict(list)
        self._peak_memory = defaultdict(list)
        self._timing = defaultdict(list)
        self._peak_memory = defaultdict(list)
        self._peak_start_workflow = 0
        self._peak_start_verb = 0
        self._workflow_start = 0
        self._verb_start = 0

    def on_workflow_start(self) -> None:
        """Called when the workflow starts."""
        tracemalloc.start()
        _, self._peak_start_workflow = tracemalloc.get_traced_memory()
        self._snapshots["all"].append(tracemalloc.take_snapshot())
        self._workflow_start = time.time()

    def on_step_start(self, node: ExecutionNode, inputs: Dict[str, Any]) -> None:
        """Called when a step starts."""
        # reset peak so we can get the peak during the verb execution
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        _, self._peak_start_verb = tracemalloc.get_traced_memory()
        self._verb_start = time.time()

    def on_step_end(
        self, node: ExecutionNode, result: Optional[TableContainer]
    ) -> None:
        """Called when a step ends."""
        total_time = time.time() - self._verb_start
        self._timing[node.verb.name].append(total_time)
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        # Get peak recorded during verb execution
        _, peak = tracemalloc.get_traced_memory()
        self._peak_memory[node.verb.name].append(peak - self._peak_start_verb)

    def on_workflow_end(self) -> None:
        """Called when the workflow ends."""
        total_time = time.time() - self._workflow_start
        self._timing["all"].append(total_time)
        self._snapshots["all"].append(tracemalloc.take_snapshot())
        _, peak = tracemalloc.get_traced_memory()
        self._peak_memory["all"].append(peak - self._peak_start_workflow)
        tracemalloc.stop()

    def get_snapshot_stats(self, sort_by="max"):
        stats = {}
        for verb, snapshots in self._snapshots.items():
            verb_stats = []
            for first, second in zip(snapshots[::2], snapshots[1::2]):
                stat_diff = second.compare_to(first, "lineno")
                diff_size = sum(stat.size_diff for stat in stat_diff)
                verb_stats.append(MemoryProfilingCallbacks.__bytes_to_mb(diff_size))
            stats[verb] = {
                "mean": sum(verb_stats) / len(verb_stats),
                "max": max(verb_stats),
                "min": min(verb_stats),
                "samples": len(verb_stats),
            }
        return pd.DataFrame(stats).transpose().sort_values(sort_by, ascending=False)

    def get_peak_stats(self, sort_by="max"):
        stats = {}
        for verb, peak in self._peak_memory.items():
            stats[verb] = {
                "mean": MemoryProfilingCallbacks.__bytes_to_mb(sum(peak) / len(peak)),
                "max": MemoryProfilingCallbacks.__bytes_to_mb(max(peak)),
                "min": MemoryProfilingCallbacks.__bytes_to_mb(min(peak)),
                "samples": len(peak),
            }
        return pd.DataFrame(stats).transpose().sort_values(sort_by, ascending=False)

    def get_time_stats(self, sort_by="max"):
        stats = {}
        for verb, times in self._timing.items():
            stats[verb] = {
                "mean": sum(times) / len(times),
                "max": max(times),
                "min": min(times),
                "samples": len(times),
            }
        return pd.DataFrame(stats).transpose().sort_values(sort_by, ascending=False)

    def get_detailed_view(self, verb: str, filter_package: Optional[str] = None):
        snapshots = self._snapshots[verb]
        diffs = []
        for first, second in zip(snapshots[::2], snapshots[1::2]):
            stat_diff = second.compare_to(first, "lineno")
            if filter_package is not None:
                stat_diff = [
                    stat
                    for stat in stat_diff
                    if filter_package in stat.traceback[0].filename
                ]
            diffs.append(stat_diff)
        return diffs

    @staticmethod
    def __bytes_to_mb(bytes):
        return bytes / 1024**2

In [3]:
FIXTURES_PATH = "../../../schema/fixtures/workflow"
TABLE_STORE_PATH = "../../../schema/fixtures/workflow_inputs"

memory_profiling_callbacks = MemoryProfilingCallbacks()


def get_verb_test_specs(root: str) -> List[str]:
    subfolders: List[str] = []
    for root, _, files in os.walk(root):
        if "workflow.json" in files:
            subfolders.append(root)
    return subfolders


def test_verbs_schema_input(fixture_path: str):
    with open(os.path.join(fixture_path, "workflow.json")) as schema:
        workflow = Workflow(
            schema=json.load(schema),
            input_path=TABLE_STORE_PATH,
        )

    workflow.run(workflow_callbacks=memory_profiling_callbacks)

In [4]:
for fixture_path in get_verb_test_specs(FIXTURES_PATH):
    test_verbs_schema_input(fixture_path)

Test print verb
    ID       Name  Employees     US
0    1  Microsoft     160000   True
1    2      Apple     150000   True
..  ..        ...        ...    ...
3    4     Amazon    1250000   True
4    5    Samsung     270000  False


In [5]:
memory_profiling_callbacks.get_snapshot_stats()

Unnamed: 0,mean,max,min,samples
all,0.015459,0.200583,0.004775,148.0
fold,0.027896,0.114183,0.008444,6.0
print,0.086039,0.086039,0.086039,1.0
unfold,0.026499,0.082424,0.007277,4.0
union,0.040701,0.074374,0.007029,2.0
difference,0.04226,0.04226,0.04226,1.0
binarize,0.015732,0.038435,0.007998,7.0
unroll,0.035092,0.035092,0.035092,1.0
convert,0.010555,0.034208,0.004603,15.0
merge,0.009023,0.031582,0.006018,15.0


In [6]:
memory_profiling_callbacks.get_peak_stats()

Unnamed: 0,mean,max,min,samples
all,0.022463,0.204617,0.005412,148.0
fold,0.040979,0.123507,0.020229,6.0
print,0.091555,0.091555,0.091555,1.0
unfold,0.023233,0.077254,0.001755,4.0
union,0.047049,0.076098,0.018,2.0
difference,0.052176,0.052176,0.052176,1.0
join,0.037613,0.051956,0.02814,7.0
binarize,0.026225,0.044988,0.019181,7.0
unroll,0.04342,0.04342,0.04342,1.0
onehot,0.02525,0.039433,0.016118,4.0


In [7]:
memory_profiling_callbacks.get_time_stats()

Unnamed: 0,mean,max,min,samples
all,0.00877,0.048321,0.000532,148.0
binarize,0.037021,0.047152,0.029247,7.0
merge,0.010196,0.027203,0.004643,15.0
join,0.018361,0.021782,0.012813,7.0
unhot,0.017313,0.01982,0.014806,2.0
fold,0.014956,0.019343,0.010067,6.0
difference,0.018826,0.018826,0.018826,1.0
filter,0.014073,0.016299,0.011562,7.0
intersect,0.015411,0.015411,0.015411,1.0
unfold,0.011909,0.014956,0.008003,4.0


In [8]:
memory_profiling_callbacks.get_detailed_view("difference", filter_package="datashaper")

[[<StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/difference.py' lineno=21>,)> size=680 (+680) count=3 (+3)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/difference.py' lineno=19>,)> size=608 (+608) count=1 (+1)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/difference.py' lineno=23>,)> size=96 (+96) count=2 (+2)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=359>,)> size=48 (+48) count=1 (+1)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=280>,)> size=216 (+0) count=2 (+0)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/an

In [9]:
memory_profiling_callbacks.get_detailed_view("difference")

[[<StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.cache/pypoetry/virtualenvs/examples-gtU7jnqy-py3.10/lib/python3.10/site-packages/pandas/core/construction.py' lineno=493>,)> size=3306 (+3306) count=2 (+2)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.pyenv/versions/3.10.11/lib/python3.10/abc.py' lineno=123>,)> size=1406 (+1406) count=11 (+11)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.cache/pypoetry/virtualenvs/examples-gtU7jnqy-py3.10/lib/python3.10/site-packages/pandas/core/ops/common.py' lineno=72>,)> size=1360 (+1360) count=3 (+3)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/.cache/pypoetry/virtualenvs/examples-gtU7jnqy-py3.10/lib/python3.10/site-packages/pandas/core/internals/blocks.py' lineno=457>,)> size=1112 (+1112) count=1 (+1)>,
  <StatisticDiff traceback=<Traceback (<Frame filename='<__array_function__ internals>' lineno=180>,)> size=1064 (+1064) count=7 (+7)>,
 