In [1]:
import json
import os
import tracemalloc

from collections import defaultdict

from typing import Any, Dict, Optional
from datashaper.execution.execution_node import ExecutionNode
from datashaper.table_store import TableContainer

from typing import List

from datashaper.workflow import Workflow

In [2]:
class MemoryProfilingCallbacks:

    def __init__(self):
        self._snapshots = defaultdict(list)
        self._peak_memory = defaultdict(list)

    def on_workflow_start(self) -> None:
        """Called when the workflow starts."""
        tracemalloc.start()
    
    def on_step_start(self, node: ExecutionNode, inputs: Dict[str, Any]) -> None:
        """Called when a step starts."""
        # reset peak so we can get the peak during the verb execution
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        tracemalloc.reset_peak()
        

    def on_step_end(self, node: ExecutionNode, result: Optional[TableContainer]) -> None:
        """Called when a step ends."""
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        # Get peak recorded during verb execution
        _, peak = tracemalloc.get_traced_memory()
        self._peak_memory[node.verb.name].append(peak)

    def on_workflow_end(self) -> None:
        """Called when the workflow ends."""
        tracemalloc.stop()

In [3]:
FIXTURES_PATH = "../../../schema/fixtures/workflow"
TABLE_STORE_PATH = "../../../schema/fixtures/workflow_inputs"

memory_profiling_callbacks = MemoryProfilingCallbacks()

def get_verb_test_specs(root: str) -> List[str]:
    subfolders: List[str] = []
    for root, _, files in os.walk(root):
        if "workflow.json" in files:
            subfolders.append(root)
    return subfolders


def test_verbs_schema_input(fixture_path: str):
    with open(os.path.join(fixture_path, "workflow.json")) as schema:
        workflow = Workflow(
            schema=json.load(schema),
            input_path=TABLE_STORE_PATH,
            
        )

    workflow.run(workflow_callbacks=memory_profiling_callbacks)

In [4]:
for fixture_path in get_verb_test_specs(FIXTURES_PATH):
    test_verbs_schema_input(fixture_path)

  output = output[other_columns].groupby(level=0).agg("first").append(output_temp)
  output = output[other_columns].groupby(level=0).agg("first").append(output_temp)
  output = output[other_columns].groupby(level=0).agg("first").append(output_temp)
  output = output[other_columns].groupby(level=0).agg("first").append(output_temp)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

In [5]:
stats = {}
stats_peak = {}

# function to turn bytes to kb
def bytes_to_kb(bytes):
    return bytes / 1024

for verb, snapshots in memory_profiling_callbacks._snapshots.items():
    verb_stats = []
    for first, second in zip(snapshots[::2], snapshots[1::2]):
        stat_diff = second.compare_to(first, 'lineno')
        diff_size = sum(stat.size_diff for stat in stat_diff)
        verb_stats.append(bytes_to_kb(diff_size)) # Get total size difference between start and end of verb execution for each sample
    stats[verb] = {
        'mean': sum(verb_stats) / len(verb_stats),
        'max': max(verb_stats),
        'min': min(verb_stats),
        'samples': len(verb_stats)
    }

for verb, traced_peak in memory_profiling_callbacks._peak_memory.items():
    stats_peak[verb] = {
        'mean': bytes_to_kb(sum(traced_peak) / len(traced_peak)),
        'max': bytes_to_kb(max(traced_peak)),
        'min': bytes_to_kb(min(traced_peak)),
        'samples': len(traced_peak)
    }


In [6]:
import pandas as pd

pd.DataFrame(stats).transpose().sort_values('mean', ascending=False)

Unnamed: 0,mean,max,min,samples
union,41.997559,77.245117,6.75,2.0
difference,41.474609,41.474609,41.474609,1.0
unroll,36.453125,36.453125,36.453125,1.0
unfold,33.547852,99.141602,11.62793,4.0
fold,28.296875,116.15625,8.453125,6.0
window,25.96875,25.96875,25.96875,1.0
onehot,20.564941,41.165039,13.081055,4.0
join,18.368722,30.152344,11.523438,7.0
rollup,15.885417,35.660156,4.707031,3.0
binarize,14.322266,39.216797,8.24707,7.0


In [7]:
import pandas as pd

pd.DataFrame(stats_peak).transpose().sort_values('mean', ascending=False)

Unnamed: 0,mean,max,min,samples
unfold,87.159424,233.588867,32.693359,4.0
union,59.94043,99.958008,19.922852,2.0
difference,53.662109,53.662109,53.662109,1.0
unroll,46.904297,46.904297,46.904297,1.0
fold,45.19987,129.15918,23.498047,6.0
join,40.118862,53.961914,30.495117,7.0
select,36.973307,81.128906,9.253906,3.0
onehot,32.208496,54.164062,21.797852,4.0
window,32.020508,32.020508,32.020508,1.0
intersect,31.255859,31.255859,31.255859,1.0
