In [1]:
import json
import os
import tracemalloc

from collections import defaultdict

from typing import Any, Dict, Optional
from datashaper.execution.execution_node import ExecutionNode
from datashaper.table_store import TableContainer

from typing import List

from datashaper.workflow import Workflow

In [2]:
class MemoryProfilingCallbacks:

    def __init__(self):
        self._snapshots = defaultdict(list)
        self._peak_memory = defaultdict(list)

    def on_workflow_start(self) -> None:
        """Called when the workflow starts."""
        tracemalloc.start()
        self._snapshots['all'].append(tracemalloc.take_snapshot())

    def on_step_start(self, node: ExecutionNode, inputs: Dict[str, Any]) -> None:
        """Called when a step starts."""
        # reset peak so we can get the peak during the verb execution
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        tracemalloc.reset_peak()


    def on_step_end(self, node: ExecutionNode, result: Optional[TableContainer]) -> None:
        """Called when a step ends."""
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        # Get peak recorded during verb execution
        _, peak = tracemalloc.get_traced_memory()
        self._peak_memory[node.verb.name].append(peak)

    def on_workflow_end(self) -> None:
        """Called when the workflow ends."""
        self._snapshots['all'].append(tracemalloc.take_snapshot())
        tracemalloc.stop()

In [3]:
FIXTURES_PATH = "../../../schema/fixtures/workflow"
TABLE_STORE_PATH = "../../../schema/fixtures/workflow_inputs"

memory_profiling_callbacks = MemoryProfilingCallbacks()

def get_verb_test_specs(root: str) -> List[str]:
    subfolders: List[str] = []
    for root, _, files in os.walk(root):
        if "workflow.json" in files:
            subfolders.append(root)
    return subfolders


def test_verbs_schema_input(fixture_path: str):
    with open(os.path.join(fixture_path, "workflow.json")) as schema:
        workflow = Workflow(
            schema=json.load(schema),
            input_path=TABLE_STORE_PATH,
            
        )

    workflow.run(workflow_callbacks=memory_profiling_callbacks)

In [4]:
for _ in range(1):
    for fixture_path in get_verb_test_specs(FIXTURES_PATH):
        test_verbs_schema_input(fixture_path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
  output[to] = output[column].str.replace(


In [5]:
stats = {}
stats_peak = {}

# function to turn bytes to kb
def bytes_to_kb(bytes):
    return bytes / 1024

for verb, snapshots in memory_profiling_callbacks._snapshots.items():
    verb_stats = []
    for first, second in zip(snapshots[::2], snapshots[1::2]):
        stat_diff = second.compare_to(first, 'lineno')
        diff_size = sum(stat.size_diff for stat in stat_diff)
        verb_stats.append(bytes_to_kb(diff_size)) # Get total size difference between start and end of verb execution for each sample
    stats[verb] = {
        'mean': sum(verb_stats) / len(verb_stats),
        'max': max(verb_stats),
        'min': min(verb_stats),
        'samples': len(verb_stats)
    }

for verb, traced_peak in memory_profiling_callbacks._peak_memory.items():
    stats_peak[verb] = {
        'mean': bytes_to_kb(sum(traced_peak) / len(traced_peak)),
        'max': bytes_to_kb(max(traced_peak)),
        'min': bytes_to_kb(min(traced_peak)),
        'samples': len(traced_peak)
    }


In [6]:
import pandas as pd

pd.DataFrame(stats).transpose().sort_values('max', ascending=False)

Unnamed: 0,mean,max,min,samples
all,15.412289,206.954102,4.776367,147.0
fold,28.312663,116.320312,8.620117,6.0
unfold,29.161865,86.510742,9.392578,4.0
union,42.348145,77.469727,7.226562,2.0
difference,41.202148,41.202148,41.202148,1.0
binarize,15.987305,39.214844,8.498047,7.0
merge,9.516667,38.679688,6.054688,15.0
onehot,20.429932,38.091797,14.005859,4.0
rollup,16.189779,36.323242,4.863281,3.0
unroll,35.453125,35.453125,35.453125,1.0


In [7]:
import pandas as pd

pd.DataFrame(stats_peak).transpose().sort_values('max', ascending=False)

Unnamed: 0,mean,max,min,samples
unfold,78.739502,212.139648,32.905273,4.0
fold,45.479655,129.84668,23.780273,6.0
union,60.44043,100.30957,20.571289,2.0
select,37.234375,81.464844,9.517578,3.0
join,40.694336,55.305664,30.948242,7.0
difference,53.041992,53.041992,53.041992,1.0
fill,22.699056,51.317383,8.290039,6.0
onehot,32.22583,51.208008,23.089844,4.0
merge,21.520703,51.024414,15.541992,15.0
binarize,30.922433,49.774414,24.467773,7.0


In [8]:
[stat for stat in memory_profiling_callbacks._snapshots['unfold'][1].statistics('lineno') if stat.traceback[0].filename.endswith('unfold.py')]

[<Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=27>,)> size=1208 count=3>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=20>,)> size=832 count=2>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=26>,)> size=576 count=3>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=25>,)> size=464 count=1>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=18>,)> size=408 count=1>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfol

In [9]:
[stat for stat in memory_profiling_callbacks._snapshots['unfold'][1].compare_to(memory_profiling_callbacks._snapshots['unfold'][0], 'lineno') if stat.traceback[0].filename.endswith('unfold.py')]

[<StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=27>,)> size=1208 (+1208) count=3 (+3)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=20>,)> size=832 (+832) count=2 (+2)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=26>,)> size=576 (+576) count=3 (+3)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=25>,)> size=464 (+464) count=1 (+1)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/unfold.py' lineno=18>,)> size=408 (+408) count=1 (+1)>,
 <StatisticDiff traceback=<Traceback (<Frame filename

In [10]:
sum(stat.size for stat  in memory_profiling_callbacks._snapshots['unfold'][1].statistics('lineno'))

211261