In [1]:
import json
import os
import tracemalloc

from collections import defaultdict

from typing import Any, Dict, Optional
from datashaper.execution.execution_node import ExecutionNode
from datashaper.table_store import TableContainer

from typing import List

from datashaper.workflow import Workflow

In [2]:
class MemoryProfilingCallbacks:

    def __init__(self):
        self._snapshots = defaultdict(list)
        self._peak_memory = defaultdict(list)
        self._peak_start_workflow = 0
        self._peak_start_verb = 0

    def on_workflow_start(self) -> None:
        """Called when the workflow starts."""
        tracemalloc.start()
        _, self._peak_start_workflow = tracemalloc.get_traced_memory()
        self._snapshots['all'].append(tracemalloc.take_snapshot())
        

    def on_step_start(self, node: ExecutionNode, inputs: Dict[str, Any]) -> None:
        """Called when a step starts."""
        # reset peak so we can get the peak during the verb execution
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        _, self._peak_start_verb = tracemalloc.get_traced_memory()


    def on_step_end(self, node: ExecutionNode, result: Optional[TableContainer]) -> None:
        """Called when a step ends."""
        self._snapshots[node.verb.name].append(tracemalloc.take_snapshot())
        # Get peak recorded during verb execution
        _, peak = tracemalloc.get_traced_memory()
        self._peak_memory[node.verb.name].append(peak - self._peak_start_verb)

    def on_workflow_end(self) -> None:
        """Called when the workflow ends."""
        self._snapshots['all'].append(tracemalloc.take_snapshot())
        _, peak = tracemalloc.get_traced_memory()
        self._peak_memory['all'].append(peak - self._peak_start_workflow)
        tracemalloc.stop()

In [3]:
FIXTURES_PATH = "../../../schema/fixtures/workflow"
TABLE_STORE_PATH = "../../../schema/fixtures/workflow_inputs"

memory_profiling_callbacks = MemoryProfilingCallbacks()

def get_verb_test_specs(root: str) -> List[str]:
    subfolders: List[str] = []
    for root, _, files in os.walk(root):
        if "workflow.json" in files:
            subfolders.append(root)
    return subfolders


def test_verbs_schema_input(fixture_path: str):
    with open(os.path.join(fixture_path, "workflow.json")) as schema:
        workflow = Workflow(
            schema=json.load(schema),
            input_path=TABLE_STORE_PATH,
        )

    workflow.run(workflow_callbacks=memory_profiling_callbacks)

In [4]:
for _ in range(10):
    for fixture_path in get_verb_test_specs(FIXTURES_PATH):
        test_verbs_schema_input(fixture_path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
  output[to] = output[column].str.replace(
A val

In [5]:
stats = {}
stats_peak = {}

# function to turn bytes to kb
def bytes_to_mb(bytes):
    return bytes / 1024**2

for verb, snapshots in memory_profiling_callbacks._snapshots.items():
    verb_stats = []
    for first, second in zip(snapshots[::2], snapshots[1::2]):
        stat_diff = second.compare_to(first, 'lineno')
        diff_size = sum(stat.size_diff for stat in stat_diff)
        verb_stats.append(bytes_to_mb(diff_size)) # Get total size difference between start and end of verb execution for each sample
    stats[verb] = {
        'mean': sum(verb_stats) / len(verb_stats),
        'max': max(verb_stats),
        'min': min(verb_stats),
        'samples': len(verb_stats)
    }

for verb, traced_peak in memory_profiling_callbacks._peak_memory.items():
    stats_peak[verb] = {
        'mean': bytes_to_mb(sum(traced_peak) / len(traced_peak)),
        'max': bytes_to_mb(max(traced_peak)),
        'min': bytes_to_mb(min(traced_peak)),
        'samples': len(traced_peak)
    }


In [6]:
import pandas as pd

pd.DataFrame(stats).transpose().sort_values('max', ascending=False)

Unnamed: 0,mean,max,min,samples
all,0.010601,0.20384,0.004485,1470.0
fold,0.011662,0.114566,0.007505,60.0
unfold,0.012488,0.085137,0.00929,40.0
union,0.012218,0.073369,0.00717,20.0
difference,0.014985,0.040193,0.011419,10.0
binarize,0.009956,0.037193,0.007677,70.0
merge,0.007571,0.036962,0.005774,150.0
onehot,0.012638,0.036368,0.011093,40.0
rollup,0.005717,0.035499,0.003757,30.0
unroll,0.010438,0.034925,0.007259,10.0


In [7]:
import pandas as pd

pd.DataFrame(stats_peak).transpose().sort_values('max', ascending=False)

Unnamed: 0,mean,max,min,samples
all,0.018248,0.213573,0.00529,1470.0
fold,0.024862,0.123919,0.019412,60.0
unfold,0.009435,0.085798,0.002213,40.0
union,0.01705,0.075488,0.01343,20.0
join,0.036166,0.052702,0.027919,70.0
difference,0.030631,0.049689,0.027846,10.0
onehot,0.022828,0.045828,0.021439,40.0
merge,0.015154,0.04507,0.010926,150.0
binarize,0.022384,0.043723,0.019085,70.0
unroll,0.020483,0.043337,0.017774,10.0


In [8]:
[stat for stat in memory_profiling_callbacks._snapshots['aggregate'][3].statistics('lineno') if 'datashaper' in stat.traceback[0].filename]

[<Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=43>,)> size=144 count=1>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/aggregate.py' lineno=27>,)> size=96 count=2>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=354>,)> size=69 count=1>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=353>,)> size=48 count=1>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=318>,)> size=48 count=1>,
 <Statistic traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=272>,)> size=48 count=1>,
 <Statistic tra

In [9]:
[stat for stat in memory_profiling_callbacks._snapshots['union'][1].compare_to(memory_profiling_callbacks._snapshots['union'][0], 'lineno') if 'datashaper' in stat.traceback[0].filename]

[<StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/verb_input.py' lineno=39>,)> size=416 (+416) count=1 (+1)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/union.py' lineno=16>,)> size=400 (+400) count=1 (+1)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=318>,)> size=112 (+112) count=2 (+2)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/union.py' lineno=17>,)> size=112 (+112) count=2 (+2)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/union.py' lineno=19>,)> size=96 (+96) count=2 (+2)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andre

In [10]:
sum(stat.size for stat  in memory_profiling_callbacks._snapshots['unfold'][1].statistics('lineno'))

213082

In [11]:
[stat for stat in memory_profiling_callbacks._snapshots['all'][3].compare_to(memory_profiling_callbacks._snapshots['all'][2], 'lineno') if 'datashaper' in stat.traceback[0].filename]

[<StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=374>,)> size=728 (+728) count=3 (+3)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/verb_input.py' lineno=39>,)> size=416 (+416) count=1 (+1)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/engine/verbs/union.py' lineno=16>,)> size=400 (+400) count=1 (+1)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=280>,)> size=280 (+280) count=3 (+3)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/datashaper/python/datashaper/datashaper/workflow.py' lineno=303>,)> size=168 (+168) count=1 (+1)>,
 <StatisticDiff traceback=<Traceback (<Frame filename='/home/andresmor/Projects/da