In [7]:
import json
import os
import tracemalloc

from collections import defaultdict

from typing import Any, Dict, Optional
from datashaper.execution.execution_node import ExecutionNode
from datashaper.table_store import TableContainer

from typing import List

from datashaper.workflow import Workflow

In [8]:


class MemoryProfilingCallbacks:
    """Workflow callbacks that capture tracemalloc snapshots around each step.

    Snapshots are grouped by the verb name of the executing node. Every step
    appends one snapshot when it starts and one when it ends, in call order.
    """

    def __init__(self):
        # verb name -> list of tracemalloc snapshots, in the order they were taken
        self._snapshots = defaultdict(list)

    def on_workflow_start(self) -> None:
        """Start tracing memory allocations as the workflow begins."""
        tracemalloc.start()

    def on_step_start(self, node: ExecutionNode, inputs: Dict[str, Any]) -> None:
        """Record a snapshot for the node's verb as the step begins."""
        verb_snapshots = self._snapshots[node.verb.name]
        verb_snapshots.append(tracemalloc.take_snapshot())

    def on_step_end(self, node: ExecutionNode, result: Optional[TableContainer]) -> None:
        """Record a snapshot for the node's verb as the step finishes."""
        verb_snapshots = self._snapshots[node.verb.name]
        verb_snapshots.append(tracemalloc.take_snapshot())

    def on_workflow_end(self) -> None:
        """Stop tracing memory allocations once the workflow is done."""
        tracemalloc.stop()

In [11]:
# Directory searched recursively for fixture folders (those holding a "workflow.json").
FIXTURES_PATH = "../../../schema/fixtures/workflow"
# Handed to every Workflow as its `input_path`.
TABLE_STORE_PATH = "../../../schema/fixtures/workflow_inputs"

# One callbacks instance shared by all workflow runs, so snapshots from every fixture end up in it.
memory_profiling_callbacks = MemoryProfilingCallbacks()

def get_verb_test_specs(root: str) -> List[str]:
    """Find every folder under ``root`` that contains a ``workflow.json``.

    Args:
        root: Directory to search recursively.

    Returns:
        The matching folder paths (each starting with ``root``), sorted so
        fixtures are always processed in the same order regardless of how the
        filesystem enumerates directories. Empty when nothing matches,
        including when ``root`` does not exist.
    """
    # A distinct loop name is used on purpose: the previous version rebound
    # the `root` parameter inside the loop.
    return sorted(
        folder
        for folder, _, files in os.walk(root)
        if "workflow.json" in files
    )


def test_verbs_schema_input(fixture_path: str):
    """Run one fixture workflow under the shared memory-profiling callbacks.

    Args:
        fixture_path: Folder containing the ``workflow.json`` to execute.
    """
    # JSON is UTF-8 by specification; state it explicitly so the platform's
    # locale-dependent default encoding is never used to decode the fixture.
    with open(os.path.join(fixture_path, "workflow.json"), encoding="utf-8") as schema:
        workflow = Workflow(
            schema=json.load(schema),
            input_path=TABLE_STORE_PATH,
        )

    workflow.run(workflow_callbacks=memory_profiling_callbacks)

In [12]:
# Execute every discovered fixture workflow with the shared memory-profiling callbacks.
# NOTE: re-running this cell appends more snapshots to the same
# `memory_profiling_callbacks` instance instead of starting from scratch.
for fixture_path in get_verb_test_specs(FIXTURES_PATH):
    test_verbs_schema_input(fixture_path)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  input_table[col].loc[i] = nan
  output[to] = output[column].str.replace(


In [20]:
# Per-verb summary numbers, keyed by verb name.
stats = dict()


def bytes_to_kb(bytes):
    """Convert a size in bytes to kilobytes (1 KB = 1024 bytes)."""
    kilobytes = bytes / 1024
    return kilobytes

# Each verb's snapshot list alternates [start, end, start, end, ...]; pair them
# up and sum the per-line size differences to get one memory delta (in KB) per
# executed step.
for verb, snapshots in memory_profiling_callbacks._snapshots.items():
    verb_stats = [
        bytes_to_kb(sum(stat.size_diff for stat in end.compare_to(start, 'lineno')))
        for start, end in zip(snapshots[::2], snapshots[1::2])
    ]
    if not verb_stats:
        # A verb with no completed start/end pair has nothing to summarise;
        # skip it instead of failing with ZeroDivisionError on the mean.
        continue
    stats[verb] = {
        'mean': sum(verb_stats) / len(verb_stats),
        'max': max(verb_stats),
        'min': min(verb_stats),
        'samples': len(verb_stats)
    }




In [21]:
import pandas as pd

# One row per verb, ordered from the largest mean memory delta to the smallest.
pd.DataFrame(stats).T.sort_values(by='mean', ascending=False)

Unnamed: 0,mean,max,min,samples
join,16.065569,18.508789,11.40625,7.0
onehot,12.90918,13.719727,11.729492,4.0
difference,12.75,12.75,12.75,1.0
binarize,12.304269,28.916016,8.083984,7.0
lookup,12.083984,12.083984,12.083984,1.0
intersect,11.814453,11.814453,11.814453,1.0
spread,10.981689,16.000977,8.673828,4.0
unhot,10.902832,12.06543,9.740234,2.0
unfold,10.43335,11.480469,9.555664,4.0
pivot,9.698405,11.944336,8.732422,6.0
