# Sample workflow for VAST MC3 data

In [1]:
import pandas as pd
from datashaper import Workflow, WorkflowCallbacks

from examples.verbs.custom_verbs import embed_mock as embed, genid

## Load data

In [2]:
# Base URL of the GitHub repository that hosts the VAST MC3 CSV extracts.
gh_url = "https://raw.githubusercontent.com/darthtrevino/vast-mc3-data/main"
# Read the timestamp column as text; the workflow converts it to a date later.
dtype = {"date(yyyyMMddHHmmss)": "string"}
# The three CSV files, listed in the order in which they are read.
csv_urls = [
    f"{gh_url}/csv-1700-1830.csv",
    f"{gh_url}/csv-1831-2000.csv",
    f"{gh_url}/csv-2001-2131.csv",
]
vast1, vast2, vast3 = (pd.read_csv(url, dtype=dtype) for url in csv_urls)

## Create workflow with custom verbs

You can define custom verbs for the `Workflow` in two ways:

1. Pass them through the `verbs: dict[str, Callable]` parameter of the `Workflow`.
2. Decorate your function with `@verb` so that it is discovered by the `VerbManager`. This requires either importing the module that defines the verb (so that the decorator runs) or calling the `load_verbs` function to load all the verbs in a given package.

### Custom verbs with verbs parameter

In [3]:
# Custom verbs are registered explicitly, keyed by the names used in the schema.
custom_verbs = {"genid_verb": genid, "embed_verb": embed}

# Step 1: stack the three input tables into one.
concat_step = {
    "verb": "concat",
    "input": {"source": "vast1", "others": ["vast2", "vast3"]},
}
# Step 2: turn the raw timestamp text into a "datetime" column.
convert_step = {
    "verb": "convert",
    "args": {
        "column": "date(yyyyMMddHHmmss)",
        "to": "datetime",
        "type": "date",
        "formatPattern": "yyyyMMddHHmmss",
    },
}
# Step 3: keep only the columns used downstream.
select_step = {
    "verb": "select",
    "args": {"columns": ["type", "datetime", "author", "message"]},
}
# Step 4: custom verb that writes an "id" column derived from the listed columns.
genid_step = {
    "verb": "genid_verb",
    "args": {"to": "id", "hash": ["datetime", "author", "message"]},
}
# Step 5: custom verb that writes an "embedding" column for each message.
embed_step = {"verb": "embed_verb", "args": {"to": "embedding", "column": "message"}}

workflow = Workflow(
    verbs=custom_verbs,
    schema={
        "steps": [concat_step, convert_step, select_step, genid_step, embed_step]
    },
    input_tables={"vast1": vast1, "vast2": vast2, "vast3": vast3},
    validate=False,
    schema_path="../../schema/workflow.json",
)

workflow.run()
result = workflow.output()
result

Unnamed: 0,type,datetime,author,message,id,embedding
0,mbdata,2014-01-23 17:00:00,POK,Follow us @POK-Kronos,hash(2014-01-23 17:00:00POKFollow us @POK-Kronos),"[0.1, 0.2, 0.3]"
1,mbdata,2014-01-23 17:00:00,maha_Homeland,Don't miss a moment! Follow our live coverage...,hash(2014-01-23 17:00:00maha_HomelandDon't mis...,"[0.1, 0.2, 0.3]"
2,mbdata,2014-01-23 17:00:00,Viktor-E,Come join us in the Park! Music tonight at Abi...,hash(2014-01-23 17:00:00Viktor-ECome join us i...,"[0.1, 0.2, 0.3]"
3,mbdata,2014-01-23 17:00:00,KronosStar,POK rally to start in Abila City Park. POK lea...,hash(2014-01-23 17:00:00KronosStarPOK rally to...,"[0.1, 0.2, 0.3]"
4,mbdata,2014-01-23 17:00:00,AbilaPost,POK rally set to take place in Abila City Park...,hash(2014-01-23 17:00:00AbilaPostPOK rally set...,"[0.1, 0.2, 0.3]"
...,...,...,...,...,...,...
4058,mbdata,2014-01-23 21:33:10,plasticParts,RT @AbilaPost unknown explosion heard from the...,hash(2014-01-23 21:33:10plasticPartsRT @AbilaP...,"[0.1, 0.2, 0.3]"
4059,mbdata,2014-01-23 21:33:45,klingon4real,RT @CentralBulletin explosion heard at dancing...,hash(2014-01-23 21:33:45klingon4realRT @Centra...,"[0.1, 0.2, 0.3]"
4060,mbdata,2014-01-23 21:34:00,lindyT,RT @KronosStar There has been an explosion fro...,hash(2014-01-23 21:34:00lindyTRT @KronosStar T...,"[0.1, 0.2, 0.3]"
4061,mbdata,2014-01-23 21:34:00,dolls4sale,RT @redisrad What was that? #boom,hash(2014-01-23 21:34:00dolls4saleRT @redisrad...,"[0.1, 0.2, 0.3]"


### Custom verbs with verb decorator

See [examples.verbs](../examples/verbs.py) for the implementation of the verbs that use the `@verb` decorator.

In [7]:
import os
from datashaper.engine.verbs import load_verbs
from datashaper.execution import ExecutionNode
from collections import defaultdict
import examples.verbs as custom_verbs_module

# Load all the @verb-decorated functions from the examples.verbs package so the
# workflow below can refer to them by name without passing a `verbs` mapping.
load_verbs(custom_verbs_module)

import tracemalloc
def bytes_to_kb(bytes):
    """Convert a size in bytes to kilobytes, using 1024 bytes per kilobyte.

    Args:
        bytes: Size in bytes (may be negative when it represents a difference).

    Returns:
        The size in kilobytes as a float.
    """
    kilobytes = bytes / 1024
    return kilobytes

class MemoryProfileCallbacks:
    """Workflow callbacks that record memory usage with ``tracemalloc``.

    A snapshot is taken at the start and at the end of every step and stored
    under the step's verb name. The traced-memory counters (current size and
    peak) are sampled when the workflow starts and again when it ends. Call
    :meth:`print_stats` after a run to report the collected figures.
    """

    def __init__(self):
        # verb name -> snapshots in the order they were taken
        # (start, end, start, end, ... when a verb is used by several steps).
        self.snapshots = defaultdict(list)
        self.first_size, self.first_peak, self.second_size, self.second_peak = 0, 0, 0, 0

    def _take_snapshot(self, node):
        """Append a tracemalloc snapshot to the list kept for ``node``'s verb."""
        self.snapshots[node.verb.name].append(tracemalloc.take_snapshot())

    def on_workflow_start(self) -> None:
        """Called when the workflow starts."""
        # Forget snapshots from an earlier run so that, if this instance is
        # reused, the per-verb figures describe the same run as the peak figures.
        self.snapshots.clear()
        tracemalloc.start()
        self.first_size, self.first_peak = tracemalloc.get_traced_memory()

    def on_step_start(self, node: ExecutionNode) -> None:
        """Called when a step starts."""
        self._take_snapshot(node)

    def on_step_end(self, node: ExecutionNode) -> None:
        """Called when a step ends."""
        self._take_snapshot(node)

    def on_workflow_end(self) -> None:
        """Called when the workflow ends."""
        # Read the counters before stopping: stop() discards the traced data.
        self.second_size, self.second_peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()

    def print_stats(self):
        """Print the peak-memory growth of the run and the growth per verb.

        All figures are in kilobytes. When a verb is used by several steps the
        differences of all its start/end snapshot pairs are added together.
        Verbs without a complete start/end pair (for example a step that never
        reached its end callback) are skipped instead of raising.
        """
        total_memory_used = self.second_peak - self.first_peak
        print("diff between memory peaks start and end of workflow", bytes_to_kb(total_memory_used))

        for verb, _snapshots in self.snapshots.items():
            # Even positions hold start snapshots and odd positions the matching
            # end snapshots; zip() drops a trailing start with no end.
            # NOTE(review): assumes steps sharing a verb never overlap in time.
            pairs = list(zip(_snapshots[0::2], _snapshots[1::2]))
            if not pairs:
                continue
            diff = 0
            for start, end in pairs:
                top_stats = end.compare_to(start, "lineno")
                diff += sum(bytes_to_kb(stat.size_diff) for stat in top_stats)
            print(f"diff in kb between start and end of verb {verb}", diff)

# Run the same workflow ten times, printing the memory statistics of every run
# so that run-to-run variation is visible.
for _ in range(10):
    # The step dicts are rebuilt on every pass, so each Workflow receives its
    # own schema objects.
    concat_step = {
        "verb": "concat",
        "input": {"source": "vast1", "others": ["vast2", "vast3"]},
    }
    convert_step = {
        "verb": "convert",
        "args": {
            "column": "date(yyyyMMddHHmmss)",
            "to": "datetime",
            "type": "date",
            "formatPattern": "yyyyMMddHHmmss",
        },
    }
    select_step = {
        "verb": "select",
        "args": {"columns": ["type", "datetime", "author", "message"]},
    }
    genid_step = {
        "verb": "genid",
        "args": {"to": "id", "hash": ["datetime", "author", "message"]},
    }
    embed_step = {"verb": "embed", "args": {"to": "embedding", "column": "message"}}

    # No `verbs=` argument is passed here: "genid" and "embed" use the @verb
    # decorator and have already been loaded into the VerbManager.
    workflow = Workflow(
        schema={
            "steps": [concat_step, convert_step, select_step, genid_step, embed_step]
        },
        input_tables={"vast1": vast1, "vast2": vast2, "vast3": vast3},
        validate=False,
        schema_path="../../schema/workflow.json",
    )
    # A fresh callbacks object per run keeps each run's snapshots separate.
    profile_callbacks = MemoryProfileCallbacks()
    workflow.run(workflow_callbacks=profile_callbacks)
    profile_callbacks.print_stats()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2705.8974609375
diff in kb between start and end of verb concat 230.193359375
diff in kb between start and end of verb convert 71.6396484375
diff in kb between start and end of verb select 129.521484375
diff in kb between start and end of verb genid 682.080078125
diff in kb between start and end of verb embed 378.7685546875


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2701.529296875
diff in kb between start and end of verb concat 227.130859375
diff in kb between start and end of verb convert 71.4755859375
diff in kb between start and end of verb select 129.37109375
diff in kb between start and end of verb genid 681.994140625
diff in kb between start and end of verb embed 378.6865234375


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2699.4482421875
diff in kb between start and end of verb concat 227.130859375
diff in kb between start and end of verb convert 70.1064453125
diff in kb between start and end of verb select 129.1513671875
diff in kb between start and end of verb genid 681.4501953125
diff in kb between start and end of verb embed 378.73828125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2707.2451171875
diff in kb between start and end of verb concat 228.193359375
diff in kb between start and end of verb convert 77.4677734375
diff in kb between start and end of verb select 129.1298828125
diff in kb between start and end of verb genid 680.8447265625
diff in kb between start and end of verb embed 378.7216796875


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2700.451171875
diff in kb between start and end of verb concat 227.130859375
diff in kb between start and end of verb convert 70.109375
diff in kb between start and end of verb select 129.431640625
diff in kb between start and end of verb genid 682.2265625
diff in kb between start and end of verb embed 378.681640625


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2697.8837890625
diff in kb between start and end of verb concat 227.130859375
diff in kb between start and end of verb convert 68.8134765625
diff in kb between start and end of verb select 129.431640625
diff in kb between start and end of verb genid 680.841796875
diff in kb between start and end of verb embed 378.724609375


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2707.470703125
diff in kb between start and end of verb concat 228.193359375
diff in kb between start and end of verb convert 77.6416015625
diff in kb between start and end of verb select 129.236328125
diff in kb between start and end of verb genid 680.78515625
diff in kb between start and end of verb embed 378.669921875


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2701.6337890625
diff in kb between start and end of verb concat 227.130859375
diff in kb between start and end of verb convert 72.1220703125
diff in kb between start and end of verb select 129.431640625
diff in kb between start and end of verb genid 681.3369140625
diff in kb between start and end of verb embed 378.818359375


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


diff between memory peaks start and end of workflow 2697.818359375
diff in kb between start and end of verb concat 227.130859375
diff in kb between start and end of verb convert 69.109375
diff in kb between start and end of verb select 129.318359375
diff in kb between start and end of verb genid 680.955078125
diff in kb between start and end of verb embed 380.5224609375
diff between memory peaks start and end of workflow 2706.9404296875
diff in kb between start and end of verb concat 228.044921875
diff in kb between start and end of verb convert 77.208984375
diff in kb between start and end of verb select 129.521484375
diff in kb between start and end of verb genid 680.6142578125
diff in kb between start and end of verb embed 378.6630859375


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: hash_row(row), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[to] = df.apply(lambda row: _embed(row[column]), axis=1)


In [6]:
# `profile_callbacks` is the instance created in the final iteration of the
# profiling loop, so this re-prints the statistics of that last run only.
profile_callbacks.print_stats()

diff between memory peaks start and end of workflow 2707.0087890625
diff in kb between start and end of verb concat 230.193359375
diff in kb between start and end of verb convert 72.3310546875
diff in kb between start and end of verb select 129.521484375
diff in kb between start and end of verb genid 682.443359375
diff in kb between start and end of verb embed 378.7685546875


295.56640625