In [1]:
import asyncio

import numpy as np
import pandas as pd

In [2]:
from datashaper import Table, VerbCallbacks, new_row, parallel_verb


@parallel_verb(name="embed_verb_for_df")
async def embed_verb_for_df(
    chunk: Table, callbacks: VerbCallbacks, column: str, to: str, **_kwargs: dict
) -> Table:
    """Stub embedding verb that operates on a dataframe chunk.

    Each value in ``column`` is paired with a random 10-element vector (a
    stand-in for a real embedding), which is stored in the ``to`` column.

    Args:
        chunk: The table chunk passed to this invocation.
        callbacks: Verb callbacks; not used by this stub.
        column: Name of the column to "embed"; its values are ignored.
        to: Name of the column that receives the random vectors.
        **_kwargs: Any additional verb arguments; ignored.

    Returns:
        A copy of ``chunk`` with the ``to`` column added. The incoming chunk
        is left untouched.
    """
    # Work on a copy rather than assigning onto the argument: the chunk may be
    # a slice of the caller's table, and adding a column to such a slice can
    # trigger pandas' SettingWithCopyWarning.
    embedded = chunk.copy()
    embedded[to] = embedded[column].apply(lambda _x: np.random.rand(10))
    # Simulate the latency of a real embedding call.
    await asyncio.sleep(1)
    return embedded


@parallel_verb(name="embed_verb_for_row")
async def embed_verb_for_row(
    chunk: tuple, callbacks: VerbCallbacks, to: str, column: str, **_kwargs: dict
) -> tuple:
    """Stub embedding verb that operates on a single row.

    Generates a random 10-element vector (a stand-in for a real embedding),
    waits one second, and hands the row, the ``to`` name and the vector to
    ``new_row``.

    Args:
        chunk: The row passed to this invocation.
        callbacks: Verb callbacks; not used by this stub.
        to: Name under which the random vector is stored.
        column: Name of the column to "embed"; not read by this stub.
        **_kwargs: Any additional verb arguments; ignored.

    Returns:
        Whatever ``new_row(chunk, to, vector)`` produces — presumably the row
        extended with the new value (confirm against datashaper).
    """
    fake_embedding = np.random.rand(10)
    # Simulate the latency of a real embedding call.
    await asyncio.sleep(1)
    return new_row(chunk, to, fake_embedding)

In [3]:
# Base URL of the repository hosting the three VAST CSV files.
gh_url = "https://raw.githubusercontent.com/darthtrevino/vast-mc3-data/main"
# Read the timestamp column as a string rather than letting pandas infer a type.
dtype = {"date(yyyyMMddHHmmss)": "string"}


def _read_vast_csv(url: str) -> pd.DataFrame:
    """Read one CSV from ``url`` using the shared ``dtype`` mapping."""
    return pd.read_csv(url, dtype=dtype)


vast1 = _read_vast_csv(f"{gh_url}/csv-1700-1830.csv")
vast2 = _read_vast_csv(f"{gh_url}/csv-1831-2000.csv")
vast3 = _read_vast_csv(f"{gh_url}/csv-2001-2131.csv")

In [4]:
from datashaper.execution.execution_node import ExecutionNode
from datashaper.progress.types import Progress
from datashaper.workflow.workflow import Workflow
from datashaper.workflow.workflow_callbacks.noop_workflow_callback import (
    NoopWorkflowCallbacks,
)


class Callbacks(NoopWorkflowCallbacks):
    """Workflow callbacks that echo step progress to stdout."""

    def on_step_progress(self, node: ExecutionNode, progress: Progress) -> None:
        """Print the progress update reported for a step.

        Args:
            node: The execution node reporting progress; not used here.
            progress: The progress update; its string form is printed.
        """
        message = f"Progress: {progress}"
        print(message)


workflow = Workflow(
    schema={
        "steps": [
            {
                "verb": "concat",
                "input": {"source": "vast1", "others": ["vast2", "vast3"]},
            },
            {
                "verb": "convert",
                "args": {
                    "column": "date(yyyyMMddHHmmss)",
                    "to": "datetime",
                    "type": "date",
                    "formatPattern": "yyyyMMddHHmmss",
                },
            },
            {
                "verb": "select",
                "args": {"columns": ["type", "datetime", "author", "message"]},
            },
            {
                "verb": "embed_verb_for_df",
                "args": {"to": "embedding", "column": "message", "chunk_size": 50},
            },
        ]
    },
    input_tables={"vast1": vast1, "vast2": vast2, "vast3": vast3},
)

await workflow.run(callbacks=Callbacks())
result = workflow.output()
result

Progress: Progress(percent=0, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=1, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=0, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=1, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=0, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=1, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=0, description=None, total_items=None, completed_items=None)


  return bound(*args, **kwds)


Progress: Progress(percent=None, description=None, total_items=82, completed_items=1)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=2)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=3)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=4)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=5)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=6)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=7)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=8)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=9)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=10)
Progress: Progress(percent=None, description=None, total_items=82, completed_items=11)
Progress: Progress(percent=None, description=None, t

Unnamed: 0,type,datetime,author,message,embedding
0,mbdata,2014-01-23 17:00:00,POK,Follow us @POK-Kronos,"[0.992762302137496, 0.8471110011638578, 0.0234..."
1,mbdata,2014-01-23 17:00:00,maha_Homeland,Don't miss a moment! Follow our live coverage...,"[0.9985802890794845, 0.7518555961995833, 0.817..."
2,mbdata,2014-01-23 17:00:00,Viktor-E,Come join us in the Park! Music tonight at Abi...,"[0.46299571660610683, 0.32689315751677084, 0.3..."
3,mbdata,2014-01-23 17:00:00,KronosStar,POK rally to start in Abila City Park. POK lea...,"[0.5774714595393775, 0.6340547088991296, 0.765..."
4,mbdata,2014-01-23 17:00:00,AbilaPost,POK rally set to take place in Abila City Park...,"[0.35657516070159045, 0.10572059178242377, 0.3..."
...,...,...,...,...,...
4058,mbdata,2014-01-23 21:33:10,plasticParts,RT @AbilaPost unknown explosion heard from the...,"[0.8149193811879802, 0.8466132815064984, 0.547..."
4059,mbdata,2014-01-23 21:33:45,klingon4real,RT @CentralBulletin explosion heard at dancing...,"[0.25698503222622127, 0.5230854547999364, 0.06..."
4060,mbdata,2014-01-23 21:34:00,lindyT,RT @KronosStar There has been an explosion fro...,"[0.11418371721025533, 0.8810552054131907, 0.72..."
4061,mbdata,2014-01-23 21:34:00,dolls4sale,RT @redisrad What was that? #boom,"[0.3123928648418389, 0.8356350850641252, 0.576..."


In [5]:
from datashaper.execution.execution_node import ExecutionNode
from datashaper.progress.types import Progress
from datashaper.workflow.workflow import Workflow
from datashaper.workflow.workflow_callbacks.noop_workflow_callback import (
    NoopWorkflowCallbacks,
)


class Callbacks(NoopWorkflowCallbacks):
    """Workflow callbacks that report each step progress event on stdout."""

    def on_step_progress(self, node: ExecutionNode, progress: Progress) -> None:
        """Write the received progress update to stdout.

        Args:
            node: The execution node reporting progress; not used here.
            progress: The progress update; its string form is printed.
        """
        line = f"Progress: {progress}"
        print(line)


workflow = Workflow(
    schema={
        "steps": [
            {
                "verb": "concat",
                "input": {"source": "vast1", "others": ["vast2", "vast3"]},
            },
            {
                "verb": "convert",
                "args": {
                    "column": "date(yyyyMMddHHmmss)",
                    "to": "datetime",
                    "type": "date",
                    "formatPattern": "yyyyMMddHHmmss",
                },
            },
            {
                "verb": "select",
                "args": {"columns": ["type", "datetime", "author", "message"]},
            },
            {
                "verb": "embed_verb_for_row",
                "args": {
                    "to": "embedding",
                    "column": "message",
                    "max_parallelism": 256,
                },
            },
        ]
    },
    input_tables={"vast1": vast1, "vast2": vast2, "vast3": vast3},
)

await workflow.run(callbacks=Callbacks())
result = workflow.output()
result

Progress: Progress(percent=0, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=1, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=0, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=1, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=0, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=1, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=0, description=None, total_items=None, completed_items=None)
Progress: Progress(percent=None, description=None, total_items=4063, completed_items=1)
Progress: Progress(percent=None, description=None, total_items=4063, completed_items=2)
Progress: Progress(percent=None, description=None, total_items=4063, completed_items=3)
Progress: Progress(percent=None, description=None, total_items=4063, completed_items=4)
Progress: Progress(percent=None,

Unnamed: 0,Index,type,datetime,author,message,embedding
0,0,mbdata,2014-01-23 17:00:00,POK,Follow us @POK-Kronos,"[0.7261302990077598, 0.3656781673448264, 0.628..."
1,1,mbdata,2014-01-23 17:00:00,maha_Homeland,Don't miss a moment! Follow our live coverage...,"[0.4540679483368807, 0.7026164705691877, 0.214..."
2,2,mbdata,2014-01-23 17:00:00,Viktor-E,Come join us in the Park! Music tonight at Abi...,"[0.9526399323973125, 0.056928101809164855, 0.0..."
3,3,mbdata,2014-01-23 17:00:00,KronosStar,POK rally to start in Abila City Park. POK lea...,"[0.1366551234486959, 0.8261510701606365, 0.644..."
4,4,mbdata,2014-01-23 17:00:00,AbilaPost,POK rally set to take place in Abila City Park...,"[0.7359758903009926, 0.11936404559212188, 0.38..."
...,...,...,...,...,...,...
4058,4058,mbdata,2014-01-23 21:33:10,plasticParts,RT @AbilaPost unknown explosion heard from the...,"[0.8091847262774975, 0.8694422182168504, 0.425..."
4059,4059,mbdata,2014-01-23 21:33:45,klingon4real,RT @CentralBulletin explosion heard at dancing...,"[0.4276384589752016, 0.6595713997793372, 0.484..."
4060,4060,mbdata,2014-01-23 21:34:00,lindyT,RT @KronosStar There has been an explosion fro...,"[0.33691410898314345, 0.8505349866410271, 0.01..."
4061,4061,mbdata,2014-01-23 21:34:00,dolls4sale,RT @redisrad What was that? #boom,"[0.5203900721912476, 0.35986282416034265, 0.45..."
