In [1]:
import pandas as pd
from datashaper import Workflow

# Jupyter already has an asyncio event loop running, so allow nested use of
# that loop before anything below tries to run async code from a cell.
import nest_asyncio

nest_asyncio.apply()

In [2]:
# Base URL under which the three CSV extracts are published.
gh_url = "https://raw.githubusercontent.com/darthtrevino/vast-mc3-data/main"

# Read the timestamp column with the pandas "string" dtype instead of letting
# read_csv infer a type for it.
dtype = {"date(yyyyMMddHHmmss)": "string"}

# Load the three files with identical options, one table per file.
vast1, vast2, vast3 = (
    pd.read_csv(csv_url, dtype=dtype)
    for csv_url in (
        f"{gh_url}/csv-1700-1830.csv",
        f"{gh_url}/csv-1831-2000.csv",
        f"{gh_url}/csv-2001-2131.csv",
    )
)

In [3]:
from datashaper.engine.verbs import verb
from datashaper import TableContainer
from datashaper.engine.verbs.verb_input import VerbInput
import numpy as np
import asyncio


async def process_chunk(chunk, column, to, dim=10, delay=3):
    """Attach a random placeholder "embedding" to every row of ``chunk``.

    Args:
        chunk: DataFrame slice to process. It is not modified.
        column: Name of the column whose rows drive the per-row computation
            (the values themselves are ignored by this placeholder).
        to: Name of the column that receives the generated vectors.
        dim: Length of each random vector. Defaults to 10.
        delay: Seconds to ``await`` before returning, standing in for the
            latency of a real asynchronous call. Defaults to 3.

    Returns:
        A new DataFrame with the same rows and index as ``chunk`` plus the
        ``to`` column, where each cell holds a NumPy array of ``dim`` uniform
        random floats.
    """
    embeddings = chunk[column].apply(lambda _value: np.random.rand(dim))

    # Yield to the event loop so sibling chunks can make progress concurrently.
    await asyncio.sleep(delay)

    # Work on a copy: ``chunk`` is typically a slice of the caller's table, and
    # writing a column into it would mutate the caller's data (and can raise
    # SettingWithCopyWarning).
    result = chunk.copy()
    result[to] = embeddings
    return result


@verb(name="test_1")
async def test_1(input: VerbInput, column: str, to: str):
    """Custom verb: process the source table in three concurrent chunks.

    Args:
        input: Verb input; only ``input.source.table`` is read.
        column: Source column name, forwarded to ``process_chunk``.
        to: Name of the output column, forwarded to ``process_chunk``.

    Returns:
        A ``TableContainer`` wrapping the processed chunks concatenated back
        together in their original row order.
    """
    input_table = input.source.table

    # Split row *positions* rather than the DataFrame itself: calling
    # np.array_split directly on a DataFrame relies on DataFrame.swapaxes,
    # which recent pandas releases deprecate. The resulting chunk boundaries
    # are the same.
    position_groups = np.array_split(np.arange(len(input_table)), 3)
    chunks = [input_table.iloc[positions] for positions in position_groups]

    # gather() awaits all chunk coroutines together and returns their results
    # in the order the coroutines were passed in.
    processed = await asyncio.gather(
        *(process_chunk(chunk, column, to) for chunk in chunks)
    )

    return TableContainer(pd.concat(processed))

In [4]:
# Declare the pipeline: four steps over the three tables loaded earlier.
workflow = Workflow(
    schema={
        "steps": [
            # Step 1: combine vast1 with vast2 and vast3 into a single table.
            {
                "verb": "concat",
                "input": {"source": "vast1", "others": ["vast2", "vast3"]},
            },
            # Step 2: convert the raw timestamp strings into a new "datetime"
            # column of type "date", using the yyyyMMddHHmmss pattern.
            {
                "verb": "convert",
                "args": {
                    "column": "date(yyyyMMddHHmmss)",
                    "to": "datetime",
                    "type": "date",
                    "formatPattern": "yyyyMMddHHmmss",
                },
            },
            # Step 3: keep only the four columns used downstream.
            {
                "verb": "select",
                "args": {"columns": ["type", "datetime", "author", "message"]},
            },
            # Step 4: the custom async verb registered in this notebook as
            # "test_1"; reads "message" and writes "embedding".
            {"verb": "test_1", "args": {"to": "embedding", "column": "message"}},
        ]
    },
    # Names referenced by the "input" entries above map to these DataFrames.
    input_tables={"vast1": vast1, "vast2": vast2, "vast3": vast3},
    # NOTE(review): presumably skips validating `schema` against the JSON
    # schema at `schema_path` -- confirm against the Workflow constructor.
    validate=False,
    schema_path="../../schema/workflow.json",
)

In [5]:
# Run the workflow configured above (presumably executes its steps in order,
# including the async "test_1" verb -- confirm against Workflow.run).
workflow.run()

In [6]:
# Fetch the workflow's output and show it as the cell result.
result = workflow.output()
result

Unnamed: 0,type,datetime,author,message,embedding
0,mbdata,2014-01-23 17:00:00,POK,Follow us @POK-Kronos,"[0.12631154803472355, 0.6139963808821153, 0.20..."
1,mbdata,2014-01-23 17:00:00,maha_Homeland,Don't miss a moment! Follow our live coverage...,"[0.6047053624898311, 0.4271170859921898, 0.398..."
2,mbdata,2014-01-23 17:00:00,Viktor-E,Come join us in the Park! Music tonight at Abi...,"[0.6006147113881228, 0.9404353316344138, 0.309..."
3,mbdata,2014-01-23 17:00:00,KronosStar,POK rally to start in Abila City Park. POK lea...,"[0.5130750837810296, 0.43976497401861225, 0.23..."
4,mbdata,2014-01-23 17:00:00,AbilaPost,POK rally set to take place in Abila City Park...,"[0.04274277429254569, 0.2889262648531674, 0.55..."
...,...,...,...,...,...
4058,mbdata,2014-01-23 21:33:10,plasticParts,RT @AbilaPost unknown explosion heard from the...,"[0.6979623541778851, 0.013080035921973932, 0.4..."
4059,mbdata,2014-01-23 21:33:45,klingon4real,RT @CentralBulletin explosion heard at dancing...,"[0.4857120090086714, 0.5920785100183013, 0.762..."
4060,mbdata,2014-01-23 21:34:00,lindyT,RT @KronosStar There has been an explosion fro...,"[0.22702872474706615, 0.23524780678728852, 0.9..."
4061,mbdata,2014-01-23 21:34:00,dolls4sale,RT @redisrad What was that? #boom,"[0.07235046713414961, 0.6564293440420865, 0.56..."
