# Sample workflow for VAST MC3 data

In [1]:
import pandas as pd
from datashaper import Workflow

from examples.verbs.custom_verbs import embed_mock as embed, genid

## Load data

In [2]:
# Base URL of the repository that hosts the three VAST MC3 CSV extracts.
gh_url = "https://raw.githubusercontent.com/darthtrevino/vast-mc3-data/main"
# Read the timestamp column as a string rather than letting pandas infer a type
# (presumably so the raw digits reach the workflow's convert step unchanged -- confirm).
dtype = {"date(yyyyMMddHHmmss)": "string"}
# Fetch the three files in order and bind each resulting frame to its own name.
vast1, vast2, vast3 = (
    pd.read_csv(csv_url, dtype=dtype)
    for csv_url in (
        f"{gh_url}/csv-1700-1830.csv",
        f"{gh_url}/csv-1831-2000.csv",
        f"{gh_url}/csv-2001-2131.csv",
    )
)

## Create workflow with custom verbs

You can provide custom verbs to the `Workflow` in two ways:

1. Use the `verbs: dict[str, Callable]` parameter of the `Workflow` to provide the custom verbs.
2. Use the `@verb` decorator on your function so that it can be discovered by the VerbManager. This requires either importing the verb (so that the decorator runs) or using the `load_verbs` function to load all the verbs in a given package.

### Custom verbs with verbs parameter

In [3]:
# Individual pipeline steps, listed in the order the workflow receives them.
merge_step = {
    "verb": "concat",
    "input": {"source": "vast1", "others": ["vast2", "vast3"]},
}
parse_date_step = {
    "verb": "convert",
    "args": {
        "column": "date(yyyyMMddHHmmss)",
        "to": "datetime",
        "type": "date",
        "formatPattern": "yyyyMMddHHmmss",
    },
}
pick_columns_step = {
    "verb": "select",
    "args": {"columns": ["type", "datetime", "author", "message"]},
}
# The last two steps refer to the names registered through the `verbs` mapping below.
make_id_step = {
    "verb": "genid_verb",
    "args": {"to": "id", "hash": ["datetime", "author", "message"]},
}
embed_step = {"verb": "embed_verb", "args": {"to": "embedding", "column": "message"}}

workflow = Workflow(
    # Custom verbs are handed to this workflow directly, keyed by the names the schema uses.
    verbs={"genid_verb": genid, "embed_verb": embed},
    schema={
        "steps": [
            merge_step,
            parse_date_step,
            pick_columns_step,
            make_id_step,
            embed_step,
        ]
    },
    input_tables={"vast1": vast1, "vast2": vast2, "vast3": vast3},
    validate=False,
    schema_path="../../schema/workflow.json",
)

# Execute the pipeline and show the resulting table as the cell output.
workflow.run()
result = workflow.output()
result

Unnamed: 0,type,datetime,author,message,id,embedding
0,mbdata,2014-01-23 17:00:00,POK,Follow us @POK-Kronos,hash(2014-01-23 17:00:00POKFollow us @POK-Kronos),"[0.1, 0.2, 0.3]"
1,mbdata,2014-01-23 17:00:00,maha_Homeland,Don't miss a moment! Follow our live coverage...,hash(2014-01-23 17:00:00maha_HomelandDon't mis...,"[0.1, 0.2, 0.3]"
2,mbdata,2014-01-23 17:00:00,Viktor-E,Come join us in the Park! Music tonight at Abi...,hash(2014-01-23 17:00:00Viktor-ECome join us i...,"[0.1, 0.2, 0.3]"
3,mbdata,2014-01-23 17:00:00,KronosStar,POK rally to start in Abila City Park. POK lea...,hash(2014-01-23 17:00:00KronosStarPOK rally to...,"[0.1, 0.2, 0.3]"
4,mbdata,2014-01-23 17:00:00,AbilaPost,POK rally set to take place in Abila City Park...,hash(2014-01-23 17:00:00AbilaPostPOK rally set...,"[0.1, 0.2, 0.3]"
...,...,...,...,...,...,...
4058,mbdata,2014-01-23 21:33:10,plasticParts,RT @AbilaPost unknown explosion heard from the...,hash(2014-01-23 21:33:10plasticPartsRT @AbilaP...,"[0.1, 0.2, 0.3]"
4059,mbdata,2014-01-23 21:33:45,klingon4real,RT @CentralBulletin explosion heard at dancing...,hash(2014-01-23 21:33:45klingon4realRT @Centra...,"[0.1, 0.2, 0.3]"
4060,mbdata,2014-01-23 21:34:00,lindyT,RT @KronosStar There has been an explosion fro...,hash(2014-01-23 21:34:00lindyTRT @KronosStar T...,"[0.1, 0.2, 0.3]"
4061,mbdata,2014-01-23 21:34:00,dolls4sale,RT @redisrad What was that? #boom,hash(2014-01-23 21:34:00dolls4saleRT @redisrad...,"[0.1, 0.2, 0.3]"


### Custom verbs with verb decorator

See [examples.verbs](../examples/verbs.py) for the implementation of the verbs that use the `@verb` decorator.

In [4]:
import os
from datashaper.engine.verbs import load_verbs
import examples.verbs as custom_verbs_module

# Load the verbs from the examples package so the workflow can look them up by name.
load_verbs(custom_verbs_module)

# Steps in execution order; the final two use the verbs loaded above.
decorated_verb_steps = [
    {
        "verb": "concat",
        "input": {"source": "vast1", "others": ["vast2", "vast3"]},
    },
    {
        "verb": "convert",
        "args": {
            "column": "date(yyyyMMddHHmmss)",
            "to": "datetime",
            "type": "date",
            "formatPattern": "yyyyMMddHHmmss",
        },
    },
    {
        "verb": "select",
        "args": {"columns": ["type", "datetime", "author", "message"]},
    },
    {
        "verb": "genid",
        "args": {"to": "id", "hash": ["datetime", "author", "message"]},
    },
    {"verb": "embed", "args": {"to": "embedding", "column": "message"}},
]

# No `verbs=` mapping is passed here: the @verb decorator is used and the verbs
# are loaded into the VerbManager, so the schema refers to them as "genid" and "embed".
workflow = Workflow(
    schema={"steps": decorated_verb_steps},
    input_tables={"vast1": vast1, "vast2": vast2, "vast3": vast3},
    validate=False,
    schema_path="../../schema/workflow.json",
)

# Execute the pipeline and show the resulting table as the cell output.
workflow.run()
result = workflow.output()
result

Unnamed: 0,type,datetime,author,message,id,embedding
0,mbdata,2014-01-23 17:00:00,POK,Follow us @POK-Kronos,hash(2014-01-23 17:00:00POKFollow us @POK-Kronos),"[0.1, 0.2, 0.3]"
1,mbdata,2014-01-23 17:00:00,maha_Homeland,Don't miss a moment! Follow our live coverage...,hash(2014-01-23 17:00:00maha_HomelandDon't mis...,"[0.1, 0.2, 0.3]"
2,mbdata,2014-01-23 17:00:00,Viktor-E,Come join us in the Park! Music tonight at Abi...,hash(2014-01-23 17:00:00Viktor-ECome join us i...,"[0.1, 0.2, 0.3]"
3,mbdata,2014-01-23 17:00:00,KronosStar,POK rally to start in Abila City Park. POK lea...,hash(2014-01-23 17:00:00KronosStarPOK rally to...,"[0.1, 0.2, 0.3]"
4,mbdata,2014-01-23 17:00:00,AbilaPost,POK rally set to take place in Abila City Park...,hash(2014-01-23 17:00:00AbilaPostPOK rally set...,"[0.1, 0.2, 0.3]"
...,...,...,...,...,...,...
4058,mbdata,2014-01-23 21:33:10,plasticParts,RT @AbilaPost unknown explosion heard from the...,hash(2014-01-23 21:33:10plasticPartsRT @AbilaP...,"[0.1, 0.2, 0.3]"
4059,mbdata,2014-01-23 21:33:45,klingon4real,RT @CentralBulletin explosion heard at dancing...,hash(2014-01-23 21:33:45klingon4realRT @Centra...,"[0.1, 0.2, 0.3]"
4060,mbdata,2014-01-23 21:34:00,lindyT,RT @KronosStar There has been an explosion fro...,hash(2014-01-23 21:34:00lindyTRT @KronosStar T...,"[0.1, 0.2, 0.3]"
4061,mbdata,2014-01-23 21:34:00,dolls4sale,RT @redisrad What was that? #boom,hash(2014-01-23 21:34:00dolls4saleRT @redisrad...,"[0.1, 0.2, 0.3]"
