In [1]:
import pandas as pd
from datashaper import Workflow
from datashaper.table_store import DiskCacheTableStore

In [2]:
# Base URL of the raw CSV files in the vast-mc3-data GitHub repository.
gh_url = "https://raw.githubusercontent.com/darthtrevino/vast-mc3-data/main"

# Read the timestamp column as a string so pandas does not turn the
# yyyyMMddHHmmss digits into a number; the workflow's "convert" step
# later in this notebook handles that column with an explicit format pattern.
dtype = {"date(yyyyMMddHHmmss)": "string"}

# One DataFrame per CSV file, downloaded in the same order as the file names.
vast1, vast2, vast3 = (
    pd.read_csv(f"{gh_url}/{csv_name}", dtype=dtype)
    for csv_name in (
        "csv-1700-1830.csv",
        "csv-1831-2000.csv",
        "csv-2001-2131.csv",
    )
)

In this example we define a DiskCacheTableStore with maxsize 5. This means that at most 5 tables are kept in memory at a time; the other tables are saved to a temp folder and loaded again when needed.

In [7]:
from datashaper.engine.verbs import load_verbs
import examples.verbs as custom_verbs_module

# Register the verbs from the examples module before the workflow is built.
# NOTE(review): presumably this supplies the non-built-in verbs used in the
# steps below -- confirm which ones against examples/verbs.
load_verbs(custom_verbs_module)

# Workflow definition: each step names its verb, its input table(s) and the
# id under which its result is stored.
workflow_schema = {
    "steps": [
        # vast1 + vast2 + vast3 -> "vast_all"
        {
            "verb": "concat",
            "input": {"source": "vast1", "others": ["vast2", "vast3"]},
            "id": "vast_all",
        },
        # "vast_all" -> "convert" (timestamp text column -> "datetime")
        {
            "verb": "convert",
            "args": {
                "column": "date(yyyyMMddHHmmss)",
                "to": "datetime",
                "type": "date",
                "formatPattern": "yyyyMMddHHmmss",
            },
            "input": "vast_all",
            "id": "convert",
        },
        # First branch: "convert" -> "selected" -> "genid" -> "embed"
        {
            "verb": "select",
            "args": {"columns": ["type", "datetime", "author", "message"]},
            "input": "convert",
            "id": "selected",
        },
        {
            "verb": "genid",
            "args": {"to": "id", "hash": ["datetime", "author", "message"]},
            "input": "selected",
            "id": "genid",
        },
        {
            "verb": "embed",
            "args": {"to": "embedding", "column": "message"},
            "input": "genid",
            "id": "embed",
        },
        # Second branch: same verbs and args, stored under the "...2" ids.
        # NOTE(review): "genid2" reads "selected" and "embed2" reads "genid"
        # (the first branch's tables), so "selected2" and "genid2" are never
        # consumed. Confirm whether this is deliberate (it makes the run
        # re-read already stored tables) or a copy-paste slip.
        {
            "verb": "select",
            "args": {"columns": ["type", "datetime", "author", "message"]},
            "input": "convert",
            "id": "selected2",
        },
        {
            "verb": "genid",
            "args": {"to": "id", "hash": ["datetime", "author", "message"]},
            "input": "selected",
            "id": "genid2",
        },
        {
            "verb": "embed",
            "args": {"to": "embedding", "column": "message"},
            "input": "genid",
            "id": "embed2",
        },
    ]
}

# The three DataFrames loaded earlier, keyed by the names the steps refer to.
source_tables = {"vast1": vast1, "vast2": vast2, "vast3": vast3}

# The table store is used as a context manager for the duration of the run.
with DiskCacheTableStore(maxsize=5) as table_store:
    workflow = Workflow(
        schema=workflow_schema,
        input_tables=source_tables,
        validate=False,
        schema_path="../../schema/workflow.json",
        table_store=table_store,
    )
    workflow.run()
    result = workflow.output()

    # Report the store's cache statistics (hits, misses, maxsize, currsize).
    # This goes through private attributes of the workflow and the store.
    cache_stats = workflow._table_store._get_caching_function().cache_info()
    print(cache_stats)

CacheInfo(hits=3, misses=8, maxsize=5, currsize=5)
