In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

### Pipeline examples: custom input
#### Create a pipeline from a YAML file using the `fill` verb

In [2]:
import pandas as pd

from pathlib import Path

# Pipeline definition that is written to disk and later handed to the runner by path.
# It configures console reporting, in-memory storage and cache, and one anonymous
# workflow with a single `fill` step. It has no `input` section.
PIPELINE_YAML = """

# Setup reporting however you'd like
reporting:
  type: console

# Setup storage however you'd like
storage:
  type: memory

# Setup cache however you'd like
cache: 
  type: memory

# Just a simple workflow
workflows:

  # This is an anonymous workflow, it doesn't have a name
  - steps:

    # Unpack the nodes from the graph
    - verb: fill
      args:
        to: filled_column
        value: "Filled Value"
"""

# Write the definition into the current working directory.
# `Path.cwd()` is a classmethod, so no throwaway `Path()` instance is needed.
pipeline_file = Path.cwd() / "pipeline.yaml"
# Pin the encoding so the bytes on disk do not depend on the platform's locale default.
with pipeline_file.open("w", encoding="utf-8") as file:
    file.write(PIPELINE_YAML)
    

#### Load the dataset and run the pipeline with the `fill` verb

In [3]:

def _load_dataset_some_unique_way() -> pd.DataFrame:
    """Return the small in-memory example table.

    Stands in for whatever custom loading logic a real project would use.

    Returns:
        A two-row DataFrame with the columns ``col1`` and ``col2``.
    """
    column_values = {"col1": [2, 5], "col2": [4, 10]}
    return pd.DataFrame(column_values)


# Build the table that is handed to the pipeline as its dataset
dataset = _load_dataset_some_unique_way()

In [4]:

# Load your config without the input section
# `config` is the path (as a string) of the pipeline YAML file written earlier.
# That file defines no `input` section; the dataset is passed separately when
# the pipeline is run.
config = str(pipeline_file)

In [5]:

from graphrag.index import run_pipeline_with_config


# Grab the last result from the pipeline, should be our entity extraction
outputs = []
async for output in run_pipeline_with_config(
    config_or_path=config, dataset=dataset
):
    outputs.append(output)
pipeline_result = outputs[-1]

if pipeline_result.result is not None:
    # Should look something like
    #            col1  col2 filled_column
    # 0     2     4  Filled Value
    # 1     5    10  Filled Value
    print(pipeline_result.result)
else:
    print("No results!")

   col1  col2 filled_column
0     2     4  Filled Value
1     5    10  Filled Value


#### Create a pipeline with the Python API using the `derive` verb

In [6]:

from graphrag.index import run_pipeline
from graphrag.index.config import PipelineWorkflowReference

# One workflow with a single step that uses the built-in `derive` verb.
# Verb source: https://github.com/microsoft/datashaper/blob/main/python/datashaper/datashaper/engine/verbs/derive.py
# The step names no explicit input because it acts on the default input.
workflows: list[PipelineWorkflowReference] = []
workflows.append(
    PipelineWorkflowReference(
        steps=[
            {
                "verb": "derive",
                "args": {
                    "column1": "col1",  # first dataset column to combine
                    "column2": "col2",  # second dataset column to combine
                    "to": "col_multiplied",  # name of the new column
                    "operator": "*",  # multiply the two columns
                },
            }
        ]
    )
)


#### Load the dataset and run the pipeline with the `derive` verb

In [7]:

# Two-row example table with columns `col1` and `col2`, built from row
# values plus explicit column names.
dataset = pd.DataFrame(data=[[2, 4], [5, 10]], columns=["col1", "col2"])


In [8]:
outputs = []
async for output in run_pipeline(dataset=dataset, workflows=workflows):
    outputs.append(output)
pipeline_result = outputs[-1]
print(pipeline_result)

PipelineRunResult(workflow='Anonymous Workflow 1', result=   col1  col2  col_multiplied
0     2     4               8
1     5    10              50, errors=None)
