In [1]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

### Multiple workflows
#### Create pipeline workflow file 1 (`workflow_1.yml`)

In [2]:
import pandas as pd
import os 
from pathlib import Path

# Directory that receives the workflow definition files written by this
# notebook. Kept as a plain string (trailing slash included) because later
# cells wrap it with Path(workflow_dir).
workflow_dir = os.path.join(Path.cwd(), "workflows/")
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(workflow_dir, exist_ok=True)


# workflow_1: a single `fill` step that writes the constant 1 into the
# column "col_workflow_1".
PIPELINE_YAML = """
name: workflow_1
steps:
  - verb: fill
    args:
      to: "col_workflow_1"
      value: 1

 
"""
pipeline_file = Path(workflow_dir) / "workflow_1.yml"
# Explicit encoding so the written file does not depend on the platform default.
with pipeline_file.open("w", encoding="utf-8") as file:
    file.write(PIPELINE_YAML)
    


#### Create pipeline workflow file 2 (`workflow_2.yml`) and its shared value file

In [3]:
import pandas as pd
import os 
from pathlib import Path
 

# workflow_2 holds two `fill` steps: the first declares workflow_1 as its
# input source, the second takes its value from a shared text file through
# the !include tag.
PIPELINE_YAML = """
name: workflow_2
steps:
  - verb: fill
    args:
      to: "col_workflow_2"
      value: 2
    input: 

      # workflow_2 is dependent on workflow_1
      # so in workflow_2 output, you'll also see the output from workflow_1
      source: "workflow:workflow_1"

  # Example of pulling in values from a shared file
  - verb: fill
    args:
      to: "col_from_shared_file"
      value: !include ./shared/shared_fill_value.txt


 
"""
# Store the definition next to the other workflow files.
pipeline_file = Path(workflow_dir, "workflow_2.yml")
with pipeline_file.open("w") as yaml_out:
    yaml_out.write(PIPELINE_YAML)
    

# Directory for files that workflow definitions pull in with !include.
# Kept as a string so the value displayed at the end of the cell is unchanged.
workflow_shared_dir = os.path.join(Path.cwd(), "workflows/shared")
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(workflow_shared_dir, exist_ok=True)


# Text that workflow_2.yml references as
# `!include ./shared/shared_fill_value.txt`.
# NOTE(review): the content ends with a newline; confirm how the !include
# loader treats it before relying on the exact fill value.
PIPELINE_shared_file = """value_from_shared_file
"""

workflow_shared_file = Path(workflow_shared_dir) / "shared_fill_value.txt"
# Explicit encoding so the written file does not depend on the platform default.
with workflow_shared_file.open("w", encoding="utf-8") as file:
    file.write(PIPELINE_shared_file)

# Display the directory that now holds the shared file.
workflow_shared_dir


'/media/gpt4-pdf-chatbot-langchain/graphrag/examples_notebooks/6_multiple_workflows/workflows/shared'

#### Create pipeline workflow file 3 (`workflow_3.yml`)

In [4]:
import pandas as pd
import os 
from pathlib import Path
 


# workflow_3: a single `fill` step that writes the constant 3 into the
# column "col_workflow_3".
PIPELINE_YAML = """
name: workflow_3
steps:
  - verb: fill
    args:
      to: "col_workflow_3"
      value: 3

 
"""
# Store the definition next to the other workflow files.
pipeline_file = Path(workflow_dir, "workflow_3.yml")
with pipeline_file.open("w") as yaml_out:
    yaml_out.write(PIPELINE_YAML)
    


#### The `workflows` section of the pipeline configuration file lists three workflow files, each referenced with the `!include` syntax.

In [5]:


# Top-level pipeline configuration: it only lists the three workflow files,
# each pulled in with an !include tag relative to this file's directory.
PIPELINE_YAML = """
workflows:
  - !include workflows/workflow_1.yml
  - !include workflows/workflow_2.yml
  - !include workflows/workflow_3.yml
 
"""
# The configuration lives directly in the current working directory.
pipeline_file = Path.cwd() / "pipeline.yaml"
with pipeline_file.open("w") as yaml_out:
    yaml_out.write(PIPELINE_YAML)
    

#### Create the input CSV file

In [6]:
# our fake dataset
import asyncio
from pathlib import Path
import pandas as pd
import os
from graphrag.index.config import PipelineCSVInputConfig
from graphrag.index.input import load_input
# Two-row sample dataset with the columns "author", "message" and
# "date(yyyyMMddHHmmss)"; the timestamp is stored as a yyyyMMddHHmmss string.
init_dataset = pd.DataFrame(
    [
        {
            "author": "aufsnn",
            "message": "Apple Inc. is an American multinational technology company headquartered in Cupertino, California. Tim Cook is the CEO of Apple.",
            "date(yyyyMMddHHmmss)": "20240709182511",
        },
        {
            "author": "dmeck",
            "message": "hello!",
            "date(yyyyMMddHHmmss)": "20240709182511",
        },
    ]
)

# Kept as a string (trailing slash included) so the value displayed at the end
# of the cell is unchanged.
sample_data_dir = os.path.join(Path.cwd(), "input/")
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(sample_data_dir, exist_ok=True)
# sample_data_dir already ends with a separator, so join the file name with
# os.path.join instead of an f-string that would yield "input//dataset.csv".
init_dataset.to_csv(os.path.join(sample_data_dir, "dataset.csv"), index=False)
sample_data_dir

'/media/gpt4-pdf-chatbot-langchain/graphrag/examples_notebooks/6_multiple_workflows/input/'

In [7]:

shared_dataset = await load_input(
        PipelineCSVInputConfig(
            file_pattern=".*\\.csv$",
            base_dir=sample_data_dir,
            source_column="author",
            text_column="message",
            timestamp_column="date(yyyyMMddHHmmss)",
            timestamp_format="%Y%m%d%H%M%S",
            title_column="message",
        ),
    )
# We're cheap, and this is an example, lets just do 10
dataset = shared_dataset.head(10)
dataset

Unnamed: 0,author,message,date(yyyyMMddHHmmss),id,source,text,title,timestamp,year,month,day,hour,minute,second
0,aufsnn,Apple Inc. is an American multinational techno...,20240709182511,a9c886cbbc2905f65301a9215514ed0b,aufsnn,Apple Inc. is an American multinational techno...,Apple Inc. is an American multinational techno...,2024-07-09 18:25:11,2024,7,9,18,25,11
1,dmeck,hello!,20240709182511,409342b4e5e4bae7204236f4bf214055,dmeck,hello!,hello!,2024-07-09 18:25:11,2024,7,9,18,25,11


In [8]:

# pipeline.yaml only lists workflows and has no input section; the dataset
# built above is handed to the pipeline directly when it is run.
# NOTE: pipeline_file is reassigned by several earlier cells; this relies on
# the pipeline.yaml cell being the most recent one to set it.
pipeline_path = os.fspath(pipeline_file)
pipeline_path

'/media/gpt4-pdf-chatbot-langchain/graphrag/examples_notebooks/6_multiple_workflows/pipeline.yaml'

In [9]:
from graphrag.index import run_pipeline_with_config


async for result in run_pipeline_with_config(pipeline_path, dataset=dataset):
    print(f"Workflow {result.workflow} result\n: ")
    print(result.result)

Workflow workflow_1 result
: 
   author                                            message  \
0  aufsnn  Apple Inc. is an American multinational techno...   
1   dmeck                                             hello!   

   date(yyyyMMddHHmmss)                                id  source  \
0        20240709182511  a9c886cbbc2905f65301a9215514ed0b  aufsnn   
1        20240709182511  409342b4e5e4bae7204236f4bf214055   dmeck   

                                                text  \
0  Apple Inc. is an American multinational techno...   
1                                             hello!   

                                               title           timestamp  \
0  Apple Inc. is an American multinational techno... 2024-07-09 18:25:11   
1                                             hello! 2024-07-09 18:25:11   

   year  month  day  hour  minute  second  col_workflow_1  
0  2024      7    9    18      25      11               1  
1  2024      7    9    18      25      11          