In [4]:
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License.

## Index Migration (v2 to v3)

This notebook is used to maintain data model parity with older indexes for version 3.0 of GraphRAG. If you have a pre-3.0 index and need to migrate without re-running the entire pipeline, you can use this notebook to only update the pieces necessary for alignment. If you have a pre-2.0 index, please run the v2 migration notebook first!

NOTE: We recommend regenerating your settings.yaml with the latest version of GraphRAG using `graphrag init`, then copying your LLM settings into it before running this notebook. This ensures your config is aligned with the latest version for the migration.

WARNING: This will overwrite your parquet files, so you may want to make a backup first!

In [None]:
# Directory that contains your settings.yaml. Replace the placeholder with the
# real path before running the remaining cells; the config is loaded from here.
PROJECT_DIRECTORY = "<your project directory>"

In [6]:
from pathlib import Path

from graphrag.config.load_config import load_config
from graphrag.storage.factory import StorageFactory

# Read the project's settings from PROJECT_DIRECTORY.
config = load_config(Path(PROJECT_DIRECTORY))

# The dumped `output` section drives storage creation: its "type" entry selects
# the storage type and the whole dict is forwarded as `kwargs`.
storage_config = config.output.model_dump()
storage = StorageFactory().create_storage(
    kwargs=storage_config,
    storage_type=storage_config["type"],
)

In [7]:
def remove_columns(df, columns):
    """Drop ``columns`` from ``df`` in place.

    Names that are not present in the frame are skipped silently instead of
    raising. The frame is mutated; nothing is returned.
    """
    df.drop(columns=columns, errors="ignore", inplace=True)

In [8]:
from graphrag.utils.storage import (
    load_table_from_storage,
    write_table_to_storage,
)

text_units = await load_table_from_storage("text_units", storage)

text_units["document_id"] = text_units["document_ids"].apply(lambda ids: ids[0])
remove_columns(text_units, ["document_ids"])

await write_table_to_storage(text_units, "text_units", storage)