<a href="https://colab.research.google.com/github/mangohehe/rags/blob/main/ms_graphrag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

[Run this notebook to try out Microsoft GraphRAG, following the official Get Started guide](https://microsoft.github.io/graphrag/posts/get_started/)

In [None]:
# @title Install and Import Dependencies
!pip install pyyaml
!pip install python-dotenv

from dotenv import load_dotenv
from dotenv import set_key
import os
import yaml

In [None]:
# @title Install Openai
!pip install openai==0.28

In [None]:
# @title Specify GraphRag Settings
OPENAI_API_KEY = "" #@param {type:"string"}
LLM_MODEL = "gpt-4o-mini" #@param {type:"string"}
TRUSTED_SOURCE_URL = "https://spindance.com/wp-content/uploads/2021/12/Job-Description-Machine-Learning-Software-Engineer-SpinDance.pdf" #@param {type:"string"}

# Leaving the key field blank falls back to the OPENAI_API_KEY environment
# variable, so the secret does not have to be saved inside the notebook file.
if not OPENAI_API_KEY:
    OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")


In [None]:
# @title Run to List Available Openai Models
import openai
import os
from IPython.display import display, Markdown, Latex

# Hand the key entered in the settings cell to the openai module used below.
openai.api_key = OPENAI_API_KEY

def list_available_models():
    """Return the IDs of the models reported by ``openai.Model.list()``.

    Any exception raised while querying or reading the response is printed as
    ``Error: <exception>`` and an empty list is returned instead of raising.
    """
    model_ids = []
    try:
        response = openai.Model.list()
        for entry in response.data:
            model_ids.append(entry.id)
    except Exception as err:
        # Best effort: report the problem and let the notebook keep running.
        print(f"Error: {err}")
        return []
    return model_ids

# Query once, then show each model ID. An empty list means the lookup failed
# (list_available_models() prints the error and returns []), so say so instead
# of printing a bare header with nothing under it.
available_models = list_available_models()
if available_models:
    print("Available models:")
    for model in available_models:
        display(Markdown(model))
    # Flag a settings-cell model that this key cannot use before indexing starts.
    if LLM_MODEL not in available_models:
        print(f"Warning: LLM_MODEL '{LLM_MODEL}' is not in the list above.")
else:
    print("No models were returned - check OPENAI_API_KEY and network access.")

In [None]:
# @title Install GraphRag

!pip install graphrag

In [None]:
# @title Specify the trusted source
!mkdir -p ./ragtest/input
!curl {TRUSTED_SOURCE_URL} > ./ragtest/input/book.txt

In [None]:
!pip install PyPDF2

In [None]:
import PyPDF2
import re

def pdf_to_text(pdf_path, txt_path):
    """Write the extractable text of a PDF to a UTF-8 text file, one line per page.

    Args:
        pdf_path: Path of the PDF to read (opened in binary mode).
        txt_path: Path of the text file to create or overwrite.

    Every page that yields text is normalised before being written: each run
    of non-ASCII characters becomes a single space, each run of whitespace
    collapses to one space, and leading/trailing spaces are trimmed. Pages for
    which ``extract_text()`` returns nothing are skipped.
    """
    def normalise(raw):
        # First swap runs of non-ASCII characters for a space ...
        ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', raw)
        # ... then squeeze whitespace runs and trim both ends.
        return re.sub(r'\s+', ' ', ascii_only).strip()

    with open(pdf_path, 'rb') as pdf_stream:
        reader = PyPDF2.PdfReader(pdf_stream)

        # The output is opened only after the PDF has been parsed successfully.
        with open(txt_path, 'w', encoding='utf-8') as out_stream:
            for pdf_page in reader.pages:
                page_text = pdf_page.extract_text()
                if not page_text:
                    continue
                out_stream.write(f"{normalise(page_text)}\n")

In [None]:
!wget {TRUSTED_SOURCE_URL} -O ./ragtest/input/book.pdf
pdf_to_text('./ragtest/input/book.pdf', './ragtest/input/book.txt')

In [None]:
# @title Running the Indexer

# @markdown Initiate workspace variables. For the environment variables available, see [GraphRAG Configuration](https://microsoft.github.io/graphrag/posts/config/overview/)
# This will create two files: .env and settings.yaml in the ./ragtest directory.
# The ./ragtest/.env file is the one the "Specify GraphRag Env Variables" cell
# edits afterwards with set_key().
!python -m graphrag.index --init --root ./ragtest



In [None]:
# @title Run to Specify GraphRag Env Variables
# Persist the key and model chosen in the settings cell into the workspace .env file.
env_path = './ragtest/.env'
set_key(env_path, 'GRAPHRAG_API_KEY', OPENAI_API_KEY)
# Use the model selected in the settings cell rather than a hard-coded one.
set_key(env_path, 'GRAPHRAG_LLM_MODEL', LLM_MODEL)
# override=True: without it, values already present in os.environ (e.g. from an
# earlier run of this cell) win over the freshly written file and go stale.
load_dotenv(env_path, override=True)

# Never echo the full secret into cell output, which is saved with the notebook.
api_key = os.getenv('GRAPHRAG_API_KEY') or ''
if len(api_key) > 8:
    masked_key = f"{api_key[:3]}...{api_key[-4:]}"
elif api_key:
    masked_key = '(set)'
else:
    masked_key = '(not set)'

print("Environment variables updated.")
print("GRAPHRAG_API_KEY:", masked_key)
print("GRAPHRAG_LLM_MODEL:", os.getenv('GRAPHRAG_LLM_MODEL'))

In [None]:
# Show the contents of the .env file, but redact any *KEY* entry so the API key
# is not echoed into cell output that gets saved with the notebook.
with open(env_path, encoding='utf-8') as env_file:
    for env_line in env_file:
        env_line = env_line.rstrip('\n')
        var_name, separator, _ = env_line.partition('=')
        if separator and 'KEY' in var_name.upper():
            print(f"{var_name}=<redacted>")
        else:
            print(env_line)

In [None]:
# @title Running the Indexing pipeline
# Runs graphrag.index against the ./ragtest workspace; the extracted document
# text was written to ./ragtest/input/book.txt by pdf_to_text().
!python -m graphrag.index --root ./ragtest

In [None]:
# @title Running the Query Engine
# Sends a single question to graphrag.query for the ./ragtest workspace,
# using the "global" method.
!python -m graphrag.query \
--root ./ragtest \
--method global \
"What are the top themes in this story?"