Fix/feat: Implementation of Minute-Based Rate Limiting in CommunityReportsExtractor Using asyncio and async_mode #373

Merged: 6 commits, Jul 5, 2024
1 change: 1 addition & 0 deletions .gitignore
@@ -7,6 +7,7 @@ docsite/*/docsTemp/
docsite/*/build/
.swc/
dist/
.idea
# https://yarnpkg.com/advanced/qa#which-files-should-be-gitignored
docsite/.yarn/*
!docsite/.yarn/patches
4 changes: 4 additions & 0 deletions .semversioner/next-release/patch-20240705184142723331.json
@@ -0,0 +1,4 @@
{
"type": "patch",
"description": "Add Minute-based Rate Limiting and fix rpm, tpm settings"
}
13 changes: 11 additions & 2 deletions graphrag/config/create_graphrag_config.py
@@ -112,6 +112,8 @@ def hydrate_llm_params(
proxy=reader.str("proxy") or base.proxy,
model=reader.str("model") or base.model,
max_tokens=reader.int(Fragment.max_tokens) or base.max_tokens,
temperature=reader.float(Fragment.temperature) or base.temperature,
top_p=reader.float(Fragment.top_p) or base.top_p,
model_supports_json=reader.bool(Fragment.model_supports_json)
or base.model_supports_json,
request_timeout=reader.float(Fragment.request_timeout)
@@ -246,6 +248,9 @@ def hydrate_parallelization_params(
type=llm_type,
model=reader.str(Fragment.model) or defs.LLM_MODEL,
max_tokens=reader.int(Fragment.max_tokens) or defs.LLM_MAX_TOKENS,
temperature=reader.float(Fragment.temperature)
or defs.LLM_TEMPERATURE,
top_p=reader.float(Fragment.top_p) or defs.LLM_TOP_P,
model_supports_json=reader.bool(Fragment.model_supports_json),
request_timeout=reader.float(Fragment.request_timeout)
or defs.LLM_REQUEST_TIMEOUT,
@@ -485,6 +490,8 @@ def hydrate_parallelization_params(
reader.envvar_prefix(Section.global_search),
):
global_search_model = GlobalSearchConfig(
temperature=reader.float(Fragment.temperature) or defs.LLM_TEMPERATURE,
top_p=reader.float(Fragment.top_p) or defs.LLM_TOP_P,
max_tokens=reader.int(Fragment.max_tokens)
or defs.GLOBAL_SEARCH_MAX_TOKENS,
data_max_tokens=reader.int("data_max_tokens")
@@ -550,16 +557,18 @@ class Fragment(str, Enum):
max_retries = "MAX_RETRIES"
max_retry_wait = "MAX_RETRY_WAIT"
max_tokens = "MAX_TOKENS"
temperature = "TEMPERATURE"
top_p = "TOP_P"
model = "MODEL"
model_supports_json = "MODEL_SUPPORTS_JSON"
prompt_file = "PROMPT_FILE"
request_timeout = "REQUEST_TIMEOUT"
- rpm = "RPM"
+ rpm = "REQUESTS_PER_MINUTE"
sleep_recommendation = "SLEEP_ON_RATE_LIMIT_RECOMMENDATION"
storage_account_blob_url = "STORAGE_ACCOUNT_BLOB_URL"
thread_count = "THREAD_COUNT"
thread_stagger = "THREAD_STAGGER"
- tpm = "TPM"
+ tpm = "TOKENS_PER_MINUTE"
type = "TYPE"


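Taken together, the renamed Fragment values mean the per-minute limits are now read from *_REQUESTS_PER_MINUTE / *_TOKENS_PER_MINUTE settings rather than *_RPM / *_TPM, and temperature and top_p become first-class LLM settings. A minimal sketch of exercising this through environment variables; the GRAPHRAG_LLM_* names, the no-argument call to create_graphrag_config, and the config.llm attribute access are assumptions inferred from the prefix logic above, not taken from this diff:

import os

from graphrag.config import create_graphrag_config  # assumed public import path

# Assumed variable names: the GRAPHRAG_LLM_ prefix plus the renamed fragments.
os.environ["GRAPHRAG_LLM_REQUESTS_PER_MINUTE"] = "60"
os.environ["GRAPHRAG_LLM_TOKENS_PER_MINUTE"] = "80000"
os.environ["GRAPHRAG_LLM_TEMPERATURE"] = "0"
os.environ["GRAPHRAG_LLM_TOP_P"] = "1"

config = create_graphrag_config()
print(config.llm.requests_per_minute, config.llm.tokens_per_minute)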
2 changes: 2 additions & 0 deletions graphrag/config/defaults.py
@@ -23,6 +23,8 @@
LLM_TYPE = LLMType.OpenAIChat
LLM_MODEL = "gpt-4-turbo-preview"
LLM_MAX_TOKENS = 4000
LLM_TEMPERATURE = 0
LLM_TOP_P = 1
LLM_REQUEST_TIMEOUT = 180.0
LLM_TOKENS_PER_MINUTE = 0
LLM_REQUESTS_PER_MINUTE = 0
8 changes: 8 additions & 0 deletions graphrag/config/models/global_search_config.py
@@ -11,6 +11,14 @@
class GlobalSearchConfig(BaseModel):
"""The default configuration section for Cache."""

temperature: float | None = Field(
description="The temperature to use for token generation.",
default=defs.LLM_TEMPERATURE,
)
top_p: float | None = Field(
description="The top-p value to use for token generation.",
default=defs.LLM_TOP_P,
)
max_tokens: int = Field(
description="The maximum context size in tokens.",
default=defs.GLOBAL_SEARCH_MAX_TOKENS,
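A small sketch of how the new fields behave as pydantic defaults, assuming the values from defaults.py above and that the remaining GlobalSearchConfig fields also carry defaults:

from graphrag.config.models.global_search_config import GlobalSearchConfig

cfg = GlobalSearchConfig()
# Defaults come from LLM_TEMPERATURE / LLM_TOP_P in defaults.py (0 and 1).
assert cfg.temperature == 0
assert cfg.top_p == 1

# Explicit values override the defaults for global search only.
tuned = GlobalSearchConfig(temperature=0.5, top_p=0.9)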
8 changes: 8 additions & 0 deletions graphrag/config/models/llm_parameters.py
@@ -25,6 +25,14 @@ class LLMParameters(BaseModel):
description="The maximum number of tokens to generate.",
default=defs.LLM_MAX_TOKENS,
)
temperature: float | None = Field(
description="The temperature to use for token generation.",
default=defs.LLM_TEMPERATURE,
)
top_p: float | None = Field(
description="The top-p value to use for token generation.",
default=defs.LLM_TOP_P,
)
request_timeout: float = Field(
description="The request timeout to use.", default=defs.LLM_REQUEST_TIMEOUT
)
2 changes: 1 addition & 1 deletion graphrag/index/llm/load_llm.py
@@ -110,7 +110,7 @@ def _load_openai_completion_llm(
"frequency_penalty": config.get("frequency_penalty", 0),
"presence_penalty": config.get("presence_penalty", 0),
"top_p": config.get("top_p", 1),
- "max_tokens": config.get("max_tokens"),
+ "max_tokens": config.get("max_tokens", 4000),
}),
on_error,
cache,
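For context, the lookup above now falls back to 4000 (matching LLM_MAX_TOKENS) instead of None when no max_tokens is configured; a trivial sketch with an illustrative config dict:

config: dict = {}                     # no max_tokens supplied by the caller
config.get("max_tokens")              # None  (previous behavior)
config.get("max_tokens", 4000)        # 4000  (behavior after this change)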
40 changes: 40 additions & 0 deletions graphrag/index/utils/rate_limiter.py
@@ -0,0 +1,40 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""Rate limiter utility."""

import asyncio
import time


class RateLimiter:
    """
    The original TpmRpmLLMLimiter strategy did not account for minute-based rate limiting when scheduled.

    The RateLimiter was introduced to ensure that the CommunityReportsExtractor could be scheduled to adhere to rate configurations on a per-minute basis.
    """

    # TODO: RateLimiter scheduled: using asyncio for async_mode

    def __init__(self, rate: int, per: int):
        self.rate = rate
        self.per = per
        self.allowance = rate
        self.last_check = time.monotonic()

    async def acquire(self):
        """Acquire a token from the rate limiter."""
        current = time.monotonic()
        elapsed = current - self.last_check
        self.last_check = current
        self.allowance += elapsed * (self.rate / self.per)

        if self.allowance > self.rate:
            self.allowance = self.rate

        if self.allowance < 1.0:
            sleep_time = (1.0 - self.allowance) * (self.per / self.rate)
            await asyncio.sleep(sleep_time)
            self.allowance = 0.0
        else:
            self.allowance -= 1.0
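A minimal usage sketch of the limiter above; worker and main are illustrative names. With rate=1, per=60 (the values the report extractor below uses), a coroutine that finds the allowance exhausted sleeps until it has refilled:

import asyncio

from graphrag.index.utils.rate_limiter import RateLimiter


async def worker(limiter: RateLimiter, i: int) -> None:
    # Wait for the shared limiter before doing any (hypothetical) work.
    await limiter.acquire()
    print(f"task {i} proceeding")


async def main() -> None:
    limiter = RateLimiter(rate=1, per=60)
    await asyncio.gather(*(worker(limiter, i) for i in range(3)))


asyncio.run(main())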
@@ -15,6 +15,7 @@
CommunityReportsExtractor,
)
from graphrag.index.llm import load_llm
from graphrag.index.utils.rate_limiter import RateLimiter
from graphrag.index.verbs.graph.report.strategies.typing import (
CommunityReport,
StrategyConfig,
@@ -53,6 +54,8 @@ async def _run_extractor(
args: StrategyConfig,
reporter: VerbCallbacks,
) -> CommunityReport | None:
# RateLimiter
rate_limiter = RateLimiter(rate=1, per=60)
extractor = CommunityReportsExtractor(
llm,
extraction_prompt=args.get("extraction_prompt", None),
@@ -63,6 +66,7 @@
)

try:
await rate_limiter.acquire()
results = await extractor({"input_text": input})
report = results.structured_output
if report is None or len(report.keys()) == 0:
6 changes: 4 additions & 2 deletions graphrag/query/factories.py
@@ -165,11 +165,13 @@ def get_global_search_engine(
max_data_tokens=gs_config.data_max_tokens,
map_llm_params={
"max_tokens": gs_config.map_max_tokens,
- "temperature": 0.0,
+ "temperature": gs_config.temperature,
+ "top_p": gs_config.top_p,
},
reduce_llm_params={
"max_tokens": gs_config.reduce_max_tokens,
- "temperature": 0.0,
+ "temperature": gs_config.temperature,
+ "top_p": gs_config.top_p,
},
allow_general_knowledge=False,
json_mode=False,