4 changes: 2 additions & 2 deletions DOCUMENT.md
@@ -15,15 +15,15 @@ from llmlingua import PromptCompressor
llm_lingua = PromptCompressor(
model_name: str = "NousResearch/Llama-2-7b-hf",
device_map: str = "cuda",
use_auth_token: bool = False,
model_config: dict = {},
open_api_config: dict = {},
)
```
### Parameters

- model_name(str), the name of the small language model from Hugging Face. Default set to "NousResearch/Llama-2-7b-hf";
- device_map(str), the device environment for running the small model, such as 'cuda', 'cpu', 'balanced', 'balanced_low_0', 'auto'. Default set to "cuda";
- use_auth_token(bool, optional), controls the use of the Hugging Face auth token. Default set to False;
- model_config(dict, optional), the config of the Hugging Face model. Default set to {};
- open_api_config(dict, optional), the config of OpenAI, used by OpenAI Embedding in coarse-level prompt compression. Default set to {};

## Function Call
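For reference, a minimal sketch of how the updated constructor is called after this change: the Hugging Face auth token, formerly the dedicated `use_auth_token` parameter, now travels inside `model_config`, which is forwarded to `from_pretrained` (the exact kwargs shown are illustrative, not part of this diff):

```python
from llmlingua import PromptCompressor

# Sketch: auth and loading options now ride along in model_config,
# which load_model splats into AutoModelForCausalLM.from_pretrained.
llm_lingua = PromptCompressor(
    model_name="NousResearch/Llama-2-7b-hf",
    device_map="cuda",
    model_config={"use_auth_token": True},  # replaces use_auth_token=True
)
```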
4 changes: 4 additions & 0 deletions README.md
@@ -106,6 +106,10 @@ compressed_prompt = llm_lingua.compress_prompt(prompt, instruction="", question=
# 'compressed_tokens': 211,
# 'ratio': '11.2x',
# 'saving': ', Saving $0.1 in GPT-4.'}

## Or use the quantized model, like TheBloke/Llama-2-7b-Chat-GPTQ, which only needs <8GB of GPU memory.
## Before that, you need to run: pip install optimum auto-gptq
llm_lingua = PromptCompressor("TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"})
```

You can refer to the [**examples**](./examples) to understand how to use **LLMLingua** and **LongLLMLingua** in practical scenarios, such as RAG, Online Meeting, CoT, Code, and RAG using LlamaIndex. Additionally, you can refer to the [**document**](./DOCUMENT.md) for more recommendations on how to use LLMLingua effectively.
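Putting the new README snippet together, a hedged end-to-end sketch of compressing a prompt with the quantized checkpoint (it assumes `optimum` and `auto-gptq` are installed and that `prompt` already holds the text to compress):

```python
from llmlingua import PromptCompressor

# Quantized checkpoint pinned to the "main" revision, as in the README.
llm_lingua = PromptCompressor(
    "TheBloke/Llama-2-7b-Chat-GPTQ", model_config={"revision": "main"}
)
# `prompt` is assumed to be defined; result keys mirror the sample output above.
compressed = llm_lingua.compress_prompt(prompt, instruction="", question="")
print(compressed["ratio"])
```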
30 changes: 17 additions & 13 deletions llmlingua/prompt_compressor.py
@@ -18,36 +18,41 @@ def __init__(
self,
model_name: str = "NousResearch/Llama-2-7b-hf",
device_map: str = "cuda",
use_auth_token: bool = False,
model_config: dict = {},
open_api_config: dict = {},
):
self.load_model(model_name, device_map, use_auth_token)
self.load_model(model_name, device_map, model_config)
self.retrieval_model = None
self.retrieval_model_name = None
self.open_api_config = open_api_config
self.cache_bos_num = 10
self.prefix_bos_num = 100

def load_model(
self, model_name: str, device_map: str = "cuda", use_auth_token: bool = False
self, model_name: str, device_map: str = "cuda", model_config: dict = {}
):
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token_id = (
config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id
)
trust_remote_code = model_config.get("trust_remote_code", True)
if "trust_remote_code" not in model_config:
model_config["trust_remote_code"] = trust_remote_code
config = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=trust_remote_code)
if model_config.get("pad_to_left", True):
tokenizer.padding_side = "left"
tokenizer.pad_token_id = (
config.pad_token_id if config.pad_token_id else tokenizer.eos_token_id
)
self.device = (
device_map if any(key in device_map for key in ["cuda", "cpu"]) else "cuda"
)
if "cuda" in device_map or "cpu" in device_map:
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype="auto" if device_map == "cuda" else torch.float32,
device_map=device_map,
config=config,
ignore_mismatched_sizes=True,
trust_remote_code=True,
).to(device_map)
**model_config
)
else:
model = AutoModelForCausalLM.from_pretrained(
model_name,
@@ -57,8 +62,7 @@ def load_model(
offload_folder="/tmp/offload",
offload_state_dict=True,
cache_dir="/tmp/cache",
use_auth_token=use_auth_token,
trust_remote_code=True,
**model_config
)
self.tokenizer = tokenizer
self.model = model
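To illustrate the refactor's kwargs-forwarding pattern in isolation: defaults such as `trust_remote_code` are injected into the caller's dict only when absent, and the whole dict is then splatted into `from_pretrained`. A self-contained sketch, with function and model names that are illustrative rather than part of the library:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_with_config(model_name: str, model_config: dict):
    # Fill in the default only if the caller did not set it,
    # mirroring the trust_remote_code handling in load_model.
    model_config.setdefault("trust_remote_code", True)
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, trust_remote_code=model_config["trust_remote_code"]
    )
    model = AutoModelForCausalLM.from_pretrained(model_name, **model_config)
    return tokenizer, model

# e.g. load_with_config("TheBloke/Llama-2-7b-Chat-GPTQ", {"revision": "main"})
```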
2 changes: 1 addition & 1 deletion llmlingua/version.py
@@ -2,7 +2,7 @@
_MINOR = "1"
# On master and in a nightly release the patch should be one ahead of the last
# released build.
_PATCH = "3"
_PATCH = "4"
# This is mainly for nightly builds which have the suffix ".dev$DATE". See
# https://semver.org/#is-v123-a-semantic-version for the semantics.
_SUFFIX = ""