## VLLMOpenAI

In [3]:
from langchain_community.llms import VLLMOpenAI

In [None]:
# Completion-style LangChain client for the OpenAI-compatible server on port 6003.
vllm_openai_settings = {
    "openai_api_key": "EMPTY",
    "openai_api_base": "http://0.0.0.0:6003/v1",
    "model_name": "qwen2.5",
    # Extra request parameters forwarded with each call; "." is passed as a stop sequence.
    "model_kwargs": {"stop": ["."]},
}
llm = VLLMOpenAI(**vllm_openai_settings)

# Stream the reply and echo every piece as soon as it arrives (no newline, unbuffered).
for piece in llm.stream("你好"):
    print(piece, end="", flush=True)

## OpenAI

In [None]:
from openai import OpenAI

# Point the official OpenAI client at the local vLLM API server instead of
# api.openai.com; the key is the placeholder value "EMPTY".
openai_api_key = "EMPTY"
openai_api_base = "http://0.0.0.0:6003/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# stream=True makes `create` return an iterator of incremental chunks
# rather than one finished completion.
chat_response = client.chat.completions.create(
    model="qwen2.5",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "为我讲一个睡前小故事"},
    ],
    stream=True
)
for chunk in chat_response:
    # A streamed chunk may carry no choices at all (e.g. a usage-only chunk);
    # indexing choices[0] on it would raise IndexError.
    if not chunk.choices:
        continue
    text = chunk.choices[0].delta.content
    # delta.content is optional: role-only and finish chunks have no text,
    # and printing them unguarded would emit the literal string "None".
    if text:
        print(text, end="", flush=True)

## VLLM

In [None]:
import torch

# Report whether PyTorch can see a usable CUDA device in this kernel.
cuda_available = torch.cuda.is_available()
print(cuda_available)

In [13]:
import gc
import ctypes
import torch
def clean_memory(deep=False):
    """Free memory held by Python's garbage collector and PyTorch's CUDA cache.

    Args:
        deep: When true, additionally call glibc's ``malloc_trim(0)`` so that
            freed heap memory is handed back to the operating system. This
            step is best-effort and is skipped on platforms without
            ``libc.so.6`` / ``malloc_trim``.

    Returns:
        None.
    """
    # Collect unreachable Python objects first so their memory can be released.
    gc.collect()
    if deep:
        try:
            ctypes.CDLL("libc.so.6").malloc_trim(0)
        except (OSError, AttributeError):
            # libc.so.6 cannot be loaded (OSError) or lacks malloc_trim
            # (AttributeError) on non-glibc platforms; the trim is only an
            # optimisation, so skip it instead of aborting the cleanup.
            pass
    # Release cached GPU memory that PyTorch is no longer using.
    torch.cuda.empty_cache()

In [None]:
from langchain_community.llms import VLLM

# Build the model through LangChain's VLLM wrapper from a checkpoint on local disk.
sampling_options = {
    "max_new_tokens": 128,
    "top_k": 10,
    "top_p": 0.95,
    "temperature": 0.8,
}
llm = VLLM(
    model="/root/autodl-fs/modelscope/Qwen2.5-7B-Instruct",
    # Left disabled. NOTE(review): enable only if the checkpoint needs custom model code — confirm.
    trust_remote_code=False,
    **sampling_options,
)

# Smoke-test the model with a single English prompt and print its answer.
halloween_answer = llm.invoke("What are the most popular Halloween Costumes?")
print(halloween_answer)

In [None]:
# Second prompt for the `llm` built in an earlier cell (Chinese for "Hello, who are you?").
self_intro = llm.invoke("你好，你是谁")
print(self_intro)

## ChatOpenAI

In [2]:
from langchain_openai import ChatOpenAI

# Chat-model client for the OpenAI-compatible server on port 6003.
# This rebinds `llm`: the structured-output and tool-calling cells below
# use this ChatOpenAI instance.
chat_client_options = dict(
    base_url="http://0.0.0.0:6003/v1",
    api_key="EMPTY",
    model="qwen2.5",
    temperature=1.0,
)
llm = ChatOpenAI(**chat_client_options)

In [3]:
from pydantic import BaseModel

# Schema for a structured model reply; the docstring below is Chinese for "story outline".
# NOTE(review): the docstring and field names are presumably sent to the model as the
# schema description by `with_structured_output`, so they are left untouched — confirm.
class StructuredOutput(BaseModel):
    """故事大纲"""
    # Title of the story.
    title: str
    # Key points, one string per entry.
    key_points: list[str]
    # Content text of the reply.
    content: str

# Derive a runnable from the chat model that targets the StructuredOutput schema,
# using the function-calling method.
llm_with_structured_output = llm.with_structured_output(
    StructuredOutput,
    method="function_calling",
)

In [4]:
def add(a: int, b: int) -> int:
    """Add two integers.

    Args:
        a: First integer
        b: Second integer
    """
    # NOTE(review): this function is handed to `bind_tools`, which presumably reads the
    # docstring above as the tool description — keep its wording stable.
    total = a + b
    return total

# Tools offered to the chat model; currently only `add`.
tools = [
    add,
]

# Bind the tools to the model. tool_choice="auto" presumably lets the model decide
# whether to call a tool — confirm against the ChatOpenAI docs.
llm_with_tools = llm.bind_tools(
    tools,
    tool_choice="auto",
)

In [5]:
# Ask an arithmetic question through the tool-enabled model.
tool_reply = llm_with_tools.invoke("What is 2 + 2?")
# Bare last expression so the notebook displays the returned message.
tool_reply

AIMessage(content='', additional_kwargs={'tool_calls': [{'id': 'chatcmpl-tool-f223c7c7e65a437e9f091318c02bb580', 'function': {'arguments': '{"a": 2, "b": 2}', 'name': 'add'}, 'type': 'function'}], 'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 192, 'total_tokens': 217, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'qwen2.5', 'system_fingerprint': None, 'id': 'chatcmpl-da14e328d0e148f89bc7de96ea3521c8', 'finish_reason': 'tool_calls', 'logprobs': None}, id='run-d573ff36-380c-4be7-b7d0-d67b91bb9400-0', tool_calls=[{'name': 'add', 'args': {'a': 2, 'b': 2}, 'id': 'chatcmpl-tool-f223c7c7e65a437e9f091318c02bb580', 'type': 'tool_call'}], usage_metadata={'input_tokens': 192, 'output_tokens': 25, 'total_tokens': 217, 'input_token_details': {}, 'output_token_details': {}})

In [6]:
# Plain chat call without tools (prompt is Chinese for "Hello").
greeting_reply = llm.invoke("你好")
# Bare last expression so the notebook displays the returned message.
greeting_reply

AIMessage(content='你好！很高兴为你服务。有什么问题或需要帮助的吗？', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 30, 'total_tokens': 45, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'qwen2.5', 'system_fingerprint': None, 'id': 'chatcmpl-0c6d7e7c89d140899fb895612241cc32', 'finish_reason': 'stop', 'logprobs': None}, id='run-45311403-6d12-4160-9a21-74928d3363ba-0', usage_metadata={'input_tokens': 30, 'output_tokens': 15, 'total_tokens': 45, 'input_token_details': {}, 'output_token_details': {}})