In [13]:
from typing import Any
from langchain_core.language_models.llms import BaseLLM
from langchain_community.llms.ollama import Ollama
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

def load_llm_hf(model_id: str, task: str, **kwargs) -> BaseLLM:
    """
    Build a LangChain LLM backed by a Hugging Face transformers pipeline.

    Args:
        model_id (str): Identifier handed to ``from_pretrained`` for both the
            tokenizer and the causal language model.
        task (str): Task name handed to ``transformers.pipeline``.
        kwargs (Dict[str, Any]): Extra keyword arguments forwarded unchanged
            to ``transformers.pipeline``.

    Returns:
        BaseLLM: A ``HuggingFacePipeline`` wrapping the constructed pipeline.
    """
    # Tokenizer first, then the model weights, both resolved from the same id.
    hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
    causal_lm = AutoModelForCausalLM.from_pretrained(model_id)

    # Combine them into a transformers pipeline for the requested task.
    hf_pipeline = pipeline(
        task=task,
        model=causal_lm,
        tokenizer=hf_tokenizer,
        **kwargs
    )

    # Expose the pipeline through LangChain's LLM interface.
    return HuggingFacePipeline(pipeline=hf_pipeline)

def load_llm_ollama(model_name: str, base_url: str, **kwargs) -> BaseLLM:
    """
    Create a LangChain LLM client for a model served by Ollama.

    Args:
        model_name (str): Name of the Ollama model to use.
        base_url (str): Base URL handed to the ``Ollama`` client.
        kwargs (Dict[str, Any]): Extra keyword arguments forwarded unchanged
            to the ``Ollama`` constructor.

    Returns:
        BaseLLM: The constructed ``Ollama`` instance.

    Raises:
        ValueError: If constructing the client raises any exception; the
            original exception is chained as the cause.
    """
    try:
        # Success path returns straight from the guarded constructor call.
        return Ollama(model=model_name, base_url=base_url, **kwargs)
    except Exception as e:
        raise ValueError(f"Error loading model {model_name}: {e}") from e
    
async def generate_response(prompt: str, llm: BaseLLM) -> Any:
    """
    Generate a complete (non-streaming) response from a language model.

    The model is called through its asynchronous ``ainvoke`` method so that
    awaiting this coroutine does not block the running event loop while the
    model is generating.

    Args:
        prompt (str): The user prompt.
        llm (BaseLLM): The loaded language model.

    Returns:
        Any: Whatever ``llm.ainvoke(prompt)`` returns.

    Raises:
        ValueError: If the model call raises any exception; the original
            exception is chained as the cause.
    """
    try:
        # A plain ``llm.invoke`` here would run synchronously inside the
        # coroutine and stall every other task on the loop.
        return await llm.ainvoke(prompt)
    except Exception as e:
        raise ValueError(f"Error generating: {str(e)}") from e


async def generate_streaming_response(prompt: str, llm: BaseLLM) -> Any:
    """
    Stream a response from a language model chunk by chunk.

    This is an async generator: consume it with ``async for``. Chunks are
    pulled from the model's asynchronous ``astream`` method so the event loop
    stays free while waiting for the next chunk.

    Args:
        prompt (str): The LLM prompt.
        llm (BaseLLM): The loaded language model.

    Yields:
        Any: Each chunk produced by ``llm.astream(prompt)``, in order.

    Raises:
        ValueError: If streaming raises any exception; the original exception
            is chained as the cause.
    """
    try:
        # Iterating the synchronous ``llm.stream`` here would block the loop
        # between chunks; ``astream`` yields control while waiting.
        async for chunk in llm.astream(prompt):
            yield chunk
    except Exception as e:
        raise ValueError(f"Error generating: {str(e)}") from e



In [11]:
model_name = "llama3:instruct"
BASE_URL = "http://localhost:11434"
prompt = "What is the capital of France?"

llm = load_llm_ollama(model_name, base_url=BASE_URL)

response = await generate_response(prompt, llm)

print(response)


The capital of France is Paris.


In [None]:
from typing import Optional, Dict, Any, List
from pydantic import BaseModel

class EmbeddingsRequest(BaseModel):
    """Request schema with a model identifier and an optional query string."""
    # Model identifier; defaults to the all-MiniLM-L6-v2 sentence-transformers model.
    model: str = "sentence-transformers/all-MiniLM-L6-v2"
    # Optional, defaults to None. Presumably the text to embed — confirm against the consumer.
    query: Optional[str] = None

class EmbeddingsResponse(BaseModel):
    """Response schema carrying a flat list of floats."""
    # Required. Presumably a single embedding vector — confirm against the producer.
    embeddings: List[float]

class GenerateRequest(BaseModel):
    """Request schema with a model name and an optional prompt string."""
    # Model name; defaults to "llama3".
    model: str = "llama3"
    # Optional, defaults to None. Presumably the text sent to the model — confirm against the consumer.
    prompt: Optional[str] = None

class GenerateResponse(BaseModel):
    """Response schema carrying a single required string."""
    # Required. Presumably the generated text — confirm against the producer.
    response: str

In [21]:
import asyncio

def main():
    model_name = "llama3:instruct"
    BASE_URL = "http://localhost:11434"
    prompt = "What is the capital of France?"

    llm = load_llm_ollama(model_name, base_url=BASE_URL)

    response = generate_streaming_response(prompt, llm)

    # Process the streaming response
    async for response_chunk in response:
        print(response_chunk)


# Run the main function
asyncio.run(main())



SyntaxError: 'async for' outside async function (1359228001.py, line 13)

In [None]:
async def main(request: ModelRequest):
    """
    Main endpoint to generate text using a language model.

    NOTE(review): ``ModelRequest``, ``StreamingResponse``, ``JSONResponse``,
    ``datetime`` and ``time`` are not defined or imported in the visible
    cells; they must be in scope before this cell runs. This definition also
    reuses the name ``main`` from the streaming demo cell and shadows it.

    Args:
        request (ModelRequest): The request; this function reads its
            ``model``, ``prompt``, ``options`` and ``stream`` attributes.

    Returns:
        Any: A ``StreamingResponse`` over the streamed chunks when
        ``request.stream`` is truthy, otherwise a ``JSONResponse`` with the
        model name, creation timestamp, generated response and elapsed time.
    """
    model_name = request.model
    # ``load_llm_ollama`` takes ``base_url`` as its second positional
    # parameter, so the options must not be passed positionally. The base URL
    # comes from the notebook-level ``BASE_URL`` constant.
    # Assumes ``request.options`` is a dict of keyword arguments or None —
    # TODO confirm against the ModelRequest schema.
    llm = load_llm_ollama(model_name, base_url=BASE_URL, **(request.options or {}))
    if request.stream:
        return StreamingResponse(generate_streaming_response(request.prompt, llm), media_type="text/event-stream")
    else:
        created_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed by wall-clock adjustments during generation.
        start = time.perf_counter()
        response = await generate_response(request.prompt, llm)
        end = time.perf_counter()
        return JSONResponse(
            content={
                "model": model_name,
                "created_at": created_at,
                "response": response,
                "total duration": end - start,
            })
        
