# LangChain: LLM Class for Open Llama server

This is an example of how to create an LLM class for our own Open Llama server.

**Note:**
This assumes the server is running the example API (included in this same code repository) on our own EC2 instance.

In [None]:
import os
from typing import Any, List, Mapping, Optional

import requests
from langchain.llms import OpenAI
from langchain.llms.base import LLM
from langchain.chains import LLMChain
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.prompts import PromptTemplate

from lchain.debug_handler import DebugCallbackHandler
from lchain.utils import read_openai_key

import logging

# Set the root logger to DEBUG so that the logger.debug(...) calls made in
# this notebook are emitted. Note this applies to the root logger, so debug
# output from imported libraries becomes visible as well.
logging.basicConfig(level=logging.DEBUG)
# Module-level logger used by the code cells below.
logger = logging.getLogger(__name__)

In [None]:
# Presumably loads the OpenAI API key into the environment — TODO confirm
# against lchain.utils; its return value is not used here.
read_openai_key()
# Callback handler instance; it is passed to the LLM via `callbacks=[...]`
# when the LLM is instantiated later in this notebook.
debug_handler = DebugCallbackHandler()

# Open Llama LLM Class

In [None]:
from app.api import CompletionRequest

# Local test: default server address and model name used by this notebook.
HOST = "127.0.0.1"
PORT = 8000
MODEL_NAME = 'fake'

# Uncomment these lines and use your own server's IP address
# HOST = '10.101.102.103'
# PORT = 80
# MODEL_NAME = 'llama_7b'

# Completions endpoint built from HOST and PORT; used as the default `url`
# of OpenLlamaLLM.
URL = f"http://{HOST}:{PORT}/v1/completions/"

class OpenLlamaLLM(LLM):
    """LangChain LLM wrapper around a self-hosted Open Llama completion server.

    Every call POSTs a completion request (see ``_completion_request``) as
    JSON to ``url`` and returns the ``text`` of the first entry in the
    ``choices`` list of the JSON response.
    """

    model_name: str = MODEL_NAME
    """Model name sent to the server in the ``model`` field."""
    url: str = URL
    """Server URL"""
    temperature: float = 0.7
    """What sampling temperature to use."""
    max_tokens: int = 256
    """The maximum number of tokens to generate in the completion.
    NOTE(review): -1 presumably means "as many tokens as possible given the
    prompt and the model's maximal context size" - confirm against the server."""
    top_p: float = 1
    """Nucleus sampling: total probability mass of tokens to consider at each
    step (sent to the server as ``top_p``)."""
    n: int = 1
    """How many completions to request for each prompt. Only the first one is
    returned by ``_call``."""
    request_timeout: int = 120
    """The maximum number of seconds to wait for the server to respond."""
    max_retries: int = 3
    """The maximum number of retries before giving up."""

    @property
    def _llm_type(self) -> str:
        """Return the identifier LangChain uses for this kind of LLM."""
        return "open_llama_fastapi_server"

    def _completion_request(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
    ) -> CompletionRequest:
        """Build the JSON body of a completion request from the current parameters.

        NOTE(review): the value returned is a plain dict, not a
        ``CompletionRequest`` instance; its keys presumably mirror the fields
        of ``app.api.CompletionRequest`` - confirm and adjust the annotation.

        Args:
            prompt: The text to complete.
            stop: Optional stop sequences. ``None`` is sent as an empty list.

        Returns:
            The request body to be JSON-encoded and posted to the server.
        """
        return {
            'model': self.model_name,
            'prompt': prompt,
            'suffix': '',
            'max_tokens': self.max_tokens,
            'temperature': self.temperature,
            'top_p': self.top_p,
            'n': self.n,
            'logprobs': 1,
            'echo': False,
            # Copy so the caller's list is never shared with the request body.
            'stop': list(stop) if stop else [],
            'presence_penalty': 0,
            'frequency_penalty': 0,
            'best_of': 1,
            'logit_bias': {},
        }

    def _post_with_retries(self, payload: Mapping[str, Any]) -> "requests.Response":
        """POST ``payload`` as JSON to ``url``, retrying on connection errors.

        Up to ``max_retries`` additional attempts are made when the connection
        itself fails. Read timeouts and HTTP error statuses are not retried
        here, since the server may already be processing the request.

        Raises:
            requests.exceptions.ConnectionError: If every attempt failed to
                connect; the error of the last attempt is re-raised.
        """
        attempts = max(self.max_retries, 0) + 1
        for attempt in range(1, attempts + 1):
            try:
                return requests.post(
                    url=self.url,
                    json=payload,
                    timeout=self.request_timeout,
                )
            except requests.exceptions.ConnectionError:
                if attempt == attempts:
                    raise
                logger.warning(
                    "OpenLlamaLLM request failed (attempt %d of %d), retrying",
                    attempt,
                    attempts,
                )

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        """Send ``prompt`` to the server and return the first completion.

        Args:
            prompt: The text to complete.
            stop: Optional stop sequences; forwarded to the server in the
                ``stop`` field of the request.
            run_manager: Optional LangChain callback manager for this run.

        Returns:
            The ``text`` of the first choice in the server's response.

        Raises:
            requests.exceptions.HTTPError: If the server answers with a
                4xx/5xx status code.
            requests.exceptions.RequestException: On connection failures
                (after retries) or timeouts.
        """
        print(f"OpenLlamaLLM prompt (len:{len(prompt)}): '{prompt}'")
        if stop is not None:
            print(f"OpenLlamaLLM, stop kwargs: {stop}")
        compl_req = self._completion_request(prompt, stop=stop)
        if run_manager:
            # NOTE(review): this reports the *prompt* through the new-token
            # callback, not generated text - kept as-is; confirm it is intended.
            run_manager.on_llm_new_token(token=prompt)
        http_response = self._post_with_retries(compl_req)
        logger.debug("OpenLlamaLLM response: %s", http_response)
        # Fail loudly on HTTP errors instead of a confusing KeyError below.
        http_response.raise_for_status()
        payload = http_response.json()
        # Get first response
        return payload['choices'][0]['text']

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Get the identifying parameters."""
        return {
            "url": self.url,
            'model': self.model_name,
            'max_tokens': self.max_tokens,
            'temperature': self.temperature,
            'top_p': self.top_p,
            'n': self.n,
            'max_retries': self.max_retries,
            'request_timeout': self.request_timeout,
        }
    
# Create the LLM with the debug callback handler attached, then send a first
# prompt to the server configured above (requires the server to be running).
llm = OpenLlamaLLM(callbacks=[debug_handler])
llm("Code a python function to add two numbers.")

## Usage example

In [None]:
# Call the LLM object directly with a prompt string; the call returns the
# completion text produced by the server.
llm("Hello, my name is ")