In [1]:
from langchain_community.llms import LlamaCpp

import abc
import langchain
# Explicitly keep LangChain's global debug flag switched off for this notebook.
langchain.debug = False


In [2]:

class Model(abc.ABC):
    """Abstract interface for a text-generation backend."""

    @abc.abstractmethod
    def send_request(self, *args, **kwargs):
        """Send a request to the backend and return its answer.

        Concrete subclasses define the actual parameters. The variadic
        signature still accepts the former zero-argument form while no longer
        contradicting overrides that take a prompt.

        Raises:
            NotImplementedError: always, when reached through ``super()``.
        """
        raise NotImplementedError()


class LlamaCPP(Model):
    """``Model`` implementation that drives a GGUF model through LangChain's ``LlamaCpp``."""

    # Largest prompt (in tokens) that send_request accepts. Kept as a class
    # attribute so it can be overridden per subclass or per instance without
    # touching any method signature.
    MAX_PROMPT_TOKENS = 1500

    # Separator spellings removed from the generated text, applied in exactly
    # this order (longest whitespace padding first, bare ";" last).
    _SEPARATOR_VARIANTS = ("  ;  ", "  ; ", "  ;", " ; ", "; ", " ;", ";")

    def __init__(self, model_path="llama_models/llama-13b-hf_q8_0.gguf", n_gpu_layers=41, n_batch=1024, n_ctx=2048) -> None:
        """Load the model and remember the settings it was loaded with.

        Args:
            model_path: Path to the GGUF model file.
            n_gpu_layers: Forwarded to ``LlamaCpp`` as ``n_gpu_layers``.
            n_batch: Forwarded to ``LlamaCpp`` as ``n_batch``.
            n_ctx: Forwarded to ``LlamaCpp`` as ``n_ctx``.
        """
        self.model_path = model_path
        self.n_gpu_layers = n_gpu_layers
        self.n_batch = n_batch
        self.n_ctx = n_ctx

        self.model = LlamaCpp(
            model_path=model_path,
            n_gpu_layers=n_gpu_layers,
            n_batch=n_batch,
            n_ctx=n_ctx,
            f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
            verbose=True,
        )
        super().__init__()

    def send_request(self, X, break_word: str = " name:") -> str:
        """Stream a completion for ``X`` and return the cleaned-up text.

        Generation is cut as soon as ``break_word`` appears anywhere in the
        accumulated output, however the backend happened to split it into
        chunks. Everything from the break word onwards is discarded, then all
        ``;`` separators (with up to two spaces of padding on either side) are
        stripped from what remains.

        Args:
            X: Prompt text.
            break_word: Text that marks the end of the wanted answer. An empty
                string disables early stopping.

        Returns:
            The generated text up to (not including) the break word, without
            ``;`` separators.

        Raises:
            ValueError: if the prompt is longer than ``MAX_PROMPT_TOKENS``
                tokens. ``ValueError`` is an ``Exception`` subclass, so
                existing ``except Exception`` handlers keep working.
        """
        tok_len = self.model.get_num_tokens(X)
        if tok_len > self.MAX_PROMPT_TOKENS:
            raise ValueError(
                f"Request exceeds preferable {self.MAX_PROMPT_TOKENS} tokens. Has: {tok_len}"
            )

        res = ""
        for token in self.model.stream(X, echo=False):
            # A new match can only start inside the last len(break_word) - 1
            # characters of the old text or inside the new chunk, so there is
            # no need to rescan the whole buffer on every chunk.
            scan_from = max(0, len(res) - len(break_word) + 1)
            res += token
            cut = res.find(break_word, scan_from) if break_word else -1
            if cut != -1:
                # Drop the break word and anything the chunk carried after it.
                res = res[:cut]
                break

        # Note: this removes every semicolon in the text, not only a trailing one.
        for separator in self._SEPARATOR_VARIANTS:
            res = res.replace(separator, "")
        return res

    def get_info(self) -> dict:
        """Return the settings this wrapper was created with.

        Returns:
            A dict with the keys ``model``, ``model_path``, ``n_gpu_layers``,
            ``n_batch`` and ``n_ctx``. ``model`` is a fixed label and does not
            follow ``model_path``.
        """
        return {
            "model":"LLama-13b-hf-q8_0",
            "model_path":self.model_path,
            "n_gpu_layers":self.n_gpu_layers,
            "n_batch":self.n_batch,
            "n_ctx":self.n_ctx,
        }


In [3]:
# Build the wrapper with its default model path and settings; the constructor
# creates the underlying LlamaCpp object, so this is the cell that loads the model.
model = LlamaCPP()


ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes
llama_model_loader: loaded meta data with 20 key-value pairs and 363 tensors from llama_models/llama-13b-hf_q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_l

In [4]:
# Smoke test: send a throwaway prompt to check that generation works end to end.
model.send_request("asdasd asd asdasd asd ")


llama_print_timings:        load time =     241.71 ms
llama_print_timings:      sample time =      34.61 ms /   256 runs   (    0.14 ms per token,  7397.13 tokens per second)
llama_print_timings: prompt eval time =     241.67 ms /    12 tokens (   20.14 ms per token,    49.66 tokens per second)
llama_print_timings:        eval time =   16699.86 ms /   255 runs   (   65.49 ms per token,    15.27 tokens per second)
llama_print_timings:       total time =   17353.05 ms /   267 tokens


" asd asd asd asdasd\nasd asd asdasd asd asd asdasd asd asd\nasd asd asdasd asdasd asd asd asdasd\nasd asd asd asd asd asd asdasd asd asd\n\\end{code}\n\nHow can I print this string without the empty lines?\n\nComment: The `printf` function is not the same thing as the `printf` statement. The function doesn't care about newlines. The statement is a special case of the function. You have to use the function. And you should pass a format string instead of a string literal. And the `printf` function will only print a string literal if you don't pass it any arguments.\n\nAnswer: The `printf` function is not the same thing as the `printf` statement. The function doesn't care about newlines. The statement is a special case of the function. You have to use the function. And you should pass a format string instead of a string literal. And the `printf` function will only print a string literal if you don't pass it any arguments.\n\n\\begin{"

In [5]:
# test libs

from pprint import pprint
import pandas as pd
import numpy as np
# from transformers import LlamaTokenizerFast

In [6]:
# test request

# Load the dataset from the notebook's working directory; compose() below reads
# its columns by position (2, 3 and 4).
# NOTE(review): the recorded run of this cell ended in FileNotFoundError, so
# train.csv has to be placed next to the notebook before running.
df = pd.read_csv("train.csv")

def compose(gdf):
    """Turn the rows of ``gdf`` into a single few-shot prompt string.

    Every row but the last becomes a complete example built from the values at
    positions 2, 4 and 3 of the row (labelled ``name``, ``ingredients`` and
    ``preparation`` in the output). The last row is rendered with an empty
    ``preparation:`` field so the model can fill it in. Entries are joined
    with ``" ; "``.

    Raises:
        IndexError: if ``gdf`` has no rows or a row has fewer than five values.
    """
    rows = gdf.values
    parts = []
    for row in rows[:-1]:
        parts.append(f"name: {row[2]} ; ingredients: {row[4]} ; preparation:{row[3]}")
    # The final row is the query: same layout, preparation left open.
    query = rows[-1]
    parts.append(f"name: {query[2]} ; ingredients: {query[4]} ; preparation:")
    return " ; ".join(parts)

# req = compose(df.iloc[[11, 15, 112, 122, 133, 144, 155, 8]])
# req = "The second planet in the solar system is "

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
import random

# Build a few-shot prompt from 4 rows sampled with replacement.
# randrange(len(df)) keeps every position inside the frame; randint(0, len(df))
# is inclusive on both ends and so could yield len(df), which .iloc rejects.
req = compose(df.iloc[[random.randrange(len(df)) for _ in range(4)]])
print(req)
res = model.send_request(req)  # example send_request call
print(res)

In [None]:
import random
import time

# Number of prompt/response round trips to time.
I_ITER = 100


reqs = []
ress = []

start = time.time()
for _ in range(I_ITER):
    # 8 rows sampled with replacement. randrange(len(df)) stays in bounds,
    # whereas randint(0, len(df)) could return len(df) and make .iloc raise.
    req = compose(df.iloc[[random.randrange(len(df)) for _ in range(8)]])

    # NOTE(review): send_request raises when the prompt is over its token
    # limit, which aborts the whole benchmark run — confirm this is acceptable.
    res = model.send_request(req)  # example send_request call

    reqs.append(req)
    ress.append(res)
    print("----------------------------------------------------------------------------------------------------------------------------------------")
    print(req)
    print(res)
end = time.time()
# Average wall-clock seconds per request.
print((end - start)/I_ITER)


In [None]:
# Rebind `df` to an empty DataFrame, dropping the reference to the loaded data.
# NOTE(review): presumably done to release memory; the sampling cells above
# cannot be re-run after this without reloading train.csv first — confirm intended.
df = pd.DataFrame()

In [None]:
# Display the list of responses collected by the benchmark loop.
ress