In [None]:
from llama_cpp import Llama

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = Llama(
  model_path="/teamspace/studios/this_studio/mistral-7b-gguf/mistral-7b-instruct-v0.2.Q4_K_M.gguf",  # Download the model file first
  n_ctx=32768,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=8,            # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=0         # The number of layers to offload to GPU, if you have GPU acceleration available
)

# Simple inference example
output = llm(
  "<s>[INST] {prompt} [/INST]", # Prompt
  max_tokens=512,  # Generate up to 512 tokens
  stop=["</s>"],   # Example stop token - not necessarily correct for this specific model! Please check before using.
  echo=True        # Whether to echo the prompt
)

# Chat Completion API

llm = Llama(model_path="./mistral-7b-instruct-v0.2.Q4_K_M.gguf", chat_format="llama-2")  # Set chat_format according to the model you are using
llm.create_chat_completion(
    messages = [
        {"role": "system", "content": "You are a story writing assistant."},
        {
            "role": "user",
            "content": "Write a story about llamas."
        }
    ]
)


In [None]:
!huggingface-cli download TheBloke/TinyLlama-1.1B-intermediate-step-1431k-3T-GGUF tinyllama-1.1b-intermediate-step-1431k-3t.Q4_K_M.gguf --local-dir . --local-dir-use-symlinks False


In [None]:
from llama_cpp import Llama

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = Llama(
  model_path="/teamspace/studios/this_studio/mistral-7b-gguf/tinyllama-1.1b-intermediate-step-1431k-3t.Q4_K_M.gguf",  # Download the model file first
  n_ctx=2048,  # The max sequence length to use - note that longer sequence lengths require much more resources
  n_threads=8,            # The number of CPU threads to use, tailor to your system and the resulting performance
  n_gpu_layers=0         # The number of layers to offload to GPU, if you have GPU acceleration available
)

# Simple inference example
output = llm(
#   "{prompt}", # Prompt
  prompt="Write a story about llamas.",
  max_tokens=512,  # Generate up to 512 tokens
  stop=["</s>"],   # Example stop token - not necessarily correct for this specific model! Please check before using.
  echo=True        # Whether to echo the prompt
)

# Chat Completion API

# llm = Llama(model_path="./tinyllama-1.1b-intermediate-step-1431k-3t.Q4_K_M.gguf", chat_format="llama-2")  # Set chat_format according to the model you are using
# llm.create_chat_completion(
#     messages = [
#         {"role": "system", "content": "You are a story writing assistant."},
#         {
#             "role": "user",
#             "content": "Write a story about llamas."
#         }
#     ]
# )


In [1]:
from langchain_community.llms import CTransformers
llm = CTransformers(model='/teamspace/studios/this_studio/mistral-7b-gguf/tinyllama-1.1b-intermediate-step-1431k-3t.Q4_K_M.gguf')



In [2]:
llm.invoke("what is qlora?")

'\nI think I might have seen that on the web site, but I don\'t know all there\'s to it.\nThe 1001 version can still be used for SX815/SX816 and the newer devices will support the 2001 version.\nWhat is this RW? Can I get some help with this?\nIt is a "register write" which basically tells the camera that you have opened or closed the shutter, and what value it was. The value itself (i.e., your password) is not sent to the camera, but only the register number.\nI\'ve had some problems in the past with the 1001 version.\nThere\'s a problem with the 2001 version though because your password has to be encoded using hexadecimal (instead of base 64). The 2001 version only supports up to 16 characters, and you can only have 8 bits for each character in your encoded data (one character plus its 7 bits of value), so the 2001 version will not work with passwords that are longer than that.\nOn the other hand, I haven'

In [3]:
for text in llm("AI is going to", stream=True):
    print(text, end="", flush=True)


 have an enormous impact on the future of the industry.
This year, AI was all over the show floor and there are a number of companies that are using AI in their products. Here's a quick look at some of them:
Fulcrum is an automated, artificial intelligence-powered software solution that helps manufacturers improve productivity and efficiency by automating processes, removing errors, and freeing up employees to focus on more complex tasks.
Kapil Khera, co-founder and CEO of Fulcrum, told Inman: "I don't know a single company that doesn't have AI in their product. I think it's the most important thing to come out of 2018."
The company is currently working on machine learning tools and workflow applications with machine intelligence technology so you can really begin building these powerful AI-powered solutions. The full AI-focused ecosystem, with automation in product design and sophisticated analytics, will be available to all customers by the end of 2018.
Kapil Khera, co-founder and CE

# results on the cpu using ctransformers
### - 27.4 seconds
### output:
<p>
have an enormous impact on the future of the industry.
This year, AI was all over the show floor and there are a number of companies that are using AI in their products. Here's a quick look at some of them:
Fulcrum is an automated, artificial intelligence-powered software solution that helps manufacturers improve productivity and efficiency by automating processes, removing errors, and freeing up employees to focus on more complex tasks.
Kapil Khera, co-founder and CEO of Fulcrum, told Inman: "I don't know a single company that doesn't have AI in their product. I think it's the most important thing to come out of 2018."
The company is currently working on machine learning tools and workflow applications with machine intelligence technology so you can really begin building these powerful AI-powered solutions. The full AI-focused ecosystem, with automation in product design and sophisticated analytics, will be available to all customers by the end of 2018.
Kapil Khera, co-founder and CEO of Fulcrum
Surely
</p>

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)


In [11]:

def get_response_tiny_llama(text):
    path='/teamspace/studios/this_studio/TinyLlama-1.1B-Chat-v1.0-GPTQ'
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                             device_map="auto",
                                             trust_remote_code=False,
                                             revision="main")

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
    prompt = text
    system_message = "You are a story writing assistant"
    prompt_template=f'''<|system|>
    {system_message}</s>
    <|user|>
    {prompt}</s>
    <|assistant|>
    '''

    print("\n\n*** Generate:")

    input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
    output = model.generate(inputs=input_ids, temperature=0.7, do_sample=True, top_p=0.95, top_k=40, max_new_tokens=512)
    return(tokenizer.decode(output[0]))

In [13]:
print(get_response_tiny_llama("what is qlora?"))





*** Generate:
<s> <|system|>
    You are a story writing assistant</s> 
    <|user|>
    what is qlora?</s> 
    <|assistant|>
    qlora is a cloud-based platform for writing and publishing short stories. It allows users to create and publish their stories, edit and format their work, and share them with the world. Qlora is a free and open-source platform that is designed to be user-friendly and intuitive, making it easy for anyone to write and publish stories.</s>


# result on gpu 2.8 seconds