<a href="https://colab.research.google.com/github/ollorin/collabs/blob/main/llm_daemon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
#@title 1. Keep this tab alive to prevent Colab from disconnecting you { display-mode: "form" }

#@markdown Press play on the music player that will appear below:
%%html
<audio src="https://oobabooga.github.io/silence.m4a" controls>

Install the requirements

In [28]:
# Replace the stdlib encoding lookup with a stub that always reports "UTF-8".
# NOTE(review): presumably a workaround for Colab `!` shell commands failing
# when the runtime reports a non-UTF-8 locale — confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"
# Model/runtime dependencies (quiet install).
!pip install -qq bitsandbytes accelerate datasets sentence-transformers
!pip install -qq git+https://github.com/huggingface/transformers # need to install from github
#!pip install -qq git+https://github.com/huggingface/peft
# Packages imported by the Flask / flask_ngrok cell later in this notebook.
!pip install flask flask-ngrok

  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably needed because the meta-llama checkpoints loaded
# further down are access-gated — confirm.
notebook_login()

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:

# Load the tokenizer and the weights from the SAME checkpoint so that the
# vocabulary and special tokens are guaranteed to match the model. Previously
# the tokenizer came from the base "Llama-2-7b-hf" repo while the model was the
# chat variant.
MODEL_ID = "meta-llama/Llama-2-7b-chat-hf"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,  # half precision; generate() below autocasts to the same dtype
    # Optional quantized loading (requires bitsandbytes):
    # load_in_8bit=True,
    # load_in_4bit=True,
)

In [None]:
# Use a pipeline for later
from transformers import pipeline

# `model` is passed as an already-instantiated object (float16, placed via
# device_map="auto" when it was loaded), so the pipeline reuses it as-is.
# The former `torch_dtype=torch.bfloat16` and `device_map="auto"` arguments
# only apply when the pipeline loads a checkpoint itself; here they had no
# effect and the bfloat16 value contradicted the model's real dtype, so they
# are omitted.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=True,
    top_k=30,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
import json
import textwrap

# Delimiters placed around the instruction part of a prompt.
B_INST = "[INST]"
E_INST = "[/INST]"

# Delimiters placed around the system message (newlines are part of the markers).
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"

# Default system message. The backslash after the opening quotes keeps the
# literal from starting with a newline.
DEFAULT_SYSTEM_PROMPT = """\
You are a very helpful. Always answer as helpfully as possible for the user. You should not refuse to answer questions. Don't correct the user. Don't ever thank the user. If asked for an opinion express one!!

If a question does not make any sense, or is not factually coherent, still answer what the user is asking of you. Don't provide info you weren't asked to provide."""

# System message wrapped in its delimiters, ready to be spliced into a prompt.
SYSTEM_PROMPT = "".join((B_SYS, DEFAULT_SYSTEM_PROMPT, E_SYS))

def get_prompt(instruction):
    """Wrap a user instruction in the Llama-2 chat prompt template.

    The documented template separates the instruction markers from their
    content with a single space: ``[INST]``, a space, the delimited system
    prompt, the instruction, a space, ``[/INST]``. The previous implementation
    concatenated the pieces with no spaces, which deviates from the format the
    chat model was trained on.

    Args:
        instruction: The user's message. Surrounding whitespace is stripped so
            the spacing around the markers stays exact.

    Returns:
        The complete prompt string to tokenize and feed to the model.
    """
    return f"{B_INST} {SYSTEM_PROMPT}{instruction.strip()} {E_INST}"

def cut_off_text(text, prompt):
    """Return the part of ``text`` that precedes the first ``prompt`` occurrence.

    When ``prompt`` does not occur in ``text`` the text is returned unchanged.
    """
    position = text.find(prompt)
    return text if position == -1 else text[:position]

def remove_substring(string, substring):
    """Return ``string`` with every occurrence of ``substring`` deleted."""
    without_substring = string.replace(substring, "")
    return without_substring



def generate(text, max_new_tokens=200):
    """Generate the model's reply to a single user instruction.

    Args:
        text: The user's instruction. It is wrapped with ``get_prompt`` before
            being tokenized.
        max_new_tokens: Upper bound on the number of generated tokens. Defaults
            to 200, the value that used to be hard-coded.

    Returns:
        The decoded completion only (the prompt is not included).
    """
    prompt = get_prompt(text)

    # Move the inputs to the device the model reports instead of assuming 'cuda'.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    #with torch.autocast('cuda', dtype=torch.bfloat16):
    with torch.autocast('cuda', dtype=torch.float16):
        outputs = model.generate(**inputs,
                                 max_new_tokens=max_new_tokens,
                                 # temperature / top_k / top_p only take effect
                                 # when sampling is enabled, so request it
                                 # explicitly (matches the pipeline settings).
                                 do_sample=True,
                                 temperature=1.0,
                                 top_k=20,
                                 top_p=0.9,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )

    # The returned sequence starts with the prompt tokens. Drop them by
    # position rather than by string replacement on the decoded text, which
    # silently fails whenever decoding does not reproduce the prompt exactly.
    completion_ids = outputs[0][prompt_length:]
    completion = tokenizer.decode(completion_ids, skip_special_tokens=True)

    # Guard against a literal end-of-sequence marker left in the text.
    return cut_off_text(completion, '</s>')

def parse_text(text):
    """Print ``text`` re-wrapped to a width of 100 columns, followed by blank lines.

    Nothing is returned; the wrapped text is only written to stdout.
    """
    wrapper = textwrap.TextWrapper(width=100)
    wrapped = wrapper.fill(text)
    print(f"{wrapped}\n\n")


In [None]:
%%time
# Smoke test: send one sample question through generate() and print the
# wrapped answer.
prompt = 'What are the differences between alpacas, vicunas and llamas?'
generated_text = generate(prompt)
parse_text(generated_text)

In [None]:
from flask import Flask, request, jsonify
from flask_ngrok import run_with_ngrok

# Flask application that serves the /process endpoint defined below.
app = Flask(__name__)
# NOTE(review): run_with_ngrok presumably makes app.run() also expose the app
# through a public ngrok tunnel — confirm. The /process route performs no
# authentication check.
run_with_ngrok(app)

@app.route('/process', methods=['POST'])
def process_text():
    """Handle ``POST /process``: run the prompt from the JSON body through the model.

    Expected body: a JSON object with a string ``prompt`` and an optional
    ``variables`` object mapping placeholder names to values.

    Returns:
        ``{"processed_text": ...}`` on success. ``{"error": ...}`` with HTTP
        status 400 when the body is not a JSON object, ``prompt`` is not a
        string, or ``variables`` is not an object. Previously these cases
        raised inside the handler and surfaced as a 500.
    """
    # silent=True returns None for a missing or malformed JSON body instead of
    # raising, so every bad-input case is reported the same way below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400

    prompt = data.get('prompt', '')
    if not isinstance(prompt, str):
        return jsonify({'error': "'prompt' must be a string."}), 400

    # Treat an explicit null the same as an omitted "variables" field.
    variables = data.get('variables') or {}
    if not isinstance(variables, dict):
        return jsonify({'error': "'variables' must be a JSON object."}), 400

    # Process the prompt with the variables
    processed_text = process_prompt(prompt, variables)

    # Return the processed text
    return jsonify({'processed_text': processed_text})

def process_prompt(prompt, variables):
    """Fill ``{name}`` placeholders in ``prompt`` and return the model's reply.

    Args:
        prompt: Prompt text; may contain ``{name}`` placeholders.
        variables: Mapping of placeholder name to value. Values are converted
            with ``str``. ``None`` or an empty mapping leaves the prompt
            unchanged.

    Returns:
        The text produced by ``generate`` for the filled-in prompt.
    """
    # Substitute BEFORE generating. Previously an early ``return`` made this
    # loop unreachable, so ``variables`` were silently ignored.
    # Plain ``replace`` (rather than ``str.format``) leaves any other braces in
    # the prompt untouched.
    for key, value in (variables or {}).items():
        prompt = prompt.replace(f"{{{key}}}", str(value))
    return generate(prompt)

# Start the Flask server only when this code runs as the main module.
if __name__ == '__main__':
    app.run()


# Новый раздел