In [1]:
import sys
from datasets import load_dataset
from transformers import AutoTokenizer, pipeline
from tqdm import tqdm
from model_loader import *
from types import SimpleNamespace
from fastapi import FastAPI, Request
from pydantic import BaseModel, Field

# Checkpoint location handed to every from_pretrained() call below and
# shown in the FastAPI title.
model_name = '/models/Yarn-Llama-2-7b-128k'


# model_max_length is set to sys.maxsize, i.e. no practical tokenizer-side
# length cap (presumably so long-context prompts are not flagged -- confirm).
tokenizer = AutoTokenizer.from_pretrained(
    model_name, model_max_length=sys.maxsize, trust_remote_code=True)
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

from scaled_rope.modeling_llama_together_yarn import LlamaForCausalLM
from scaled_rope.configuration_llama import LlamaConfig
model_cls = LlamaForCausalLM
config_cls = LlamaConfig


# `trust_remote_code` is only meaningful for the Auto* classes. On the
# concrete config/model classes used here it is ignored and only triggers a
# "has no effect here and is ignored" warning, so it is not passed.
config = config_cls.from_pretrained(model_name)

# NOTE(review): `torch` is not imported explicitly in this cell; it is
# presumably re-exported by `from model_loader import *` -- confirm.
model = model_cls.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    config=config,
    quantization_config=None
)

# ASGI application object that the route decorators below register on.
app = FastAPI(title=f"Serving {model_name}", version="0.1",)

@app.post("/get_n_token/")
def get_n_token(prompt:str = Field("You say you're Leo Tolstoy, but in reality", title='Model prompt')):
    # Split the prompt with the module-level tokenizer and report how many
    # pieces came back.
    pieces = tokenizer.tokenize(prompt)
    token_count = len(pieces)
    return {"n_token": token_count}

class Prompt(BaseModel):
    # Request schema consumed by gen_sample(); every field has a default.

    # Text to be continued.
    prompt: str = Field(
        "You say you're Leo Tolstoy, but in reality",
        title='Model prompt',
    )
    # Generation length, bounded to 1..128000.
    max_new_tokens: int = Field(
        256,
        ge=1,
        le=128000,
        title='Number of tokens generated in each sample',
    )
    # Temperature, bounded to 0.1..10.0.
    temperature: float = Field(
        1.0,
        ge=0.1,
        le=10.0,
        title='Temperature parameter for generation',
    )
    # top_k, bounded to 1..30000.
    top_k: int = Field(40, ge=1, le=30000)
    # Must be at least 1.0.
    repetition_penalty: float = Field(1.1, ge=1.0)
    # Must be non-negative.
    penalty_alpha: float = Field(0.0, ge=0.0)
    # Bounded to 1..5.
    num_return_sequences: int = Field(
        1,
        ge=1,
        le=5,
        title='Number of samples generated',
    )

@app.post("/generate/")
def gen_sample(prompt: Prompt):
    """Generate a continuation of ``prompt.prompt`` with the loaded model.

    Returns ``{"replies": <text>}``, where ``<text>`` is the generated text
    with the first ``len(prompt.prompt)`` characters sliced off, or
    ``{"error": <message>}`` when the tokenized prompt length plus
    ``prompt.max_new_tokens`` exceeds ``config.max_position_embeddings``.
    """
    # Check the length budget first so an oversized request is rejected
    # before a pipeline object is constructed.
    input_tokens = len(tokenizer.tokenize(prompt.prompt))
    max_positions = config.max_position_embeddings
    if input_tokens + prompt.max_new_tokens > max_positions:
        # The message must reference attributes of `prompt` / `config`; the
        # bare names used previously were undefined and raised NameError.
        return {"error": f'N of input tokens ({input_tokens}) + prompt.max_new_tokens ({prompt.max_new_tokens}) > config.max_position_embeddings ({max_positions})'}

    # NOTE(review): `temperature is not None` is True for any float value, so
    # sampling is effectively always requested -- confirm this is intended.
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, pad_token_id=tokenizer.eos_token_id,
                    temperature=prompt.temperature, repetition_penalty=prompt.repetition_penalty,
                    top_k=prompt.top_k, penalty_alpha=prompt.penalty_alpha, do_sample=prompt.temperature is not None)

    # NOTE(review): prompt.num_return_sequences is not forwarded; exactly one
    # sequence is requested and only the first result is returned.
    outputs = pipe(prompt.prompt, num_return_sequences=1, max_new_tokens=prompt.max_new_tokens)

    # The slice assumes generated_text begins with the prompt itself.
    return {"replies": outputs[0]["generated_text"][len(prompt.prompt):]}

@app.get("/health")
def healthcheck():
    # Health endpoint: unconditionally returns True; it does not touch the
    # model, tokenizer or config.
    return True


  from .autonotebook import tqdm as notebook_tqdm


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.83s/it]


In [6]:
from types import SimpleNamespace

# Attribute bag mirroring the fields of the Prompt schema, so gen_sample()
# can be called directly from the notebook instead of through HTTP.
prompt = SimpleNamespace(
    prompt="You say you're Leo Tolstoy, but in reality",
    max_new_tokens=256,
    temperature=1.0,
    top_k=40,
    repetition_penalty=1.1,
    penalty_alpha=0.0,
    num_return_sequences=1,
)

gen_sample(prompt)

 does not have profile information (Triggered internally at /opt/pytorch/pytorch/third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  kv = repeat_kv(kv, self.num_key_value_groups)


{'replies': '\nyou\'re an old gentleman of seventy-six. A very respectable old man--a\nrespectable author also...."[18]\n\nWith these words he turned to go. The "old man of seventy-six" saw him\noff with the remark:\n\n"There is something good about your work, my young friend! It will come to\nsomething."\n\nIt was said for no particular reason; still it may have been true.\n\nBut the best that could be said of all this story from the point of view\nof Tolstoy\'s literary reputation at the time was that the critics were\nmore indulgent than unfriendly. They made some allowance for his youth,\nand they forgave many of his faults because they attributed them to a\nnatural excess of zeal on behalf of the cause of enlightenment.\n\nThe general verdict is expressed by a critic who wrote thus about _Poor\nLiza_:[19]\n\n"We are inclined to excuse the novelist for not being able to do more\nthan he has done. He began too soon, and he did much in a year which\nwould not'}

In [1]:
# Probe for the flash-attn attention functions and padding helpers; record
# the outcome in flash_attn_v2_installed and abort with install instructions
# when the import fails.
try:
    from flash_attn.flash_attn_interface import (
        flash_attn_func,
        flash_attn_kvpacked_func,
        flash_attn_qkvpacked_func,
        flash_attn_varlen_kvpacked_func,
    )
    from flash_attn.bert_padding import unpad_input, pad_input
except ImportError:
    # The flag is set before re-raising, so it is already bound when the
    # error propagates.
    flash_attn_v2_installed = False
    raise ImportError('Please install Flash Attention: `pip install flash-attn --no-build-isolation`')
else:
    flash_attn_v2_installed = True
    print('>>>> Flash Attention installed')


>>>> Flash Attention installed


In [2]:

# Probe for the flash-attn rotary-embedding function; record the outcome in
# flash_rope_installed and abort with install instructions when it is missing.
try:
    from flash_attn.layers.rotary import apply_rotary_emb_func
except ImportError:
    # The flag is set before re-raising, so it is already bound when the
    # error propagates.
    flash_rope_installed = False
    raise ImportError('Please install RoPE kernels: `pip install git+https://github.com/HazyResearch/flash-attention.git#subdirectory=csrc/rotary`')
else:
    flash_rope_installed = True
    print('>>>> Flash RoPE installed')


>>>> Flash RoPE installed
