In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules (e.g. the local `sdk` package) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import json
from typing import Any

import httpx
import requests
from loguru import logger

from sdk import sdk
from sdk.sdk import example_payload, example_completion_payload

async def get_real_response(
    payload: dict[str, Any] = example_payload,
    llm_url: str | None = None,
    chat: bool = True,
) -> list[dict[str, Any]] | None:
    """Post ``payload`` to an OpenAI-style LLM server and collect the chunks that carry logprobs.

    Args:
        payload: JSON body sent to the server. It is only read here, never
            mutated, so sharing the module-level default object is safe.
        llm_url: Base URL of the LLM server. When ``None`` a warning is logged
            and ``http://llm_server:6919`` is used.
        chat: ``True`` posts to ``/v1/chat/completions``; ``False`` posts to
            ``/v1/completions``.

    Returns:
        The decoded ``data: {...}`` JSON objects, in arrival order, whose first
        choice has a non-null ``logprobs`` value; or ``None`` when the server
        answers with a non-200 status (the error is logged).
    """
    if llm_url is None:
        logger.warning("No LLM URL provided, using default")
        llm_url = "http://llm_server:6919"

    endpoint = "/v1/chat/completions" if chat else "/v1/completions"
    async with httpx.AsyncClient() as client:
        response = await client.post(llm_url.rstrip("/") + endpoint, json=payload)
        if response.status_code != 200:
            logger.error(f"Error: {response.status_code} - {response.text}")
            return None

        messages: list[dict[str, Any]] = []
        async for chunk in response.aiter_lines():
            # Lines without a JSON object payload (blank separators, a
            # "data: [DONE]" sentinel, ...) carry nothing to collect.
            if "data: {" not in chunk:
                continue
            # Cut only at the FIRST "data: " marker: the generated text inside
            # the JSON may itself contain "data: ", and splitting on every
            # occurrence would hand json.loads a truncated document.
            _, _, raw_json = chunk.partition("data: ")
            data = json.loads(raw_json)
            # Some chunks have an empty "choices" list or no "logprobs" key;
            # treat them like chunks whose logprobs are null and skip them.
            choices = data.get("choices") or []
            if not choices or choices[0].get("logprobs") is None:
                continue
            messages.append(data)
        return messages

In [4]:
# Check for chat 
payload = example_payload.copy()
payload["model"] = "unsloth/Llama-3.2-3B-Instruct"
messages = await get_real_response(payload=payload, chat=True, llm_url="http://83.143.115.20:6919")
await sdk.check_result(task="chat-llama-3-2-3b", orchestrator_url="http://83.143.115.20:6920/", miner_response=messages[:])

ConnectError: All connection attempts failed

In [6]:
# Check for completions: submit a completions-style payload to the orchestrator.
from sdk.task_config import CHAT_ROGUE_ROSE_103B


# Copy before overriding "model" so the SDK's example payload object is not modified
# (assumes example_completion_payload is a dict; .copy() is shallow).
payload = example_completion_payload.copy()
payload["model"] = "sophosympatheia/Rogue-Rose-103b-v0.2/tree/exl2-3.2bpw"
# Disabled variant: fetch a real response from the LLM server and submit all but its last chunk.
# messages = await get_real_response(payload=payload, chat=False, llm_url="http://83.143.115.20:6919")
# await sdk.check_result(task=CHAT_ROGUE_ROSE_103B, payload=payload, orchestrator_url="http://83.143.115.20:6920/", miner_response=messages[:-1])

# Active variant: submits an EMPTY miner response.
# NOTE(review): presumably this probes how the orchestrator handles a missing response — confirm intent.
await sdk.check_result(task=CHAT_ROGUE_ROSE_103B, payload=payload, orchestrator_url="http://83.143.115.20:6920/", miner_response=[])

[32m2024-12-03 20:42:42.268[0m | [1mINFO    [0m | [36msdk.sdk[0m:[36mcheck_result[0m:[36m112[0m - [1mGot task ID: 1b92da11-8eaa-44d9-94e6-207b721b9d4e !!![0m
[32m2024-12-03 20:42:42.276[0m | [1mINFO    [0m | [36msdk.orchestrator_handling[0m:[36mhandle_task_id[0m:[36m21[0m - [1mWaiting for task 1b92da11-8eaa-44d9-94e6-207b721b9d4e to be done - check number: 1[0m
[32m2024-12-03 20:42:43.636[0m | [1mINFO    [0m | [36msdk.orchestrator_handling[0m:[36mhandle_task_id[0m:[36m21[0m - [1mWaiting for task 1b92da11-8eaa-44d9-94e6-207b721b9d4e to be done - check number: 2[0m
[32m2024-12-03 20:42:44.877[0m | [1mINFO    [0m | [36msdk.orchestrator_handling[0m:[36mhandle_task_id[0m:[36m21[0m - [1mWaiting for task 1b92da11-8eaa-44d9-94e6-207b721b9d4e to be done - check number: 3[0m
[32m2024-12-03 20:42:46.124[0m | [1mINFO    [0m | [36msdk.orchestrator_handling[0m:[36mhandle_task_id[0m:[36m21[0m - [1mWaiting for task 1b92da11-8eaa-44d9-94e6-207b

ReadTimeout: 

In [29]:
import requests
import json
from loguru import logger

def _post_json(url, payload, timeout=30.0):
    """POST ``payload`` as JSON to ``url``; return the decoded JSON body, or ``None`` on a non-200 status."""
    # Without a timeout `requests` waits indefinitely on an unresponsive server.
    response = requests.post(url, json=payload, timeout=timeout)
    if response.status_code != 200:
        logger.error(f"Error: {response.status_code} - {response.text}")
        return None
    return response.json()


def tokenize_message(url, payload, timeout=30.0):
    """Send a tokenize request.

    Args:
        url: Full URL of the tokenize endpoint.
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before `requests` raises a
            timeout error (new, optional; existing two-argument calls still work).

    Returns:
        The decoded JSON response, or ``None`` (after logging the status code
        and body) when the server does not answer with HTTP 200.
    """
    return _post_json(url, payload, timeout)


def generate_completions(url, payload, timeout=30.0):
    """Send a completions request.

    Args:
        url: Full URL of the completions endpoint.
        payload: JSON-serialisable request body.
        timeout: Seconds to wait for the server before `requests` raises a
            timeout error (new, optional; existing two-argument calls still work).

    Returns:
        The decoded JSON response, or ``None`` (after logging the status code
        and body) when the server does not answer with HTTP 200.
    """
    return _post_json(url, payload, timeout)

def main(
    base_url="http://83.143.115.20:6919",
    model="unsloth/Meta-Llama-3.1-8B-Instruct",
    max_tokens=10,
):
    """Smoke-test the server: tokenize a chat message, then complete from the returned token ids.

    Args:
        base_url: Root URL of the LLM server; ``/tokenize`` and
            ``/v1/completions`` are appended to it.
        model: Model name sent in both requests.
        max_tokens: ``max_tokens`` value of the completion request.

    Both responses are logged at INFO level. If the tokenize request fails
    (helper returns ``None``) the completion request is skipped. The defaults
    reproduce the previously hard-coded values, so ``main()`` behaves as before.
    """
    base_url = base_url.rstrip("/")
    tokenize_url = f"{base_url}/tokenize"
    completions_url = f"{base_url}/v1/completions"

    tokenize_payload = {
        "messages": [
            {"role": "user", "content": "Hello, how are you?"}
        ],
        "model": model,
    }
    tokenize_response = tokenize_message(tokenize_url, tokenize_payload)
    if tokenize_response is None:
        return
    logger.info(tokenize_response)

    # The token ids returned by /tokenize are passed as the completion prompt.
    completions_payload = {
        "prompt": tokenize_response["tokens"],
        "max_tokens": max_tokens,
        "model": model,
    }
    completions_response = generate_completions(completions_url, completions_payload)
    if completions_response is not None:
        logger.info(completions_response)

# Run the tokenize -> completions round trip; both responses are written to the log.
main()

[32m2024-11-30 20:07:47.511[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m34[0m - [1m{'count': 16, 'max_model_len': 20000, 'tokens': [128000, 128006, 882, 128007, 271, 9906, 11, 1268, 527, 499, 30, 128009, 128006, 78191, 128007, 271]}[0m
[32m2024-11-30 20:07:47.911[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m42[0m - [1m{'id': 'cmpl-6dca6c5b5044417b9db7a3d0f1f9375f', 'object': 'text_completion', 'created': 1732993667, 'model': 'unsloth/Meta-Llama-3.1-8B-Instruct', 'choices': [{'index': 0, 'text': "I'm just a computer program, so I don", 'logprobs': None, 'finish_reason': 'length', 'stop_reason': None, 'prompt_logprobs': None}], 'usage': {'prompt_tokens': 16, 'total_tokens': 26, 'completion_tokens': 10}}[0m


In [1]:
# Root URL of the LLM server; the helper functions and request cells in this
# notebook append /tokenize, /detokenize, /v1/completions and /v1/chat/completions to it.
BASE_URL = "http://83.143.115.20:6919"

In [2]:
import httpx

async def apply_chat_template(messages: list[dict], model: str = "unsloth/Meta-Llama-3.1-8B-Instruct", eot_id: int = 128009, add_generation_prompt: bool = True):
    """Render chat ``messages`` into a prompt string via the server's tokenizer endpoints.

    The messages are tokenized with ``POST {BASE_URL}/tokenize`` and the
    resulting ids are turned back into text with ``POST {BASE_URL}/detokenize``.

    Args:
        messages: Chat messages forwarded as the ``messages`` field.
        model: Model name sent with both requests.
        eot_id: Token id searched for when trimming (see ``add_generation_prompt``).
        add_generation_prompt: When ``False`` and the lower-cased model name
            contains ``"llama-3"``, the token list is cut just before the LAST
            occurrence of ``eot_id``, dropping that token and everything after
            it. If ``eot_id`` never occurs nothing is trimmed.

    Returns:
        ``(prompt, token_count)``: the detokenized text and the number of
        tokens it was built from (after any trimming).

    Raises:
        Whatever ``raise_for_status()`` raises for a 4xx/5xx answer.
    """
    async with httpx.AsyncClient() as http:
        tokenize_reply = await http.post(
            url=f"{BASE_URL}/tokenize",
            json={"model": model, "messages": messages},
        )
        tokenize_reply.raise_for_status()
        token_ids: list[int] = tokenize_reply.json()["tokens"]

        if "llama-3" in model.lower() and not add_generation_prompt:
            eot_positions = [pos for pos, tok in enumerate(token_ids) if tok == eot_id]
            if eot_positions:
                # Positions are collected in ascending order, so the final entry is the last eot.
                token_ids = token_ids[: eot_positions[-1]]

        detokenize_reply = await http.post(
            url=f"{BASE_URL}/detokenize",
            json={"tokens": token_ids, "model": model},
        )
        detokenize_reply.raise_for_status()

        rendered = detokenize_reply.json()["prompt"]
        return rendered, len(token_ids)

In [3]:
async def tokenize(prompt: str, model: str = "unsloth/Meta-Llama-3.1-8B-Instruct"):
    """Return the ``tokens`` field of ``POST {BASE_URL}/tokenize`` for a raw ``prompt`` string.

    Raises whatever ``raise_for_status()`` raises for a 4xx/5xx answer.
    """
    request_body = {"model": model, "prompt": prompt}
    async with httpx.AsyncClient() as http:
        reply = await http.post(url=f"{BASE_URL}/tokenize", json=request_body)
        reply.raise_for_status()
        parsed = reply.json()
    return parsed["tokens"]

In [4]:
async def detokenize(tokens: list[int], model: str = "unsloth/Meta-Llama-3.1-8B-Instruct"):
    """Return the ``prompt`` field of ``POST {BASE_URL}/detokenize`` for a list of token ids.

    Raises whatever ``raise_for_status()`` raises for a 4xx/5xx answer.
    """
    request_body = {"tokens": tokens, "model": model}
    async with httpx.AsyncClient() as http:
        reply = await http.post(url=f"{BASE_URL}/detokenize", json=request_body)
        reply.raise_for_status()
        parsed = reply.json()
    return parsed["prompt"]


In [5]:
async with httpx.AsyncClient() as client:
    r = await client.post(url=f"{BASE_URL}/detokenize", json={"tokens": [0, 45, 128009, 12, 12, 24], "model": "unsloth/Meta-Llama-3.1-8B-Instruct"})
    r.raise_for_status()  # raise an exception for 4xx or 5xx status codes
    r.json()
r.json()

{'prompt': '!N<|eot_id|>--9'}

In [6]:
def _fjson(r: list[dict]):
    for d in r:
        for p in d.values():
            p["logprob"] = round(float(p["logprob"]), 2)
    return r


In [8]:
# Get the input prompt & chat messages
# A single user turn is rendered through the server's tokenizer; with
# add_generation_prompt=True apply_chat_template performs no trimming.
input_messages = [    
    {"role": "user", "content": "Hello, how are you? respond in 3 words"},
]
# `num_input_tokens` is the token count of the rendered prompt; later cells use
# it as an offset to skip the prompt positions in prompt_logprobs.
prompt, num_input_tokens = await apply_chat_template(
    messages=input_messages,
    model="unsloth/Meta-Llama-3.1-8B-Instruct",
    eot_id=128009,
    add_generation_prompt=True,
)
# Last expression of the cell: displays the (prompt, token count) pair.
prompt, num_input_tokens

('<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello, how are you? respond in 3 words<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
 21)

In [10]:
# Get the response from completions: sample up to 30 tokens continuing the rendered prompt.
r = requests.post(f"{BASE_URL}/v1/completions", json={
    "prompt": prompt,
    "max_tokens": 30,
    "model": "unsloth/Meta-Llama-3.1-8B-Instruct",
    "temperature": 1.0,
    "include_stop_str_in_output": True,
})

print(r.status_code)
# Stop on a 4xx/5xx answer: an error body has no "choices" key and would
# otherwise surface as a misleading KeyError below.
r.raise_for_status()
# r.json() decodes the body directly, so this cell does not need `json` in scope.
content = r.json()["choices"][0]["text"]
content




NameError: name 'requests' is not defined

In [18]:
# Get the full response
# Concatenate the rendered prompt with the generated text and re-tokenize the whole string.
response = prompt + content
response_tokens = await tokenize(response, "unsloth/Meta-Llama-3.1-8B-Instruct")
# 128009 is the id that detokenizes to <|eot_id|> for this model; make sure the
# sequence ends with it before converting back to text.
if response_tokens[-1] != 128009:
    response_tokens.append(128009)
# `response` is overwritten with the text of the (possibly eot-extended) token sequence.
response = await detokenize(response_tokens, "unsloth/Meta-Llama-3.1-8B-Instruct")
# Same exchange expressed as chat messages: the input turn(s) plus the assistant reply.
chat_response = input_messages + [{"role": "assistant", "content": content}]

In [19]:
# Token ids of the generated text alone (without the prompt), for comparison with the logprob keys.
await tokenize(content, "unsloth/Meta-Llama-3.1-8B-Instruct")

[40, 2846, 3815, 7060, 13]

In [6]:
# Display the token ids of prompt + completion. This only works after the cell
# that assigns `response_tokens` has run in the current kernel.
response_tokens

NameError: name 'response_tokens' is not defined

In [21]:
# Get the prompt logprobs from completions: send the full text (prompt + reply)
# back as the prompt and ask for logprobs of the prompt tokens themselves.
r = requests.post(f"{BASE_URL}/v1/completions", json={
    "prompt": response,
    "model": "unsloth/Meta-Llama-3.1-8B-Instruct",
    "temperature": 1.0,
    "max_tokens": 1,
    "prompt_logprobs": 2
})

print(r.status_code)
# Stop on a 4xx/5xx answer: an error body has no "choices" key and would
# otherwise surface as a misleading KeyError below.
r.raise_for_status()
result = r.json()
# result["prompt_logprobs"]
# Skip the positions belonging to the input prompt; what remains covers the generated reply.
result["choices"][0]["prompt_logprobs"][num_input_tokens:]

200


[{'40': {'logprob': -0.0076646762900054455, 'rank': 1, 'decoded_token': 'I'},
  '84146': {'logprob': -5.445164680480957,
   'rank': 2,
   'decoded_token': 'Doing'}},
 {'2846': {'logprob': -0.10332441329956055, 'rank': 1, 'decoded_token': "'m"},
  '1097': {'logprob': -2.3220744132995605, 'rank': 2, 'decoded_token': ' am'}},
 {'3815': {'logprob': -0.61980140209198, 'rank': 1, 'decoded_token': ' doing'},
  '31301': {'logprob': -0.86980140209198,
   'rank': 2,
   'decoded_token': ' functioning'}},
 {'7060': {'logprob': -2.8158586025238037,
   'rank': 3,
   'decoded_token': ' fine'},
  '1664': {'logprob': -0.5502336621284485,
   'rank': 1,
   'decoded_token': ' well'},
  '2294': {'logprob': -1.0189836025238037,
   'rank': 2,
   'decoded_token': ' great'}},
 {'13': {'logprob': -0.2513454854488373, 'rank': 1, 'decoded_token': '.'},
  '128009': {'logprob': -1.9075955152511597, 'rank': 2, 'decoded_token': ''}},
 {'128009': {'logprob': -9.035655966727063e-05,
   'rank': 1,
   'decoded_token': ''

In [22]:
# Round the reply-position logprobs to two decimals for easier reading.
# The slice creates a new list, but _fjson rewrites the inner dicts in place,
# so the values stored inside `result` are rounded as well.
prompt_logprobs_to_check = _fjson(result["choices"][0]["prompt_logprobs"][num_input_tokens:])
prompt_logprobs_to_check


[{'40': {'logprob': -0.01, 'rank': 1, 'decoded_token': 'I'},
  '84146': {'logprob': -5.45, 'rank': 2, 'decoded_token': 'Doing'}},
 {'2846': {'logprob': -0.1, 'rank': 1, 'decoded_token': "'m"},
  '1097': {'logprob': -2.32, 'rank': 2, 'decoded_token': ' am'}},
 {'3815': {'logprob': -0.62, 'rank': 1, 'decoded_token': ' doing'},
  '31301': {'logprob': -0.87, 'rank': 2, 'decoded_token': ' functioning'}},
 {'7060': {'logprob': -2.82, 'rank': 3, 'decoded_token': ' fine'},
  '1664': {'logprob': -0.55, 'rank': 1, 'decoded_token': ' well'},
  '2294': {'logprob': -1.02, 'rank': 2, 'decoded_token': ' great'}},
 {'13': {'logprob': -0.25, 'rank': 1, 'decoded_token': '.'},
  '128009': {'logprob': -1.91, 'rank': 2, 'decoded_token': ''}},
 {'128009': {'logprob': -0.0, 'rank': 1, 'decoded_token': ''},
  '9930': {'logprob': -9.88, 'rank': 2, 'decoded_token': ' Thank'}}]

In [28]:
# Check random token

r = requests.post(f"{BASE_URL}/v1/completions", json={
    "prompt": await detokenize(response_tokens[:-2], "unsloth/Meta-Llama-3.1-8B-Instruct"),
    "model": "unsloth/Meta-Llama-3.1-8B-Instruct",
    "temperature": 0.0,
    "max_tokens": 1,
    # "prompt_logprobs": 1,
    "logprobs": 5,
})

print(r.status_code)
result = json.loads(r.text)
result
# result["choices"][0]["prompt_logprobs"][num_input_tokens:]
# result["logprobs"][-1]

200


{'id': 'cmpl-f2fb4d973c8f4128b1120ccb8c9e0538',
 'object': 'text_completion',
 'created': 1732993583,
 'model': 'unsloth/Meta-Llama-3.1-8B-Instruct',
 'choices': [{'index': 0,
   'text': '.',
   'logprobs': {'text_offset': [0],
    'token_logprobs': [-0.2513526976108551],
    'tokens': ['.'],
    'top_logprobs': [{'.': -0.2513526976108551,
      '': -1.9076026678085327,
      ' thanks': -3.5794777870178223,
      ' thank': -3.6576027870178223,
      ' today': -4.298227787017822}]},
   'finish_reason': 'length',
   'stop_reason': None,
   'prompt_logprobs': None}],
 'usage': {'prompt_tokens': 25, 'total_tokens': 26, 'completion_tokens': 1}}

In [148]:
# NOTE(review): this bare expression is not the cell's last statement, so nothing is displayed for it.
chat_response
# Hand-written conversation whose assistant turn stops after "I'm doing"; the
# chat-completions check sends it as the `messages` field.
fake_chat_response = [{"role": "user", "content": "Hello, how are you? respond in 3 words"}, {"role": "assistant", "content": "I'm doing"}]

In [152]:
# Check end of token
# Send the hand-written partial conversation to the chat endpoint and request
# two greedy tokens together with the top-5 logprobs at each position.
chat_request = {
    "messages": fake_chat_response,
    "model": "unsloth/Meta-Llama-3.1-8B-Instruct",
    "temperature": 0.0,
    "max_tokens": 2,
    "logprobs": True,
    "top_logprobs": 5,
    "add_generation_prompt": True,
    "add_special_tokens": False,
    "include_stop_str_in_output": True,
    "top_k": 5,
}
r = requests.post(f"{BASE_URL}/v1/chat/completions", json=chat_request)

print(r.status_code)
result = json.loads(r.text)
# Alternative view kept for reference:
# result["logprobs"][-1]
result

200


{'id': 'chat-e2bbd310ca9947cca336b0a4b4b06751',
 'object': 'chat.completion',
 'created': 1732993403,
 'model': 'unsloth/Meta-Llama-3.1-8B-Instruct',
 'choices': [{'index': 0,
   'message': {'role': 'assistant', 'content': 'well,', 'tool_calls': []},
   'logprobs': {'content': [{'token': 'well',
      'logprob': -0.958371102809906,
      'bytes': [119, 101, 108, 108],
      'top_logprobs': [{'token': 'well',
        'logprob': -0.958371102809906,
        'bytes': [119, 101, 108, 108]},
       {'token': 'great',
        'logprob': -1.4271211624145508,
        'bytes': [103, 114, 101, 97, 116]},
       {'token': 'fine',
        'logprob': -2.177121162414551,
        'bytes': [102, 105, 110, 101]},
       {'token': 'pretty',
        'logprob': -3.286496162414551,
        'bytes': [112, 114, 101, 116, 116, 121]},
       {'token': 'very',
        'logprob': -3.489621162414551,
        'bytes': [118, 101, 114, 121]}]},
     {'token': ',',
      'logprob': -0.41791412234306335,
      'bytes':

In [151]:
# Tokenize a one-turn chat directly against /tokenize and keep the ids in `tokens`.
tokenize_request = {
    "model": "unsloth/Meta-Llama-3.1-8B-Instruct",
    "messages": [
        {"role": "user", "content": "Hello, how are you?"},
        # {"role": "assistant", "content": "I am good"},
    ],
}
r = requests.post(url=f"{BASE_URL}/tokenize", json=tokenize_request)

# Show the raw body first, then the decoded JSON.
print(r.status_code, r.text)
tokenize_body = r.json()
print(tokenize_body)

tokens = tokenize_body["tokens"]

200 {"count":16,"max_model_len":20000,"tokens":[128000,128006,882,128007,271,9906,11,1268,527,499,30,128009,128006,78191,128007,271]}
{'count': 16, 'max_model_len': 20000, 'tokens': [128000, 128006, 882, 128007, 271, 9906, 11, 1268, 527, 499, 30, 128009, 128006, 78191, 128007, 271]}


In [25]:
# Turn the ids stored in `tokens` back into text via /detokenize.
detokenize_request = {
    "tokens": tokens,
    "model": "unsloth/Meta-Llama-3.1-8B-Instruct",
}
r = requests.post(url=f"{BASE_URL}/detokenize", json=detokenize_request)

# Show the raw body first, then the decoded JSON.
print(r.status_code, r.text)
detokenize_body = r.json()
print(detokenize_body)

200 {"prompt":"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello, how are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"}
{'prompt': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello, how are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'}


In [10]:
from sdk import sdk

[32m2024-11-23 21:54:28.895[0m | [1mINFO    [0m | [36msdk.sdk[0m:[36mcheck_result[0m:[36m83[0m - [1mGot task ID: dab8de1c-076c-491e-93e8-a4f96c9b61ee !!![0m
[32m2024-11-23 21:54:28.904[0m | [1mINFO    [0m | [36msdk.orchestrator_handling[0m:[36mhandle_task_id[0m:[36m21[0m - [1mWaiting for task dab8de1c-076c-491e-93e8-a4f96c9b61ee to be done - check number: 1[0m
[32m2024-11-23 21:54:30.249[0m | [1mINFO    [0m | [36msdk.orchestrator_handling[0m:[36mhandle_task_id[0m:[36m21[0m - [1mWaiting for task dab8de1c-076c-491e-93e8-a4f96c9b61ee to be done - check number: 2[0m
[32m2024-11-23 21:54:31.473[0m | [1mINFO    [0m | [36msdk.orchestrator_handling[0m:[36mhandle_task_id[0m:[36m21[0m - [1mWaiting for task dab8de1c-076c-491e-93e8-a4f96c9b61ee to be done - check number: 3[0m
[32m2024-11-23 21:54:32.700[0m | [1mINFO    [0m | [36msdk.orchestrator_handling[0m:[36mhandle_task_id[0m:[36m21[0m - [1mWaiting for task dab8de1c-076c-491e-93e8-a4f96

({'node_scores': {'0': 0.0},
  'timestamp': '2024-11-23T20:55:49.764517',
  'error_message': None,
  'traceback': None},
 0)