In [1]:
# Install the uv package manager; the next cell invokes it as a CLI (`uv pip install`).
!pip install uv --quiet

In [None]:
# Install pinned versions of ngrok, fastapi, vllm and triton through uv.
# Flags: reinstall even if already present, target the system interpreter, suppress output.
!uv pip install ngrok==1.4.0 fastapi==0.112.2 vllm==0.6.0 triton==3.0.0 --force-reinstall --system --quiet

In [None]:
from kaggle_secrets import UserSecretsClient
import subprocess
import threading
import requests
import ngrok
import queue
import time
import os

In [None]:
# Read both tokens from Kaggle's user-secrets store.
user_secrets = UserSecretsClient()

# hf_token is exported to the vLLM subprocess as HF_TOKEN; ngrok_token authenticates the tunnel.
hf_token, ngrok_token = (
    user_secrets.get_secret(label) for label in ("hf_token", "ngrok_token")
)

In [None]:
# Register the ngrok auth token before any tunnel is opened.
ngrok.set_auth_token(ngrok_token)

In [None]:
def start_ngrok(q):
    """Open an ngrok tunnel to localhost:8000 and publish its listener on ``q``.

    Args:
        q: queue-like object; the listener returned by ``ngrok.forward`` is
            handed over with ``q.put``.

    Returns:
        None. Any exception raised while opening the tunnel or queueing the
        listener is printed, not re-raised, and in that case nothing is
        placed on ``q``.
    """
    try:
        q.put(ngrok.forward("localhost:8000"))
    except Exception as exc:
        print(f"Error in start_ngrok: {exc}")

In [None]:
# Hand-off channel: start_ngrok() puts the tunnel listener here once it exists.
url_queue = queue.Queue()

# Open the tunnel on a worker thread instead of the notebook's main thread.
ngrok_thread = threading.Thread(target=start_ngrok, args=[url_queue])
ngrok_thread.start()

In [None]:
# Upper bound (seconds) for each waiting phase below.
NGROK_WAIT_SECONDS = 60

# Wait for the worker thread to hand over the listener, but never forever:
# start_ngrok() only prints on failure and leaves the queue empty, so an
# unbounded get() would hang this cell.
try:
    ngrok_listener = url_queue.get(timeout=NGROK_WAIT_SECONDS)
except queue.Empty:
    raise RuntimeError(
        f"ngrok did not provide a listener within {NGROK_WAIT_SECONDS}s; "
        "check the output of the tunnel thread for errors"
    ) from None

# The listener is taken off the queue exactly once (it is only ever put once);
# only the URL lookup is retried.
url_deadline = time.monotonic() + NGROK_WAIT_SECONDS
while True:
    public_url = ngrok_listener.url()
    if public_url:
        break
    if time.monotonic() >= url_deadline:
        raise RuntimeError(
            f"ngrok listener reported no URL within {NGROK_WAIT_SECONDS}s"
        )
    print("Waiting for ngrok URL...")
    time.sleep(1)

print("Ngrok tunnel established at:", public_url)

In [None]:
# Chat template files to fetch; the model argument lists reference them by path.
chat_templates = ['phi-3.jinja', 'llama-3-instruct.jinja']

for model_ctemplate in chat_templates:
    # Bounded wait so an unresponsive host cannot stall the notebook indefinitely.
    response = requests.get(
        f'https://raw.githubusercontent.com/chujiezheng/chat_templates/main/chat_templates/{model_ctemplate}',
        timeout=30,
    )
    # Fail here on 4xx/5xx instead of saving an HTTP error body as the template.
    response.raise_for_status()

    with open(f'/kaggle/working/{model_ctemplate}', 'wb') as file:
        file.write(response.content)

In [None]:
# Flags shared by every model launch; vllm_serve() appends them after the model-specific ones.
# NOTE(review): the API key is a hard-coded literal, and this server is presumably what the
# ngrok tunnel (localhost:8000) exposes publicly — consider loading it from Kaggle secrets
# like the other tokens.
system_args = [
    "--download-dir", "/kaggle/working/models/",
    "--api-key", "open-source-model",
    # presumably one shard per GPU on a 2-GPU accelerator — TODO confirm
    "--tensor-parallel-size", "2"
]

In [None]:
# Model-specific arguments for Llama 3.1 8B Instruct. The model id comes first because
# vllm_serve() places this list directly after `vllm serve`.
llama_31_8b_args = [
    "meta-llama/Meta-Llama-3.1-8B-Instruct",
    # Template file written to /kaggle/working by the chat-template download cell.
    "--chat-template", "/kaggle/working/llama-3-instruct.jinja",
    "--max-model-len", "2048",
    "--dtype", "float16",
]

In [None]:
# Model-specific arguments for Phi-3.5 mini instruct; same layout as the Llama list
# (model id first, then flags). Not used by the launch cell shown in this notebook.
phi35_mini_args = [
    "microsoft/Phi-3.5-mini-instruct",
    # Template file written to /kaggle/working by the chat-template download cell.
    "--chat-template", "/kaggle/working/phi-3.jinja",
    "--max-model-len", "2048",
    "--dtype", "float16",
]

In [None]:
# Environment for the vLLM subprocess: a copy of the current one plus the Hugging Face token.
vllm_env = {**os.environ, 'HF_TOKEN': hf_token}

In [None]:
# Launch `vllm serve` as a background process
def vllm_serve(model_args):
    """Spawn `vllm serve` for one model and return the process handle.

    Args:
        model_args: list of CLI arguments (model id followed by its flags);
            the shared ``system_args`` list is appended after it.

    Returns:
        The ``subprocess.Popen`` object of the started process. Its stdout
        and stderr both go to /kaggle/working/vllm.log, which is truncated
        on every call. The process runs with ``vllm_env`` as its environment.
    """
    launch_cmd = ["vllm", "serve"] + model_args
    launch_cmd = launch_cmd + system_args

    # The log file object is only needed while the process is being spawned;
    # it is closed again when the `with` block exits on return.
    with open('/kaggle/working/vllm.log', 'w') as log_sink:
        return subprocess.Popen(
            launch_cmd,
            env=vllm_env,
            stdout=log_sink,
            stderr=subprocess.STDOUT,
            bufsize=1,
            text=True,
            universal_newlines=True,
        )

In [None]:
# Start the server with the Llama 3.1 8B argument list; keep the handle so it can be stopped later.
process = vllm_serve(llama_31_8b_args)

In [None]:
# Pause for two minutes so the freshly launched process can make progress
# before its log is inspected in the next cell.
time.sleep(120)

In [None]:
# Show the last 10 lines of the server log.
with open('/kaggle/working/vllm.log', 'r') as f:
    lines = f.readlines()

# Printed outside the `with` block, with a loop variable that no longer
# shadows the file handle `f`.
for line in lines[-10:]:
    print(line, end="")

In [None]:
# Uncomment to stop the vLLM server and wait for it to exit:
# process.terminate()
# process.wait()