In [None]:
import os

from openai import AuthenticationError, OpenAI, PermissionDeniedError, RateLimitError
from requests import delete, get, post

In [None]:
# Connection settings. The token and cluster domain are read from environment
# variables of the same name when those are set, so real credentials do not have
# to be written into the notebook; otherwise the placeholder strings are used
# and must be replaced by hand before running the cells below.
openshift_token = os.environ.get('OPENSHIFT_AUTH_TOKEN', 'OPENSHIFT_AUTH_TOKEN')
cluster_domain_url = os.environ.get('CLUSTER_DOMAIN_URL', 'CLUSTER_DOMAIN_URL')
# Id of the model to look up and query; 'facebook/opt-125m' is an alternative id.
model_id = 'llama-31-8b'

# NOTE(review): plain http means the bearer tokens sent below travel
# unencrypted - confirm whether the route is also reachable over https.
maas_host = f'http://maas.apps.{cluster_domain_url}'

## Request MaaS token

In [None]:
# Request a MaaS token from the tokens endpoint, authenticating with the
# OpenShift bearer token.
tokens_url = f'{maas_host}/maas-api/v1/tokens'
expiration = '1h'  # requested lifetime, sent as-is in the request body

payload = {'expiration': expiration}
openshift_auth_header = {
    'Authorization': f'Bearer {openshift_token}',
    'Content-Type': 'application/json',
}
# The timeout keeps the cell from hanging indefinitely if the host is unreachable.
token_response = post(
    tokens_url,
    json=payload,
    headers=openshift_auth_header,
    timeout=30,
)
# Fail here with the HTTP status rather than with a less helpful error while
# parsing the body of an unsuccessful response.
token_response.raise_for_status()
maas_token = token_response.json()['token']
# NOTE(review): this writes a live credential into the notebook output; clear
# the outputs before sharing or committing the notebook.
print(maas_token)

## List available models

In [None]:
# Fetch the list of models from the MaaS API and select the entry whose id
# equals `model_id`.
models_url = f'{maas_host}/maas-api/v1/models'

maas_auth_header = {
    'Authorization': f'Bearer {maas_token}',
    'Content-Type': 'application/json',
}
# The timeout keeps the cell from hanging indefinitely if the host is unreachable.
response = get(models_url, headers=maas_auth_header, timeout=30)
# Surface HTTP errors (e.g. a rejected token) before trying to read the body.
response.raise_for_status()
models = response.json()['data']
print(models)

# Always (re)bind `model`: without a match the name would otherwise stay
# undefined, or keep a stale value from an earlier run of this cell.
model = next((item for item in models if item['id'] == model_id), None)
if model is None:
    available_ids = [item['id'] for item in models]
    raise LookupError(
        f'model {model_id!r} not found; available models: {available_ids}'
    )

print(model)

### Using OpenAI client

In [None]:
# Build an OpenAI SDK client that talks to `{maas_host}/v1` and authenticates
# with the MaaS token instead of an OpenAI API key.
client = OpenAI(
    api_key=maas_token,
    base_url=f'{maas_host}/v1',
)

# Bare last expression so the notebook displays the listing returned by the SDK.
client.models.list()

## Inference via OpenAI client

In [None]:
# Re-create the client against the URL advertised by the selected model entry.
# The f-string is delimited with double quotes because reusing the enclosing
# quote character inside the replacement field is only accepted from
# Python 3.12 onwards and is a SyntaxError on earlier versions.
client = OpenAI(base_url=f"{model['url']}/v1", api_key=maas_token)

# Send one chat completion; a PermissionDeniedError from the SDK is reported
# as a message instead of a traceback. Any other error propagates.
try:
    completion = client.chat.completions.create(
        model=model['id'],
        messages=[
            {'role': 'developer', 'content': 'You are a helpful assistant'},
            {'role': 'user', 'content': "What's your training data cutoff date?"},
        ]
    )
    print(completion.choices[0].message.content)
    print(completion.usage)
except PermissionDeniedError:
    print('not allowed to access model')

## Hit rate limit

In [None]:
# Send the same chat request up to 100 times in a row and stop at the first
# RateLimitError raised by the SDK.
for request_number in range(100):
    print(f'sending request {request_number}')
    try:
        completion = client.chat.completions.create(
            model=model['id'],
            messages=[
                {'role': 'developer', 'content': 'You are a helpful assistant'},
                {'role': 'user', 'content': "What's your training data cutoff date?"},
            ],
        )
    except RateLimitError:
        print('reached rate limit. aborting')
        break
    else:
        # Only reached when the request above completed without raising.
        print(f'received response for request {request_number}')

## Revoke MaaS token

In [None]:
# Call DELETE on the tokens endpoint, authenticating with the OpenShift token.
# The timeout keeps the cell from hanging indefinitely if the host is unreachable.
revoke_response = delete(tokens_url, headers=openshift_auth_header, timeout=30)
# Surface a failed revocation here; otherwise a later check that the token is
# rejected would be misleading.
revoke_response.raise_for_status()
# Bare last expression so the notebook still displays the response object.
revoke_response

In [None]:
# Repeat the chat request with the existing client. If the SDK raises
# AuthenticationError, report it as a message; otherwise print the reply.
try:
    completion = client.chat.completions.create(
        model=model['id'],
        messages=[
            {'role': 'developer', 'content': 'You are a helpful assistant'},
            {'role': 'user', 'content': "What's your training data cutoff date?"},
        ],
    )
except AuthenticationError:
    print('authentication failed')
else:
    print(completion.choices[0].message.content)