In [1]:
from flask import Flask, request, jsonify, stream_with_context, Response
from typing import Dict
import uuid
from core.conversation import Conversation
import threading
from infrastructure.auto_regressive_text_model import AutoRegressiveTextModel
import torch
from presentation.data_request import DataRequest
from application.apllication_batch_processor import ApplicationBatchProcessor
from core.supported_tasks import SupportedTasks
from core.message import Message


In [2]:
def get_gpu_usage():
    print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
    print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
    print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

In [3]:
model_path = "assets\\Qwen2_0.5B_Instruct"
max_batch_size = 2

In [4]:
get_gpu_usage()

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB


In [5]:
model = AutoRegressiveTextModel(model_path=model_path, device='cuda')

app_processor = ApplicationBatchProcessor(
    max_batch_size=max_batch_size, model=model, )

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [6]:
get_gpu_usage()

torch.cuda.memory_allocated: 1.840416GB
torch.cuda.memory_reserved: 2.371094GB
torch.cuda.max_memory_reserved: 2.371094GB


In [7]:
message_sample = [{"role": "user", "content": "what is AI?"}]

In [8]:
conversation = Conversation.from_json(message_sample)

In [9]:
get_gpu_usage()

torch.cuda.memory_allocated: 1.840416GB
torch.cuda.memory_reserved: 2.371094GB
torch.cuda.max_memory_reserved: 2.371094GB


In [10]:
request_id = str(uuid.uuid4())
data_request = DataRequest(
        id=request_id, conversation=conversation, task=SupportedTasks.completion)

In [11]:
app_processor.add_sample(data_request)

In [12]:
get_gpu_usage()

torch.cuda.memory_allocated: 1.840416GB
torch.cuda.memory_reserved: 2.371094GB
torch.cuda.max_memory_reserved: 2.371094GB


In [13]:
response_list = app_processor.generate()
print(response_list[0].token)

AI


In [14]:
get_gpu_usage()

torch.cuda.memory_allocated: 1.848351GB
torch.cuda.memory_reserved: 2.373047GB
torch.cuda.max_memory_reserved: 2.373047GB


In [15]:
for i in range(100):
    response_list = app_processor.generate()
    print(response_list[0].token)

 stands
 for
 artificial
 intelligence
,
 which
 refers
 to
 the
 simulation
 of
 human
 intelligence
 in
 machines
.
 It
 is
 a
 subset
 of
 artificial
 intelligence
 that
 involves
 the
 creation
 of
 intelligent
 machines
 that
 can
 perform
 tasks
 that
 typically
 require
 human
 intelligence
,
 such
 as
 learning
,
 reasoning
,
 and
 problem
-solving
.
 AI
 can
 be
 used
 in
 a
 variety
 of
 applications
,
 including
 robotics
,
 autonomous
 vehicles
,
 natural
 language
 processing
,
 and
 predictive
 analytics
.
<|im_end|>


<|endoftext|>
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Human
Human
Human
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant
Assistant


In [16]:
get_gpu_usage()

torch.cuda.memory_allocated: 1.848351GB
torch.cuda.memory_reserved: 2.386719GB
torch.cuda.max_memory_reserved: 2.386719GB
