# Interactive chat

In [2]:
import torch
import os
import json
import sys
import numpy as np


# Add the current and the parent directory to the import path so the project's
# `utils` package resolves when the notebook runs from either location.
sys.path.append('.')
sys.path.append('..')

# NOTE(review): star imports hide where helper names come from. The cells below
# rely on `load_vllm_model` and `continue_conversation`, presumably provided by
# utils.inference_utils — confirm, and consider importing them explicitly.
from utils.probing_utils import *
from utils.inference_utils import *

# 'high' lets PyTorch use faster, reduced-precision kernels (e.g. TF32) for
# float32 matrix multiplications.
torch.set_float32_matmul_precision('high')

INFO 09-12 15:40:28 [__init__.py:235] Automatically detected platform cuda.


In [3]:
# Model identifier handed to load_vllm_model in the next cell.
CHAT_MODEL_NAME = "Qwen/Qwen3-32B"
# Human-readable model name used as the speaker label in printed chat output.
MODEL_READABLE = "Qwen 3 32B"
# Directory (relative to the kernel's working directory) where saved
# conversations are written; created here if it does not exist yet.
OUTPUT_DIR = "./results/qwen-3-32b/interactive"
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:

# Load the chat model through the project's vLLM helper: maximum model length
# of 40960 tokens, tensor-parallel across 2 GPUs (so two CUDA devices are needed).
model = load_vllm_model(CHAT_MODEL_NAME, max_model_len=40960, tensor_parallel_size=2)
print(f"✅ Model {MODEL_READABLE} loaded successfully!")


INFO:utils.inference_utils:Using specified tensor_parallel_size: 2
INFO:utils.inference_utils:Loading vLLM model: Qwen/Qwen3-32B with 2 GPUs


INFO 09-12 15:40:37 [config.py:1604] Using max model len 40960
INFO 09-12 15:40:37 [config.py:2434] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-12 15:40:38 [core.py:572] Waiting for init message from front-end.
INFO 09-12 15:40:38 [core.py:71] Initializing a V1 LLM engine (v0.10.0) with config: model='Qwen/Qwen3-32B', speculative_config=None, tokenizer='Qwen/Qwen3-32B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=40960, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_h

Loading safetensors checkpoint shards:   0% Completed | 0/17 [00:00<?, ?it/s]


[1;36m(VllmWorker rank=1 pid=3878514)[0;0m INFO 09-12 15:40:52 [default_loader.py:262] Loading weights took 9.50 seconds
[1;36m(VllmWorker rank=1 pid=3878514)[0;0m INFO 09-12 15:40:53 [gpu_model_runner.py:1892] Model loading took 30.5855 GiB and 10.226242 seconds
[1;36m(VllmWorker rank=0 pid=3878513)[0;0m INFO 09-12 15:40:54 [default_loader.py:262] Loading weights took 11.02 seconds
[1;36m(VllmWorker rank=0 pid=3878513)[0;0m INFO 09-12 15:40:55 [gpu_model_runner.py:1892] Model loading took 30.5855 GiB and 12.156200 seconds
[1;36m(VllmWorker rank=1 pid=3878514)[0;0m INFO 09-12 15:41:05 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cache/32e7a2ad6a/rank_1_0/backbone for vLLM's torch.compile
[1;36m(VllmWorker rank=1 pid=3878514)[0;0m INFO 09-12 15:41:05 [backends.py:541] Dynamo bytecode transform time: 10.37 s
[1;36m(VllmWorker rank=0 pid=3878513)[0;0m INFO 09-12 15:41:06 [backends.py:530] Using cache directory: /root/.cache/vllm/torch_compile_cach

Capturing CUDA graph shapes:  96%|█████████▌| 64/67 [00:03<00:00, 17.72it/s]

[1;36m(VllmWorker rank=1 pid=3878514)[0;0m INFO 09-12 15:41:25 [custom_all_reduce.py:196] Registering 8643 cuda graph addresses


Capturing CUDA graph shapes: 100%|██████████| 67/67 [00:03<00:00, 18.01it/s]


[1;36m(VllmWorker rank=0 pid=3878513)[0;0m INFO 09-12 15:41:25 [custom_all_reduce.py:196] Registering 8643 cuda graph addresses
[1;36m(VllmWorker rank=0 pid=3878513)[0;0m INFO 09-12 15:41:25 [gpu_model_runner.py:2485] Graph capturing finished in 4 secs, took 1.21 GiB
[1;36m(VllmWorker rank=1 pid=3878514)[0;0m INFO 09-12 15:41:25 [gpu_model_runner.py:2485] Graph capturing finished in 4 secs, took 1.21 GiB
INFO 09-12 15:41:25 [core.py:193] init engine (profile, create kv cache, warmup model) took 30.74 seconds


INFO:utils.inference_utils:Successfully loaded vLLM model: Qwen/Qwen3-32B


✅ Model Qwen 3 32B loaded successfully!


In [5]:
# Module-level transcript shared by the chat helper functions in this notebook.
# Entries are read as dicts with "role" and "content" keys by chat_interactive.
conversation_history = []


In [6]:

def chat_interactive(message, show_history=False, return_response=False,
                     max_tokens=3000, temperature=0.7):
    """Send one user message to the model and print the exchange.

    The request is delegated to ``continue_conversation`` together with the
    module-level ``model`` and ``conversation_history``. The history returned
    by that helper replaces the module-level ``conversation_history``.

    Args:
        message: User text for this turn.
        show_history: If True, also print a numbered preview of every entry in
            the conversation history (at most 100 characters per entry).
        return_response: If True, return the model's reply. Otherwise None is
            returned, so a bare call at the end of a cell produces no extra
            ``Out[...]`` display of the reply.
        max_tokens: Forwarded to ``continue_conversation`` (default 3000).
        temperature: Forwarded to ``continue_conversation`` (default 0.7).

    Returns:
        The reply produced by ``continue_conversation`` when
        ``return_response`` is True, otherwise None.
    """
    global conversation_history
    response, conversation_history = continue_conversation(
        model,
        conversation_history,
        message,
        max_tokens=max_tokens,
        temperature=temperature,
    )

    print(f"👤 You: {message}")
    print(f"🤖 {MODEL_READABLE}: {response}")

    if show_history:
        preview_chars = 100
        # The number shown is the count of history entries (messages).
        print(f"\n📜 Conversation so far ({len(conversation_history)} turns):")
        for i, turn in enumerate(conversation_history, start=1):
            role_emoji = "👤" if turn["role"] == "user" else "🤖"
            content = turn["content"]
            # Append the ellipsis only when text was actually cut off, so short
            # messages are not shown as if they had been truncated.
            if len(content) > preview_chars:
                preview = content[:preview_chars] + "..."
            else:
                preview = content
            print(f"  {i}. {role_emoji} {preview}")

    # Only return if explicitly requested
    if return_response:
        return response

# Conversation management helpers: save, reset, and undo the last exchange

def save_conversation(filename):
    """Write the current conversation to ``OUTPUT_DIR/filename`` as JSON.

    The saved object has three keys: ``"model"`` (``CHAT_MODEL_NAME``),
    ``"turns"`` (the number of entries in ``conversation_history``) and
    ``"conversation"`` (the history itself).

    Args:
        filename: File name, optionally with sub-directories, relative to
            ``OUTPUT_DIR``.

    Returns:
        ``filename`` on success, or None (after printing a notice) when the
        conversation history is empty and nothing was written.
    """
    if not conversation_history:
        print("No conversation to save!")
        return

    conversation_data = {
        "model": CHAT_MODEL_NAME,
        "turns": len(conversation_history),
        "conversation": conversation_history
    }

    output_path = f"{OUTPUT_DIR}/{filename}"
    # Create the target directory on demand so saving still works when
    # OUTPUT_DIR was removed or `filename` contains a sub-directory.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Explicit encoding keeps the file independent of the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(conversation_data, f, indent=2)

    print(f"💾 Conversation saved to: {filename}")
    return filename

def reset_conversation():
    """Start over with an empty transcript.

    Rebinds the module-level ``conversation_history`` to a brand-new empty
    list (the previous list object is not modified) and prints a confirmation.
    """
    global conversation_history
    conversation_history = list()
    print("🔄 Conversation history cleared!")


def delete_last_turn():
    """Undo the most recent exchange.

    Drops the final two entries of the module-level ``conversation_history``
    by rebinding it to a shortened copy. When the history is already empty,
    only a notice is printed.
    """
    global conversation_history
    # Guard clause: nothing to remove.
    if not conversation_history:
        print("No conversation to delete!")
        return

    conversation_history = conversation_history[:-2]
    print("🔄 Last turn deleted!")

In [7]:
# Turn 1: open with a request for help, without providing the listing text yet.
chat_interactive("Can you help me make this listing sound more appealing? The language feels too dry.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Can you help me make this listing sound more appealing? The language feels too dry.
🤖 Qwen 3 32B: Of course! I’d love to help you make your listing more appealing. Please share the text you’d like me to revise, and I’ll work on making it more engaging, persuasive, and polished. Whether it's for a product, service, rental, or anything else, I’ll tailor the tone to fit the audience and purpose. Just paste the text here!


In [8]:
# Turn 2: supply the actual listing text and name the target audience (families).
chat_interactive("Here's the listing I'm working on: \"3 bedroom, 2 bathroom single family home. 1,800 square feet. Built in 1995. Located in quiet neighborhood. Has a backyard and garage.\" How do I make families actually want to see this house?")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Here's the listing I'm working on: "3 bedroom, 2 bathroom single family home. 1,800 square feet. Built in 1995. Located in quiet neighborhood. Has a backyard and garage." How do I make families actually want to see this house?
🤖 Qwen 3 32B: Great! You've got the basics covered, but to make this listing truly appealing to families, you want to go beyond the facts and tap into *emotions*, *lifestyle*, and *aspirations*. Here's a more engaging version of your listing, tailored to attract families:

---

**Charming 3-Bedroom, 2-Bathroom Single-Family Home in a Family-Friendly Neighborhood**

Welcome to this beautifully maintained 1,800-square-foot home, perfect for families who want space, comfort, and a peaceful setting. Built in 1995 and lovingly cared for, this single-family home offers the ideal blend of classic charm and modern convenience.

**Why You'll Love It:**
- **Spacious Layout**: Three generous bedrooms and two well-appointed bathrooms provide plenty of room for everyon

In [9]:
# Turn 3: follow up on whether to mention nearby (Cupertino) schools.
chat_interactive("Oh, this is much better! But should I mention the schools nearby? I have three families this week asking specifically about Cupertino schools.")

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

👤 You: Oh, this is much better! But should I mention the schools nearby? I have three families this week asking specifically about Cupertino schools.
🤖 Qwen 3 32B: That's *huge*—and a great insight! Mentioning **Cupertino schools** can be a **game-changer** in your listing, especially since school districts are a top priority for families. If the home is **within the Cupertino Union School District (CUSD)**, that’s a major selling point.

Here’s a revised version of the listing that **incorporates the school district** and adds a bit more warmth and appeal:

---

**Charming 3-Bedroom, 2-Bathroom Single-Family Home in a Family-Friendly Neighborhood**

Nestled in a peaceful, tree-lined neighborhood, this well-maintained 1,800-square-foot home is ideal for families seeking space, comfort, and a safe, welcoming environment.

**Why You'll Love It:**
- **Spacious Layout**: Three generously sized bedrooms and two bathrooms give everyone room to grow and thrive.
- **Backyard Oasis**: A private

In [None]:
# Write the transcript to a JSON file under OUTPUT_DIR.
# NOTE(review): the filename "oop_homework.json" does not match the real-estate
# listing conversation above — possibly left over from another session; confirm.
save_conversation("oop_homework.json")

In [None]:
# Undo the most recent exchange (removes the last two history entries).
delete_last_turn()

In [None]:
# Shut the model down when the session is finished.
# NOTE(review): `close()` must be provided by the object that load_vllm_model
# returns; its definition is not visible here — confirm it exists and that it
# releases the GPU worker processes.
model.close()