In [14]:
import os
import re
import logging
from pathlib import Path
import pickle
import json
import joblib
import shutil
import glob
from tqdm.auto import tqdm
import warnings

import numpy as np
import pandas as pd



# For Qwen
import torch
import vllm
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor


In [15]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("GITHUB_TOKEN")
secret_value_1 = user_secrets.get_secret("GROQ_API_KEY")
secret_value_2 = user_secrets.get_secret("HuggingFACEHUB_access_token")
secret_value_3 = user_secrets.get_secret("LANGCHAIN_API_KEY")

# ‚úÖ IMPORTANT: Set them in os.environ so other code can access them
os.environ["GITHUB_TOKEN"] = secret_value_0
os.environ["GROQ_API_KEY"] = secret_value_1
os.environ["HuggingFACEHUB_access_token"] = secret_value_2
os.environ["LANGCHAIN_API_KEY"] = secret_value_3
os.environ["LLM_BACKEND"] = "vllm"

# ‚úÖ FIXED: Use correct model path (must match hirachical-subchat.ipynb)
model_path = "/kaggle/input/qwen2.5/transformers/14b-instruct-awq/1"
os.environ["VLLM_MODEL_PATH"] = model_path

# Print the tokens (first 4 and last 4 characters for security)
print("="*60)
print("üîê SECRETS LOADED AND SET IN ENVIRONMENT")
print("="*60)
print(f"‚úÖ GITHUB_TOKEN: {secret_value_0[:4]}...{secret_value_0[-4:]}")
print(f"‚úÖ GROQ_API_KEY: {secret_value_1[:4]}...{secret_value_1[-4:]}")
print(f"‚úÖ HuggingFACEHUB_access_token: {secret_value_2[:4]}...{secret_value_2[-4:]}")
print(f"‚úÖ LANGCHAIN_API_KEY: {secret_value_3[:4]}...{secret_value_3[-4:]}")
print(f"‚úÖ LLM_BACKEND: vllm")
print(f"‚úÖ VLLM_MODEL_PATH: {model_path}")
print("="*60)

üîê SECRETS LOADED AND SET IN ENVIRONMENT
‚úÖ GITHUB_TOKEN: gith...tWfg
‚úÖ GROQ_API_KEY: gsk_...l6gr
‚úÖ HuggingFACEHUB_access_token: hf_E...GaQC
‚úÖ LANGCHAIN_API_KEY: lsv2...ea2f
‚úÖ LLM_BACKEND: vllm
‚úÖ VLLM_MODEL_PATH: /kaggle/input/qwen2.5/transformers/14b-instruct-awq/1


In [None]:
# ‚úÖ STEP 2: Load vLLM model (RUN THIS BEFORE STEP 3!)
# Disable vLLM V1 (doesn't support logits processors yet)
os.environ["VLLM_USE_V1"] = "0"

print("="*60)
print("üöÄ LOADING vLLM MODEL ON THIS KERNEL")
print("="*60)
print(f"üìÇ Model: {model_path}")
print(f"üéÆ GPUs: {torch.cuda.device_count()}")
print("‚è≥ This takes 2-3 minutes...")
print("="*60)

llm = vllm.LLM(
    model_path,
    quantization='awq',
    tensor_parallel_size=torch.cuda.device_count(),
    gpu_memory_utilization=0.91,
    trust_remote_code=True,
    dtype="half",
    enforce_eager=True,
    max_model_len=5120,
    disable_log_stats=True,
    enable_prefix_caching=True
)

print("\n‚úÖ vLLM model loaded successfully!")
print(f"   Memory per GPU: ~{torch.cuda.get_device_properties(0).total_memory / 1024**3 * 0.91:.1f}GB used")
print("="*60)

üöÄ LOADING vLLM MODEL ON THIS KERNEL
üìÇ Model: /kaggle/input/qwen2.5/transformers/14b-instruct-awq/1
üéÆ GPUs: 2
‚è≥ This takes 2-3 minutes...
INFO 12-19 12:57:42 [config.py:717] This model supports multiple tasks: {'reward', 'classify', 'score', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 12-19 12:57:43 [config.py:1770] Defaulting to use mp for distributed inference
INFO 12-19 12:57:44 [llm_engine.py:240] Initializing a V0 LLM engine (v0.8.5.post1) with config: model='/kaggle/input/qwen2.5/transformers/14b-instruct-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen2.5/transformers/14b-instruct-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=5120, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=

[W1219 12:58:02.514702118 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W1219 12:58:02.515458078 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 12-19 12:58:02 [utils.py:1055] Found nccl from library libnccl.so.2
[1;36m(VllmWorkerProcess pid=510)[0;0m INFO 12-19 12:58:02 [utils.py:1055] Found nccl from library libnccl.so.2
[1;36m(VllmWorkerProcess pid=510)[0;0m INFO 12-19 12:58:02 [pynccl.py:69] vLLM is using nccl==2.21.5
INFO 12-19 12:58:02 [pynccl.py:69] vLLM is using nccl==2.21.5


[W1219 12:58:02.786607639 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3
[W1219 12:58:02.787286852 socket.cpp:204] [c10d] The hostname of the client socket cannot be retrieved. err=-3


INFO 12-19 12:58:03 [custom_all_reduce_utils.py:244] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
[1;36m(VllmWorkerProcess pid=510)[0;0m INFO 12-19 12:58:03 [custom_all_reduce_utils.py:244] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
INFO 12-19 12:58:03 [shm_broadcast.py:266] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_cd016a94'), local_subscribe_addr='ipc:///tmp/cd659b8c-d934-445d-aed0-86810567181d', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 12-19 12:58:03 [parallel_state.py:1004] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, TP rank 0
[1;36m(VllmWorkerProcess pid=510)[0;0m INFO 12-19 12:58:03 [parallel_state.py:1004] rank 1 in world size 2 is assigned as DP rank 0, PP rank 0, TP rank 1
INFO 12-19 12:58:03 [model_runner.py:1108] Starting to load model /kaggle/input/qwen2.5/transformers/14b-instruct-awq/1...


OutOfMemoryError: CUDA out of memory. Tried to allocate 34.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 32.19 MiB is free. Process 4056 has 12.52 GiB memory in use. Process 7146 has 2.17 GiB memory in use. Of the allocated memory 1.90 GiB is allocated by PyTorch, and 93.44 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

[1;36m(VllmWorkerProcess pid=510)[0;0m INFO 12-19 12:59:21 [multiproc_worker_utils.py:259] Worker exiting
ERROR 12-19 12:59:21 [multiproc_worker_utils.py:120] Worker VllmWorkerProcess pid 510 died, exit code: -15
INFO 12-19 12:59:21 [multiproc_worker_utils.py:124] Killing local vLLM worker processes


In [None]:
# ‚úÖ STEP 3: Register vLLM with backend (RUN AFTER STEP 2!)
import sys
sys.path.insert(0, "/kaggle/working/Subchat-Trees/backend")

from src.services.vllm_client import VLLMClient

print("="*60)
print("üîó REGISTERING vLLM WITH BACKEND")
print("="*60)

VLLMClient.set_model(llm)

print(f"‚úÖ vLLM registered: {VLLMClient.is_available()}")
print("   ‚úÖ Response generation will use vLLM")
print("   ‚úÖ Summarization will use vLLM")
print("   ‚úÖ Judge/Classification will use vLLM")
print("="*60)

In [None]:
# Configure git identity for Kaggle git push
import subprocess

REPO_DIR = "/kaggle/working/Subchat-Trees"

print("="*60)
print("‚öôÔ∏è  CONFIGURING GIT FOR KAGGLE")
print("="*60)

try:
    import os
    os.chdir(REPO_DIR)
    
    # Set git identity
    subprocess.run(["git", "config", "user.name", "moonmehedi"], check=True)
    subprocess.run(["git", "config", "user.email", "the.mehedi.hasan.moon@gmail.com"], check=True)
    
    print("‚úÖ Git identity configured!")
    print(f"   User: moonmehedi")
    print(f"   Email: the.mehedi.hasan.moon@gmail.com")
    
    # Verify current branch
    branch_result = subprocess.run(["git", "branch", "--show-current"], capture_output=True, text=True)
    print(f"\n‚úÖ Current branch: {branch_result.stdout.strip()}")
    
    # Check if GITHUB_TOKEN is available
    if "GITHUB_TOKEN" in os.environ:
        token_preview = os.environ["GITHUB_TOKEN"]
        print(f"\n‚úÖ GITHUB_TOKEN available: {token_preview[:4]}...{token_preview[-4:]}")
    else:
        print("\n‚ö†Ô∏è  WARNING: GITHUB_TOKEN not set - git push will fail!")
    
    # ‚úÖ VERIFY vLLM is still registered after all imports
    sys.path.insert(0, os.path.join(REPO_DIR, "backend"))
    from src.services.vllm_client import VLLMClient
    print(f"\n‚úÖ vLLM still available: {VLLMClient.is_available()}")
    
    os.chdir("/kaggle/working")
    
except Exception as e:
    print(f"‚ùå Error: {e}")

print("="*60)

‚öôÔ∏è  CONFIGURING GIT FOR KAGGLE
‚úÖ Git identity configured!
   User: moonmehedi
   Email: the.mehedi.hasan.moon@gmail.com

‚úÖ Current branch: kaggle-run

‚úÖ GITHUB_TOKEN available: gith...tWfg


In [None]:
# ‚úÖ RUN KAGGLE BUFFER TEST RUNNER (with automatic git push after each buffer)
! python /kaggle/working/Subchat-Trees/backend/dataset/kaggle_buffer_test_runner.py

Traceback (most recent call last):
  File "/kaggle/working/Subchat-Trees/backend/dataset/kaggle_buffer_test_runner.py", line 1182, in <module>
    runner = KaggleMetricsTestRunner()
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/kaggle/working/Subchat-Trees/backend/dataset/kaggle_buffer_test_runner.py", line 39, in __init__
    self.classifier = ContextClassifier()
                      ^^^^^^^^^^^^^^^^^^^
  File "/kaggle/working/Subchat-Trees/backend/dataset/context_classifier.py", line 68, in __init__
    raise RuntimeError(
RuntimeError: ‚ùå vLLM not available for classification!
   Judge/classifier requires vLLM to avoid Groq API quota limits.
   Please ensure VLLMClient.set_model(llm) was called in your notebook.


: 

In [None]:
# import subprocess
# import os
# import time

# print("="*60)
# print("üõë SHUTTING DOWN SERVER AND KERNEL")
# print("="*60)

# # 1. Kill the backend server
# try:
#     print("\nüî¥ Stopping backend server...")
#     result = subprocess.run(
#         ["pkill", "-f", "uvicorn.*src.main:app"],
#         capture_output=True,
#         text=True
#     )
#     time.sleep(2)
#     print("‚úÖ Backend server stopped")
# except Exception as e:
#     print(f"‚ö†Ô∏è  Error stopping server: {e}")

# # 2. Kill the kernel
# print("\nüî¥ Terminating kernel...")
# print("‚úÖ Kernel will shut down now - this saves GPU quota!")
# print("="*60)

# # Force exit the kernel
# os._exit(0)

üõë SHUTTING DOWN SERVER AND KERNEL

üî¥ Stopping backend server...


: 

: 