In [None]:
# Run nvidia-smi to check which GPU (if any) is attached to this runtime.
!nvidia-smi

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the workspace used by later cells
# lives under this mount point.
drive.mount('/content/drive')

In [None]:
import os
# Project workspace on the mounted Drive. It is created if missing and made
# the current working directory, so relative paths used by later cells
# resolve against it.
workspace = '/content/drive/MyDrive/S-LoRAA'
os.makedirs(workspace, exist_ok=True)
os.chdir(workspace)

In [None]:
if not os.path.exists('S-LoRAA'):
    !git clone https://github.com/obedjunias19/S-LoRAA.git
    os.chdir('S-LoRAA')
else:
    os.chdir('LoRAA')
    !git pull origin main

In [None]:
!pip install -q vllm
!pip install -q transformers==4.53.0 peft
!pip install -q accelerate huggingface-hub
!pip install -q pandas matplotlib seaborn networkx

In [None]:
import torch

# Print a short summary of the CUDA setup seen by PyTorch.
# The per-device queries raise when no CUDA device is usable, so they are
# only issued after is_available() confirms one; on a CPU-only runtime the
# cell now reports that instead of crashing.
cuda_available = torch.cuda.is_available()

print(f"  CUDA Available: {cuda_available}")
if cuda_available:
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print(f"  GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("  GPU: none detected (select a GPU runtime before running the cells below)")
print(f"  CUDA Version: {torch.version.cuda}")

In [None]:
from huggingface_hub import snapshot_download

# The Hugging Face cache is kept under the Drive workspace so that downloads
# persist across sessions.
cache_dir = f"{workspace}/model_cache"
os.makedirs(cache_dir, exist_ok=True)


def _download_snapshot(repo_id, label):
    """Download `repo_id` into `cache_dir`, print its local path, and return it.

    Args:
        repo_id: Hub repository id passed to `snapshot_download`.
        label: Prefix used in the progress line printed for this download.

    Returns:
        The local snapshot path returned by `snapshot_download`.
    """
    local_path = snapshot_download(repo_id=repo_id, cache_dir=cache_dir)
    print(f"{label}: {local_path}")
    return local_path


print("Downloading models...")

# Base model first, then the two LoRA adapters.
base_model_path = _download_snapshot("meta-llama/Llama-2-7b-hf", "Base model")
sql_lora_path = _download_snapshot("yard1/llama-2-7b-sql-lora-test", "SQL LoRA")
code_lora_path = _download_snapshot("monsterapi/llama2-code-generation", "Code LoRA")

# Record the resolved paths as KEY=value lines in the current working
# directory for later use.
path_entries = {
    "BASE_MODEL": base_model_path,
    "SQL_LORA": sql_lora_path,
    "CODE_LORA": code_lora_path,
}
with open('model_paths.txt', 'w') as paths_file:
    paths_file.writelines(f"{key}={value}\n" for key, value in path_entries.items())



from vllm import LLM, SamplingParams

print("Testing vLLM...")

# Smoke test: load the base model in vLLM and generate a few tokens.
# The model is loaded from `base_model_path`, the snapshot already downloaded
# into the Drive cache, instead of the Hub repo id. Passing the repo id made
# vLLM fetch the weights a second time into the default Hugging Face cache,
# bypassing the persistent copy.
# NOTE(review): a 7B model in float16 needs a GPU with enough free memory at
# 0.8 utilization -- confirm against the runtime's GPU.
llm = LLM(
    model=base_model_path,
    dtype="float16",
    gpu_memory_utilization=0.8,  # reduced memory budget for Colab
    max_model_len=1024
)

# Two short prompts, sampled with a small token budget to keep the test quick.
prompts = ["The capital of France is", "Python is a programming"]
sampling_params = SamplingParams(temperature=0.7, max_tokens=20)

outputs = llm.generate(prompts, sampling_params)

# Show the first completion generated for each prompt.
print("\nTest Outputs:")
for i, output in enumerate(outputs):
    print(f"{i+1}. {output.outputs[0].text}")

print("\nvLLM working!")