# 🚀 Run Streamlit App via ngrok on Notebook

# Warning: this notebook is work in progress, due to incompability of update of CUDA 12.5 in Google Colab env. The following llama-cpp build may or may not work 

This notebook will:
1. Clone your private GitHub repo (app.py, config.yaml, model.py, etc.)  
2. Install dependencies  
3. Expose port 8501 via ngrok  
4. Launch the Streamlit app  


In [None]:
# check cuda version
!nvcc --version

In [None]:
!set LLAMA_CUBLAS=1
!set FORCE_CMAKE=1
!set CMAKE_ARGS="-DGGML_CUDA=on" 
!pip wheel --no-deps --wheel-dir=llamacpp_wheel llama-cpp-python
!pip install https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.4-cu124/llama_cpp_python-0.3.4-cp311-cp311-linux_x86_64.whl
!pip install llama-cpp-python # --upgrade --force-reinstall --no-cache-dir

In [None]:
# downloading project files
import os
from google.colab import userdata
github_token = userdata.get('GITHUB_TOKEN')
if github_token:
    repo_url_with_token = f"https://{github_token}@github.com/phuocnguyen90/tabular_chat_local_llm.git"
    !git clone {repo_url_with_token} /content/app/
else:
    print("GitHub token not found.")
# moving back /app to parent folder for easy of deployment of streamlit app
!mv /content/app/* /content/

/bin/bash: line 1: llama_cpp: command not found


In [None]:
# Cell 1: Installs
!pip install streamlit pyngrok dotenv
!pip install huggingface_hub
!pip install pynvml
!pip install fastembed qdrant-client

In [None]:
# Write configuration into .env file
%%bash
cat <<EOF > .env

# --- Local Mode (Main LLM GGUF) ---
SQL_GGUF_MODEL_PATH=https://huggingface.co/afrideva/pip-sql-1.3b-GGUF/resolve/main/pip-sql-1.3b.Q5_K_M.gguf
SQL_GGUF_REPO_ID=afrideva/pip-sql-1.3b-GGUF
SQL_GGUF_FILENAME=pip-sql-1.3b.Q5_K_M.gguf
SQL_USE_CPU=false
SQL_GPU_LAYERS=-1
# change local llm file location
LOCAL_LLM_GGUF_MODEL_PATH=/root/.cache/huggingface/hub/models--google--gemma-3-4b-it/.no_exist/093f9f388b31de276ce2de164bdc2081324b9767/google_gemma-3-4b-it-Q5_K_M.gguf
LOCAL_LLM_GGUF_REPO_ID=google/gemma-3-4b-it
LOCAL_LLM_GGUF_FILENAME=google_gemma-3-4b-it-Q5_K_M.gguf
LLM_USE_CPU=false
LLM_GPU_LAYERS=-1
# --- Embedding model ---
EMBEDDING_MODEL_NAME=sentence-transformers/paraphrase-multilingual-mpnet-base-v2
EMBEDDING_VECTOR_SIZE=768
# --- Production/Local Mode (Main LLM GGUF) ---
GGUF_MODEL_PATH=./models/google_gemma-3-4b-it-Q5_K_M.gguf
GGUF_N_GPU_LAYERS=-1
GGUF_N_CTX=8192
EOF

In [None]:
# download the models
!python utils/download_models.py

In [None]:
# test llama cpp with LOCAL_MAIN_LLM_PATH
import llama_cpp
print(llama_cpp.__version__)
from llama_cpp import Llama
LOCAL_SQL_LLM_PATH = os.getenv("LOCAL_SQL_LLM_PATH")
print(f"LOCAL_SQL_LLM_PATH: {LOCAL_SQL_LLM_PATH}")
LOCAL_MAIN_LLM_PATH = os.getenv("LOCAL_MAIN_LLM_PATH")
print(f"LOCAL_MAIN_LLM_PATH: {LOCAL_MAIN_LLM_PATH}")
llm = Llama(model_path=LOCAL_MAIN_LLM_PATH , n_gpu_layers=-1, verbose=False)
response = llm("Who are you and what model are you based on?", max_tokens=300)
print(response)


In [12]:
# @title 4. Launch app via ngrok tunnel
from pyngrok import ngrok
import os
from dotenv import load_dotenv

load_dotenv("/content/config/.env")
NGROK_AUTH_TOKEN = os.getenv("NGROK_AUTH_TOKEN")

if NGROK_AUTH_TOKEN:
    ngrok.set_auth_token(NGROK_AUTH_TOKEN)
    print("Ngrok auth token set.")
else:
    print("Ngrok auth token not found in .env. Running without token.")

import subprocess
import threading

def run_streamlit():
    subprocess.run(['streamlit', 'run', '--server.port', '8501', '--server.headless=true', 'app.py'])

thread = threading.Thread(target=run_streamlit, daemon=True)
thread.start()

public_url = None
try:
    public_url = ngrok.connect(addr='8501', proto='http')
    print(f"🎉 Your Streamlit app should be available at: {public_url}")
except Exception as e:
    print(f"❌ Error starting ngrok: {e}")
    print("   Streamlit might be running locally but ngrok tunnel failed.")

import time
try:
    while thread.is_alive():
        time.sleep(60)
    print("Streamlit thread seems to have stopped.")
except KeyboardInterrupt:
    print("\nKeyboardInterrupt received. Shutting down...")
except Exception as e:
    print(f"Exception in keep-alive loop: {e}")
finally:
    print("Closing ngrok tunnel...")
    if public_url:
        try: ngrok.disconnect(public_url); print("Ngrok tunnel disconnected.")
        except Exception as ng_e: print(f"Error disconnecting ngrok: {ng_e}")
    ngrok.kill()
    print("Ngrok process killed.")
    print("Exiting keep-alive loop.")

Ngrok auth token set.
🎉 Your Streamlit app should be available at: NgrokTunnel: "https://7b3d-35-194-192-253.ngrok-free.app" -> "http://localhost:8501"

KeyboardInterrupt received. Shutting down...
Closing ngrok tunnel...
Ngrok tunnel disconnected.
Ngrok process killed.
Exiting keep-alive loop.
