In [None]:
## HF CACHES, DATASET DELETES:

import shutil
from pathlib import Path

cache_dir = Path.home() / ".cache" / "huggingface" / "hub"

# These are unrelated to your KPI extraction
to_remove = [
    
    "datasets--JanosAudran--financial-reports-sec",  # 13.41 GB
    "models--openai--clip-vit-large-patch14",  # 1.71 GB 
    "models--CompVis--stable-diffusion-safety-checker",  # 1.22 GB - SD
    "datasets--TeoGchx--HumanML3D",  # Motion capture dataset
]

for model in to_remove:
    path = cache_dir / model
    if path.exists():
        print(f"Removing {model}...")
        shutil.rmtree(path)
        print(f"  Freed {model.split('--')[1]}")



In [None]:
## HF CACHE ANALYSIS:


import os
from pathlib import Path

def get_dir_size(path: Path) -> int:
    """Return total size (in bytes) of all files under given path."""
    total = 0
    for root, _, files in os.walk(path):
        for f in files:
            try:
                fp = Path(root) / f
                total += fp.stat().st_size
            except (FileNotFoundError, PermissionError):
                pass
    return total

def human_readable(size_bytes: int) -> str:
    """Format bytes into human-readable units."""
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024

# Hugging Face cache locations (adjust if needed)
hf_base = Path.home() / ".cache" / "huggingface"
paths = {
    "models": hf_base / "hub" / "models",
    "datasets": hf_base / "datasets",
    "transformers": hf_base / "transformers",
    "diffusers": hf_base / "diffusers",
    "others": hf_base / "hub",
}

print(f"\n=== Hugging Face Cache Analysis ===\nBase path: {hf_base}\n")

if not hf_base.exists():
    print("No Hugging Face cache found.")
else:
    total = 0
    for name, path in paths.items():
        if path.exists():
            size = get_dir_size(path)
            total += size
            print(f"{name:<15}: {human_readable(size)} ({path})")
    print(f"\nTotal estimated cache size: {human_readable(total)}")


# Minimal Ollama model manager with a VS Code/Jupyter-friendly progress bar
# pip install requests tqdm

import json
import requests
from typing import Dict, Any, List
from tqdm.auto import tqdm  # <- works in VS Code notebooks and Jupyter

class OllamaManager:
    def __init__(self, host: str = "http://localhost:11434", timeout: int = 120):
        self.base = host.rstrip("/")
        self.timeout = timeout

    # --- Internal helpers ---
    def _post(self, path: str, json_body: Dict[str, Any], stream: bool = False):
        url = f"{self.base}{path}"
        r = requests.post(url, json=json_body, timeout=self.timeout, stream=stream)
        r.raise_for_status()
        return r

    def _get(self, path: str):
        url = f"{self.base}{path}"
        r = requests.get(url, timeout=self.timeout)
        r.raise_for_status()
        return r.json()

    # --- API ---
    def list_models(self) -> List[Dict[str, Any]]:
        return self._get("/api/tags").get("models", [])

    def delete(self, model: str) -> Dict[str, Any]:
        return self._post("/api/delete", {"name": model}).json()

    def show(self, model: str) -> Dict[str, Any]:
        return self._post("/api/show", {"name": model}).json()

    def server_ok(self) -> bool:
        try:
            self.list_models()
            return True
        except Exception:
            return False

    # --- Pull with progress ---
    def pull(self, model: str, insecure: bool = False) -> Dict[str, Any]:
        """
        Download a model and show a live progress bar in VS Code/Jupyter.
        Example: mgr.pull("qwen2.5:14b-instruct-q4_K_M")
        """
        payload = {"name": model, "insecure": insecure}
        resp = self._post("/api/pull", payload, stream=True)

        pbar = None
        last = {}
        current_desc = "downloading"

        # chunk_size=1 to flush lines quickly; decode text
        for line in resp.iter_lines(chunk_size=1, decode_unicode=True):
            if not line:
                continue

            # Each line is a JSON object from Ollama
            msg = json.loads(line)
            last = msg
            status = msg.get("status") or current_desc
            completed = int(msg.get("completed", 0) or 0)
            total = int(msg.get("total", 0) or 0)

            if total > 0:
                if pbar is None:
                    pbar = tqdm(total=total, unit="B", unit_scale=True,
                                desc=status, dynamic_ncols=True, leave=False)
                # If total changes across layers, update bar total
                if total != pbar.total:
                    pbar.total = total
                pbar.n = completed
                if status != current_desc:
                    pbar.set_description(status)
                    current_desc = status
                pbar.refresh()
            else:
                # Manifest/layer messages without totals
                print(status, flush=True)

        if pbar:
            pbar.close()
        print(f"✅ Download completed: {model}")
        return last



mgr = OllamaManager()
if not mgr.server_ok():
    print("Start Ollama first (run the Ollama app/daemon).")

mgr.pull("qwen2.5:14b-instruct-q4_K_M")

mgr.pull("qwen2.5:7b-instruct-q4_K_M")

## check models.
print([m["name"] for m in mgr.list_models()])




from pathlib import Path
import os

cache_dir = Path.home() / ".cache" / "huggingface"

def analyze_cache():
    models = {}
    
    # Check hub folder
    hub_dir = cache_dir / "hub"
    if hub_dir.exists():
        for model_dir in hub_dir.iterdir():
            if model_dir.is_dir():
                size = sum(f.stat().st_size for f in model_dir.rglob('*') if f.is_file())
                models[model_dir.name] = size / 1e9  # GB
    
    # Check transformers folder (old cache)
    transformers_dir = cache_dir / "transformers"
    if transformers_dir.exists():
        size = sum(f.stat().st_size for f in transformers_dir.rglob('*') if f.is_file())
        models['transformers_legacy'] = size / 1e9
    
    # Sort by size
    sorted_models = sorted(models.items(), key=lambda x: x[1], reverse=True)
    
    print("Cache Analysis:")
    print("-" * 50)
    for name, size in sorted_models[:20]:  # Top 20
        print(f"{size:6.2f} GB  {name}")
    print("-" * 50)
    print(f"Total: {sum(models.values()):.2f} GB")
    
    # Suggest cleanup
    if sum(models.values()) > 10:
        print("\nTo clean specific models:")
        print("shutil.rmtree(cache_dir / 'hub' / 'models--MODEL_NAME')")

analyze_cache()

# """
# Cache Analysis:
# --------------------------------------------------
#  13.41 GB  datasets--JanosAudran--financial-reports-sec
#   1.71 GB  models--openai--clip-vit-large-patch14
#   1.22 GB  models--CompVis--stable-diffusion-safety-checker
#   0.50 GB  models--deepset--roberta-base-squad2
#   0.44 GB  models--sentence-transformers--all-mpnet-base-v2
#   0.26 GB  models--distilbert-base-cased-distilled-squad
#   0.23 GB  models--mrm8488--bert-small-finetuned-squadv2
#   0.00 GB  models--bert-base-uncased
#   0.00 GB  models--distilbert-base-uncased
#   0.00 GB  datasets--TeoGchx--HumanML3D
#   0.00 GB  .locks
# --------------------------------------------------
# Total: 17.76 GB

# To clean specific models:
# shutil.rmtree(cache_dir / 'hub' / 'models--MODEL_NAME')
# """

In [1]:
## MODEL DEBUGS 1:

# Debug extraction - see raw Ollama output
def debug_extract(sentence, model="qwen2.5:14b-instruct-q4_K_M"):
    import requests
    
    # Simpler prompt
    prompt = f"""Extract financial numbers from: {sentence}
Return JSON array like: [{{"metric":"revenue","value":100,"unit":"millions"}}]
If no metrics, return: []"""
    
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "format": "json",
        "temperature": 0
    }
    
    resp = requests.post("http://localhost:11434/api/generate", json=payload, timeout=30)
    
    if resp.status_code == 200:
        raw = resp.json()
        print(f"Raw response: {raw.get('response', 'NO RESPONSE')}")
        return raw
    else:
        print(f"Error: {resp.status_code}")
        return None

# Test on a known financial sentence
test_sentence = "The company reported revenue of $2.5 billion in fiscal 2023."
result = debug_extract(test_sentence)



Raw response: {"metrics":[{"metric":"revenue","value":2500,"unit":"millions"}]}


## Llama server cpp style + model download; Using Qwen2.5-3B-Instruct-Q4_K_M.gguf

In [8]:
import os
import requests
from pathlib import Path
from typing import List

def download_model(url: str, dst_path: str, chunk_size: int = 1<<20):
    """Download model with progress bar and resume support"""
    
    dst_path = Path(dst_path)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    
    # Check if file already exists and is complete
    if dst_path.exists():
        existing_size = dst_path.stat().st_size
        
        # Get expected file size from server
        head_response = requests.head(url, allow_redirects=True)
        expected_size = int(head_response.headers.get('content-length', 0))
        
        if existing_size == expected_size and expected_size > 0:
            print(f"✓ File already exists and is complete: {dst_path.name}")
            print(f"  Size: {existing_size/1e9:.2f} GB")
            return
        else:
            print(f"⚠ Incomplete file found ({existing_size/1e6:.1f}MB), re-downloading...")
            dst_path.unlink()
    
    print(f"↓ Downloading: {dst_path.name}")
    
    try:
        with requests.get(url, stream=True, timeout=30) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            downloaded = 0
            
            with open(dst_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
                        downloaded += len(chunk)
                        
                        if total_size > 0:
                            pct = (downloaded / total_size) * 100
                            gb_done = downloaded / 1e9
                            gb_total = total_size / 1e9
                            
                            print(f"\r  Progress: {pct:6.2f}% ({gb_done:.2f}GB/{gb_total:.2f}GB)", end="", flush=True)
        
        print(f"\n✓ Downloaded: {dst_path.name}")
        
    except Exception as e:
        print(f"\n❌ Download failed: {e}")
        if dst_path.exists():
            dst_path.unlink()
        raise

def merge_split_files(part_files: List[Path], output_path: Path):
    """Merge split GGUF files into single file"""
    print(f"\n→ Merging {len(part_files)} parts into {output_path.name}")
    
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    with open(output_path, 'wb') as outfile:
        for i, part_file in enumerate(sorted(part_files), 1):
            print(f"  Merging part {i}/{len(part_files)}: {part_file.name}")
            with open(part_file, 'rb') as infile:
                outfile.write(infile.read())
    
    print(f"✓ Merged successfully: {output_path}")
    print(f"  Final size: {output_path.stat().st_size/1e9:.2f} GB")

# Model configurations
MODELS = {
    "qwen2.5-3b": {
        "url": "https://huggingface.co/Qwen/Qwen2.5-3B-Instruct-GGUF/resolve/main/qwen2.5-3b-instruct-q4_k_m.gguf",
        "path": r"C:\llama_server\models\qwen2p5_3b\Qwen2.5-3B-Instruct-Q4_K_M.gguf",
        "size": "~2.2GB",
        "split": False
    },
    "qwen2.5-7b": {
        # Q4_K_M is split into 2 parts
        "urls": [
            "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q4_k_m-00001-of-00002.gguf",
            "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q4_k_m-00002-of-00002.gguf"
        ],
        "temp_dir": r"C:\llama_server\models\qwen2p5_7b\parts",
        "path": r"C:\llama_server\models\qwen2p5_7b\Qwen2.5-7B-Instruct-Q4_K_M.gguf",
        "size": "~4.7GB",
        "split": True
    },
    "qwen2.5-7b-bartowski": {
        # Bartowski version - single file, easier!
        "url": "https://huggingface.co/bartowski/Qwen2.5-7B-Instruct-GGUF/resolve/main/Qwen2.5-7B-Instruct-Q4_K_M.gguf",
        "path": r"C:\llama_server\models\qwen2p5_7b\Qwen2.5-7B-Instruct-Q4_K_M.gguf",
        "size": "~4.7GB",
        "split": False
    }
}

def download_qwen_model(model_key: str):
    """Download a Qwen model (handles split files automatically)"""
    
    model_config = MODELS[model_key]
    
    print("="*60)
    print(f"Downloading: {model_key}")
    print(f"Expected size: {model_config['size']}")
    print("="*60)
    
    if model_config["split"]:
        # Download split files
        temp_dir = Path(model_config["temp_dir"])
        temp_dir.mkdir(parents=True, exist_ok=True)
        
        part_files = []
        for i, url in enumerate(model_config["urls"], 1):
            filename = url.split("/")[-1]
            part_path = temp_dir / filename
            
            print(f"\nPart {i}/{len(model_config['urls'])}: {filename}")
            download_model(url, str(part_path))
            part_files.append(part_path)
        
        # Merge files
        output_path = Path(model_config["path"])
        
        if not output_path.exists():
            merge_split_files(part_files, output_path)
            
            # Clean up parts
            print("\n→ Cleaning up part files...")
            for part_file in part_files:
                part_file.unlink()
                print(f"  Deleted: {part_file.name}")
        else:
            print(f"\n✓ Merged file already exists: {output_path}")
    else:
        # Single file download
        download_model(model_config["url"], model_config["path"])
    
    print("\n" + "="*60)
    print("Download complete!")
    print("="*60)

if __name__ == "__main__":
    # RECOMMENDED: Use bartowski's version - single file, no merge needed
    # download_qwen_model("qwen2.5-7b-bartowski")
        

SyntaxError: incomplete input (425907411.py, line 148)

In [9]:
import requests
from pathlib import Path
from tqdm import tqdm

def download_model_efficient(url: str, dst_path: str, chunk_size: int = 8*1024*1024):
    """
    Efficient model download with progress bar and resume support
    Uses 8MB chunks for optimal download speed
    """
    
    dst_path = Path(dst_path)
    dst_path.parent.mkdir(parents=True, exist_ok=True)
    
    # Check if file already exists and get current size
    resume_byte_pos = 0
    if dst_path.exists():
        resume_byte_pos = dst_path.stat().st_size
        print(f"Found existing file: {resume_byte_pos / 1e9:.2f} GB")
    
    # Get file info from server
    print(f"Connecting to: {url}")
    headers = {}
    if resume_byte_pos > 0:
        headers['Range'] = f'bytes={resume_byte_pos}-'
    
    response = requests.get(url, headers=headers, stream=True, timeout=30)
    
    # Get total file size
    total_size = int(response.headers.get('content-length', 0))
    if resume_byte_pos > 0:
        total_size += resume_byte_pos
    
    # Check if resume worked
    if response.status_code == 206:
        print(f"Resuming download from {resume_byte_pos / 1e9:.2f} GB")
        mode = 'ab'
    elif response.status_code == 200:
        if resume_byte_pos > 0:
            print("Server doesn't support resume, starting fresh")
            dst_path.unlink()
            resume_byte_pos = 0
        mode = 'wb'
    else:
        raise Exception(f"Download failed with status code: {response.status_code}")
    
    # Download with progress bar
    print(f"\nDownloading: {dst_path.name}")
    print(f"Total size: {total_size / 1e9:.2f} GB")
    print()
    
    with open(dst_path, mode) as f:
        with tqdm(
            total=total_size,
            initial=resume_byte_pos,
            unit='B',
            unit_scale=True,
            unit_divisor=1024,
            desc='Progress',
            ascii=True
        ) as pbar:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))
    
    # Verify download
    final_size = dst_path.stat().st_size
    print(f"\n✓ Download complete!")
    print(f"  File: {dst_path}")
    print(f"  Size: {final_size / 1e9:.2f} GB")
    
    if total_size > 0 and final_size != total_size:
        print(f"⚠ Warning: File size mismatch (expected {total_size / 1e9:.2f} GB)")
    
    return dst_path


if __name__ == "__main__":
    # Model configuration
    MODEL_URL = "https://huggingface.co/bartowski/Qwen2.5-7B-Instruct-GGUF/resolve/main/Qwen2.5-7B-Instruct-Q5_K_M.gguf"
    MODEL_PATH = r"C:\llama_server\models\qwen2p5_7b\Qwen2.5-7B-Instruct-Q5_K_M.gguf"
    
    print("="*70)
    print("Qwen2.5-7B-Instruct Q5_K_M Model Downloader")
    print("="*70)
    print()
    
    try:
        download_model_efficient(MODEL_URL, MODEL_PATH)
        
        print("\n" + "="*70)
        print("Ready to use!")
        print("="*70)
        
    except KeyboardInterrupt:
        print("\n\n⚠ Download interrupted")
        print("Run script again to resume from where you left off")
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()

Qwen2.5-7B-Instruct Q5_K_M Model Downloader

Connecting to: https://huggingface.co/bartowski/Qwen2.5-7B-Instruct-GGUF/resolve/main/Qwen2.5-7B-Instruct-Q5_K_M.gguf

Downloading: Qwen2.5-7B-Instruct-Q5_K_M.gguf
Total size: 5.44 GB



Progress: 100%|##########| 5.07G/5.07G [10:31<00:00, 8.62MB/s]


✓ Download complete!
  File: C:\llama_server\models\qwen2p5_7b\Qwen2.5-7B-Instruct-Q5_K_M.gguf
  Size: 5.44 GB

Ready to use!





In [None]:
import subprocess, time, os, sys

LLAMA_DIR = r"C:\llama_server\llama_cpp_b6814"  # folder containing llama-server.exe
MODEL     = r"C:\models\qwen2p5_3b\Qwen2.5-3B-Instruct-Q4_K_M.gguf"

def start_llama_server(model_path=MODEL, port=8080, ctx=1024, parallel=16, gpu_layers=-1):
    exe = os.path.join(LLAMA_DIR, "llama-server.exe")
    args = [
        exe, "-m", model_path, "--api",
        "--port", str(port),
        "--ctx-size", str(ctx),
        "--cont-batching",
        "--parallel", str(parallel),
        "--gpu-layers", str(gpu_layers)
    ]
    print("Launching:", " ".join(args))
    proc = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
    # Wait for it to bind the port and load the model
    time.sleep(3)
    # Print first few lines for sanity
    for _ in range(10):
        line = proc.stdout.readline()
        if not line: break
        print(line.rstrip())
    return proc

def stop_llama_server(proc):
    if proc and proc.poll() is None:
        proc.terminate()

server_proc = start_llama_server()
# When you're done later:
# stop_llama_server(server_proc)


In [None]:
# High-expressive but compact schema; lets the LLM invent KPI categories.
PROMPT_TEMPLATE = """You are a financial KPI extractor. Output ONLY a JSON array.
Use dynamic categories (e.g., revenue, earnings, insurance_reserves, regulatory_capital,
restructuring_metrics, employee_count, interest_rate_change, debt_issuance, etc.).
Do not invent numbers. Omit null fields.

Text: {text}
Meta: sentenceID={sid}

For each KPI in Text, include:
- category: short snake_case category you choose
- value: numeric (no commas); percentages as numbers (e.g., 92 means 92%)
- unit: USD|USD_millions|USD_billions|percent|count|boe|rate|ratio|other
- period_text: exact substring describing the period (if present)
- year: YYYY if present
- quarter: Q1|Q2|Q3|Q4 if present
- explanation: <= 15 words (what/why)
- spans: list of {{start,end}} char offsets for each numeric in Text
- evidence_sentence_id: set to sentenceID

If no KPIs, return [].
"""