In [13]:
import os
import shutil
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# -----------------------------
# 1. Clear ALL HF caches in HOME
# -----------------------------
home_paths = [
    os.path.expanduser("~/.cache/huggingface"),
    os.path.expanduser("~/.cache/torch"),
    os.path.expanduser("~/.huggingface"),
    os.path.expanduser("~/.local/share/huggingface"),
]
for p in home_paths:
    if os.path.exists(p):
        shutil.rmtree(p, ignore_errors=True)

# -------------------------------------------
# 2. Create scratch directory for HF caching
# -------------------------------------------
SCRATCH_CACHE = f"/scratch/scholar/{os.environ['USER']}/hf_cache"
os.makedirs(SCRATCH_CACHE, exist_ok=True)

# -----------------------------------------
# 3. Set ALL important cache environment vars
# -----------------------------------------
os.environ["HF_HOME"] = SCRATCH_CACHE
os.environ["HF_DATASETS_CACHE"] = SCRATCH_CACHE
os.environ["HF_HUB_CACHE"] = SCRATCH_CACHE
os.environ["TRANSFORMERS_CACHE"] = SCRATCH_CACHE
os.environ["HUGGINGFACE_HUB_CACHE"] = SCRATCH_CACHE
os.environ["XDG_CACHE_HOME"] = SCRATCH_CACHE
os.environ["XDG_DATA_HOME"] = SCRATCH_CACHE

print("Using cache directory:", SCRATCH_CACHE)

# -----------------------------------------
# 4. TEST: load SMALL Qwen model (0.5B)
# -----------------------------------------
model_name = "Qwen/Qwen2.5-0.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=SCRATCH_CACHE)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir=SCRATCH_CACHE
)

print("SUCCESS: Model loaded without using HOME directory!")


Using cache directory: /scratch/scholar/pham156/hf_cache
SUCCESS: Model loaded without using HOME directory!
SUCCESS: Model loaded without using HOME directory!


In [52]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import re

model_name = "Qwen/Qwen2.5-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

EMBED_SYSTEM_PROMPT = """
You are a financial news embedding model. Your task is to convert financial text
into a JSON object with EXACTLY 8 numeric features. Each value must be a float
in the range [-1, 1].

The 8 features are:
1. polarity       (negative → positive sentiment)
2. intensity      (strength of sentiment)
3. relevance      (how directly it relates to TARGET_TICKER)
4. short_term     (expected next-day price impact)
5. long_term      (expected multi-week impact)
6. volatility     (uncertainty implied by the text)
7. novelty        (new vs repeated information)
8. credibility    (rumor → fact-based)

You MUST output only a JSON object with these 8 fields.

--------------------------------------------------------
FEW-SHOT EXAMPLES
--------------------------------------------------------

Example A:
TEXT: "Netflix subscriber growth slows as competition intensifies across streaming platforms."
TARGET_TICKER: NFLX

OUTPUT:
{
  "polarity": -0.50,
  "intensity": 0.70,
  "relevance": 0.95,
  "short_term": -0.40,
  "long_term": -0.20,
  "volatility": 0.60,
  "novelty": 0.55,
  "credibility": 0.90
}

Example B:
TEXT: "Coca-Cola reports strong international sales and raises full-year outlook."
TARGET_TICKER: KO

OUTPUT:
{
  "polarity": 0.60,
  "intensity": 0.55,
  "relevance": 0.90,
  "short_term": 0.35,
  "long_term": 0.50,
  "volatility": 0.20,
  "novelty": 0.30,
  "credibility": 0.95
}

Example C:
TEXT: "Boeing receives a major multi-billion dollar aircraft order from a Middle Eastern airline."
TARGET_TICKER: BA

OUTPUT:
{
  "polarity": 0.80,
  "intensity": 0.85,
  "relevance": 1.00,
  "short_term": 0.70,
  "long_term": 0.75,
  "volatility": 0.40,
  "novelty": 0.70,
  "credibility": 0.95
}

Example D:
TEXT: "Meta Faces a global outage across Instagram and Facebook services."
TARGET_TICKER: META

OUTPUT:
{
  "polarity": -0.75,
  "intensity": 0.80,
  "relevance": 1.00,
  "short_term": -0.60,
  "long_term": -0.35,
  "volatility": 0.85,
  "novelty": 0.65,
  "credibility": 0.90
}

Example E:
TEXT: "Oil prices rise as OPEC announces unexpected production cuts."
TARGET_TICKER: XLE

OUTPUT:
{
  "polarity": 0.30,
  "intensity": 0.65,
  "relevance": 0.85,
  "short_term": 0.40,
  "long_term": 0.25,
  "volatility": 0.70,
  "novelty": 0.50,
  "credibility": 0.95
}

--------------------------------------------------------

NOW PROCESS THE NEW INPUT.
Return ONLY the JSON object.
"""





def generate_embedding(text, ticker="AAPL"):
    user_prompt = f"""
TARGET_TICKER: {ticker}

TEXT:
{text}

Return ONLY the JSON object with the 8 fields.
"""

    prompt = f"<|system|>\n{EMBED_SYSTEM_PROMPT}\n<|user|>\n{user_prompt}\n<|assistant|>\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.0,
        do_sample=False
    )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    # extract ALL JSON objects
    matches = re.findall(r"\{.*?\}", decoded, flags=re.DOTALL)

    if not matches:
        print("No JSON found:\n", decoded)
        return None

    # use the LAST JSON (assistant output)
    json_str = matches[-1]

    try:
        obj = json.loads(json_str)

        vector = [
            obj["polarity"],
            obj["intensity"],
            obj["relevance"],
            obj["short_term"],
            obj["long_term"],
            obj["volatility"],
            obj["novelty"],
            obj["credibility"]
        ]

        return vector

    except Exception as e:
        print("JSON parse error:", e)
        print("Raw JSON:", json_str)
        print("Full Output:", decoded)
        return None


In [53]:
# ---------- Example 1 ----------
article1 = "Apple beats earnings expectations with strong iPhone demand."
article2 = "Apple posts record revenue driven by iPhone sales."

vector1 = generate_embedding(article1, ticker="AAPL")
vector2 = generate_embedding(article2, ticker="AAPL")

print("Example 1:")
print("Embedding 1:", vector1)
print("Embedding 2:", vector2)
print()


# ---------- Example 2 ----------
article1 = "Nvidia beats earnings by 30% and raises full-year guidance."
article2 = "Nvidia reports quarterly earnings inline with expectations."

vector1 = generate_embedding(article1, ticker="NVDA")
vector2 = generate_embedding(article2, ticker="NVDA")

print("Example 2:")
print("Embedding 1:", vector1)
print("Embedding 2:", vector2)
print()


# ---------- Example 3 ----------
article1 = "Tesla delays Cybertruck deliveries due to supply chain issues."
article2 = "Tesla announces record production numbers for Q4."

vector1 = generate_embedding(article1, ticker="TSLA")
vector2 = generate_embedding(article2, ticker="TSLA")

print("Example 3:")
print("Embedding 1:", vector1)
print("Embedding 2:", vector2)
print()


# ---------- Example 4 ----------
article1 = "Google faces a major antitrust lawsuit from the U.S. Department of Justice."
article2 = "EU regulators approve Google's new privacy compliance framework."

vector1 = generate_embedding(article1, ticker="GOOGL")
vector2 = generate_embedding(article2, ticker="GOOGL")

print("Example 4:")
print("Embedding 1:", vector1)
print("Embedding 2:", vector2)
print()


# ---------- Example 5 ----------
article1 = "Federal Reserve signals no interest rate cuts until late 2025."
article2 = "US inflation slows more than expected, boosting market optimism."

vector1 = generate_embedding(article1, ticker="SPY")   # macro news, use ETF
vector2 = generate_embedding(article2, ticker="SPY")

print("Example 5:")
print("Embedding 1:", vector1)
print("Embedding 2:", vector2)
print()


Example 1:
Embedding 1: [0.85, 0.7, 0.95, 0.4, 0.3, 0.5, 0.6, 0.95]
Embedding 2: [0.8, 0.75, 1.0, 0.45, 0.3, 0.55, 0.6, 0.95]

Example 2:
Embedding 1: [0.8, 0.85, 1.0, 0.7, 0.75, 0.4, 0.7, 0.95]
Embedding 2: [0.25, 0.6, 0.9, 0.15, 0.1, 0.5, 0.45, 0.95]

Example 2:
Embedding 1: [0.8, 0.85, 1.0, 0.7, 0.75, 0.4, 0.7, 0.95]
Embedding 2: [0.25, 0.6, 0.9, 0.15, 0.1, 0.5, 0.45, 0.95]

Example 3:
Embedding 1: [-0.6, 0.75, 0.95, -0.5, -0.3, 0.65, 0.55, 0.9]
Embedding 2: [0.8, 0.85, 1.0, 0.7, 0.75, 0.4, 0.7, 0.95]

Example 3:
Embedding 1: [-0.6, 0.75, 0.95, -0.5, -0.3, 0.65, 0.55, 0.9]
Embedding 2: [0.8, 0.85, 1.0, 0.7, 0.75, 0.4, 0.7, 0.95]

Example 4:
Embedding 1: [-0.6, 0.75, 0.9, -0.5, -0.3, 0.65, 0.55, 0.95]
Embedding 2: [0.5, 0.7, 0.95, 0.3, 0.2, 0.6, 0.55, 0.9]

Example 4:
Embedding 1: [-0.6, 0.75, 0.9, -0.5, -0.3, 0.65, 0.55, 0.95]
Embedding 2: [0.5, 0.7, 0.95, 0.3, 0.2, 0.6, 0.55, 0.9]

Example 5:
Embedding 1: [0.2, 0.65, 0.95, 0.3, 0.15, 0.5, 0.8, 0.95]
Embedding 2: [0.5, 0.7, 0.95, 0.

In [54]:

generate_embedding("Horrible disaster, stock will crash badly.")


[-1.0, 0.9, 0.95, -0.8, -0.6, 0.85, 0.55, 0.9]

In [None]:
generate_embedding("This company is doing extremely well and has no negative news.")

[0.9, 0.85, 1.0, 0.05, 0.05, 0.05, 0.05, 0.95]

: 