In [1]:
pip install --quiet --upgrade torch==2.1.0+cu118 torchvision==0.16.0+cu118 torchaudio==2.1.0+cu118 --extra-index-url https://download.pytorch.org/whl/cu118 typing_extensions bitsandbytes transformers peft accelerate psycopg2-binary pandas tqdm


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import torch
import re

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Chat-format tokens
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS,  E_SYS  = "<<SYS>>\n", "\n<</SYS>>\n\n"

SYSTEM_PROMPT = (
    "You are a portfolio optimization assistant.\n\n"
    "Given a stock's profile, fundamen  tals, sentiment, and its current allocation weight in a portfolio, "
    "recommend how the allocation should be adjusted.\n\n"
    "Output format:\n"
    "- Verdict: <Increase|Decrease|Hold|Add|Remove>\n"
    "- New Allocation: <X.XX>%\n"
    "- Reasoning: <short explanation>\n"
)




In [None]:
# Cell 3 — Load CSVs and pick one ticker
import json
import pandas as pd

tickers_df = pd.read_csv(
    "tickers.csv",
    parse_dates=["date_added"]
)

# 2) Prices
prices_df = pd.read_csv(
    "prices.csv",
    parse_dates=["price_date"]
)

# 3) Analyst labels
labels_df = pd.read_csv(
    "analyst_labels.csv",
    parse_dates=["label_date"]
)

# 4) Analyst estimates
estimates_df = pd.read_csv(
    "analyst_estimates.csv",
    parse_dates=["report_date"]
)

# 5) Historical grades
grades_df = pd.read_csv(
    "grades_historical.csv",
    parse_dates=["rating_date"]
)

# 6) Key metrics (JSON-encoded column)
metrics_df = pd.read_csv(
    "key_metrics.csv",
    parse_dates=["date"]
)
# use json.loads for the JSON column
profiles_df    = pd.read_csv(
    "profiles.csv",
    converters={"profile_data": lambda x: json.loads(x)}
)
news_df = pd.read_csv(
    "stock_news.csv",
    parse_dates=["published_date"]
)
allocations_df = pd.read_csv("allocations.csv", parse_dates=["allocation_date"])
# Quick sanity-check
print("Tickers:",      tickers_df.shape)
print("Prices:",       prices_df.shape)
print("Labels:",       labels_df.shape)
print("Estimates:",    estimates_df.shape)
print("Grades:",       grades_df.shape)
print("Metrics:",      metrics_df.shape)
print("Profiles:",     profiles_df.shape)
print("News:",         news_df.shape)


Tickers: (503, 4)
Prices: (376682, 7)
Labels: (503, 11)
Estimates: (3704, 23)
Grades: (17815, 7)
Metrics: (1509, 3)
Profiles: (503, 3)
News: (21350, 9)


In [None]:
def _compute_price_stats(df, since):
    sub = df[df.price_date >= since]
    if len(sub) < 2:
        return "  Insufficient data"
    change = (sub.close_price.iloc[-1] / sub.close_price.iloc[0] - 1) * 100
    hi, lo = sub.high_price.max(), sub.low_price.min()
    avg_vol = int(sub.volume.mean())
    return (f"  Change: {change:.2f}% | High: {hi:.2f} | Low: {lo:.2f} | "
            f"Avg Vol: {avg_vol}")

def _top_news(df, since, n=2):
    sub = df[df.published_date >= since].sort_values("published_date", ascending=False).head(n)
    if sub.empty:
        return ["  No news"]
    return [f"  [Headline]: {r.title}\n  [Summary]: {r.text}"
            for _, r in sub.iterrows()]

def _latest_label(df, since):
    sub = df[df.label_date >= since].sort_values("label_date", ascending=False)
    if sub.empty:
        return "  No recent analyst rating"
    r = sub.iloc[0]
    return (f"  Rating: {r.rating} (Score {r.overall_score}/5) on {r.label_date.date()}")

def _latest_estimate(df, since):
    sub = df[df.report_date >= since].sort_values("report_date", ascending=False)
    if sub.empty:
        return "  No recent estimates"
    e = sub.iloc[0]
    return (f"  EPS Avg: {e.eps_avg:.2f} | Revenue Avg: {e.revenue_avg:,} "
            f"on {e.report_date.date()}")

def get_previous_allocation(ticker, alloc_df):
    """Returns most recent allocation percent for ticker, or 0.00 if not found."""
    try:
        latest = (
            alloc_df[alloc_df["ticker"] == ticker]
            .sort_values("allocation_date", ascending=False)
            .iloc[0]
        )
        return round(float(latest["allocation_pct"]) * 100, 2)
    except:
        return 0.00


In [12]:
from datetime import datetime, timedelta
def build_weekly_block(ticker):
    today = pd.to_datetime("today").normalize()
    since = today - timedelta(days=7)

    stats = _compute_price_stats(prices_df[prices_df.ticker==ticker], since)
    news  = _top_news   (news_df[news_df.symbol==ticker], since)
    label = _latest_label(labels_df[labels_df.ticker==ticker], since)
    est   = _latest_estimate(estimates_df[estimates_df.symbol==ticker], since)

    lines = [
        "### Weekly Outlook",
        stats,
        *news,
        label,
        est,
    ]
    return "\n".join(lines)

def build_quarterly_block(ticker):
    today = pd.to_datetime("today").normalize()
    since = today - timedelta(days=90)

    stats = _compute_price_stats(prices_df[prices_df.ticker==ticker], since)
    news  = _top_news   (news_df[news_df.symbol==ticker], since)
    label = _latest_label(labels_df[labels_df.ticker==ticker], since)
    est   = _latest_estimate(estimates_df[estimates_df.symbol==ticker], since)

    lines = [
        "### Quarterly Outlook",
        stats,
        *news,
        label,
        est,
    ]
    return "\n".join(lines)

def build_yearly_block(ticker):
    today = pd.to_datetime("today").normalize()
    since = today - timedelta(days=365)

    stats = _compute_price_stats(prices_df[prices_df.ticker==ticker], since)
    news  = _top_news   (news_df[news_df.symbol==ticker], since)
    label = _latest_label(labels_df[labels_df.ticker==ticker], since)
    est   = _latest_estimate(estimates_df[estimates_df.symbol==ticker], since)

    lines = [
        "### Yearly Outlook",
        stats,
        *news,
        label,
        est,
    ]
    return "\n".join(lines)



In [None]:
def build_full_prompt(ticker):
    # optional: include company profile
    prof = profiles_df[profiles_df.ticker==ticker].profile_data.iloc[0]
    prev_alloc = get_previous_allocation(ticker, allocations_df)
    intro = (
        f"[Company]: {ticker} — {prof.get('companyName', '')}, "
        f"Sector: {prof.get('sector', '')}\n"
        f"Previous Allocation: {prev_alloc:.2f}%\n\n"
    )
    blocks = [
        intro,
        build_weekly_block(ticker),
        "",
        build_quarterly_block(ticker),
        "",
        build_yearly_block(ticker),
    ]
    body = "\n\n".join(blocks)
    return f"{B_INST} {B_SYS}{SYSTEM_PROMPT}{E_SYS}{body}{E_INST}"

# Test
print(build_full_prompt("AAPL")[:600], "…")


[INST] <<SYS>>
You are a seasoned stock market analyst.

First, issue a single “Verdict:” line, choosing one of “Buy”, “Hold”, or “Sell” for the stock based on the aggregated data below.

Then, under separate headings, provide your rationale for that verdict over three timeframes:
  • Weekly Rationale
  • Quarterly Rationale
  • Yearly Rationale

For each timeframe, use:
  1. 3 bullet points of key price‐movement statistics (change %, high/low, avg volume)
  2. 2 news headlines with summaries
  3. 1–2 sentences tying those points back to your overall Verdict

Label your response exactly as:
Ve …


In [None]:
import os; os.environ["HF_TOKEN"] = 
!pip install --quiet huggingface_hub && huggingface-cli login --token "$HF_TOKEN"

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
The token `sp500` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [7]:
# Cell X — Load model in FP16 (requires ~14 GB VRAM)
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft         import PeftModel

# 1) Tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token

# 2) Base model in FP16
base = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-chat-hf",
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

# 3) Attach your LoRA adapter
model = PeftModel.from_pretrained(
    base,
    "FinGPT/fingpt-forecaster_dow30_llama2-7b_lora",
    torch_dtype=torch.float16,
    trust_remote_code=True
)

model.eval()


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/40.0M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(
  

In [16]:
from IPython.display import display, Markdown

# 2) Tokenize with truncation & padding
prompt = build_full_prompt("AAPL")
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=model.config.max_position_embeddings,
    padding="longest"
)
if torch.cuda.is_available():
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
eos_id = tokenizer.eos_token_id
with torch.no_grad():
    out_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        min_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        repetition_penalty=1.1,
        eos_token_id=eos_id,
        pad_token_id=eos_id,
        early_stopping=True,
        use_cache=True
    )

# 4) Decode & strip off everything before the model’s actual response
raw = tokenizer.decode(out_ids[0], skip_special_tokens=True)
response = raw.split("[/INST]")[-1].strip()

# 5) Show it in Markdown for readability
display(Markdown(f"**Response:**\n\n{response}"))

The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


**Response:**

[Verdict: Buy]

Weekly Rationale:

* The weekly performance of Apple has seen an increase in price by 1.34%.
* The high of 206.00 suggests bullish sentiment.
* The average volume of 62700140 shows strong investor interest, which is a positive sign.

Quarterly Rationale:

* Over the quarter, Apple saw a significant increase in price by 7.66%, indicating a strong upward trend.
* The high of 225.62 indicates continued strength in the stock, while the low of 169.21 shows some consolidation.
* The average volume of 61671595 suggests consistent investment in the stock and a growing following among institutional investors.

Yearly Rationale:

* Over the year, the performance of Apple has been excellent, with a rise of 9.00%. This strong uptrend suggests that investors have confidence in the long-term prospects of the company, leading them to buy more shares.
* The high of 260.10 shows a clear potential for further growth, particularly if Apple can continue to innovate and expand its product line.
* Despite tepid AI impact and valuation issues, the company's solid financial position, evident from EPS Avg and Revenue Avg, provides evidence of robust fundamentals and long-term growth potential. 

Therefore, considering the current performance and the overall outlook for the next quarter and year, I would recommend a "Buy" rating for Apple stocks. However, it's important to monitor the situation closely and make adjustments accordingly based on market developments. 

The recent WWDC announcement and WhatsApp's joining in the legal challenge against the U.K. government's demand may also signal potential opportunities for future growth. Therefore, there is upside potential in Apple's stock price, especially if the company continues to make strategic moves like these. 

However, as mentioned earlier, AI impact and valuation concerns still need to be watched closely, as they could potentially negatively affect the share prices if not adequately addressed. Thus, caution remains essential when dealing with volatile markets. Overall, buying at this point appears promising, but one must stay vigilant about future market developments

In [17]:
out_df = pd.DataFrame([{"ticker": ticker, "prompt": prompt, "response": response}])
out_df.to_csv("output.csv", index=False)
print("Saved prompt+response → output.csv")


NameError: name 'ticker' is not defined