In [8]:
%pip install adlfs pandas joblib requests scikit-learn

In [1]:
# 1) Imports & Spark init
import os
from io import BytesIO

import joblib
import numpy as np
import pandas as pd
import requests
from azure.storage.blob import BlobServiceClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, to_date, year, lag, stddev
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()

# ─────────────────────────────────────────────────────────────────────────────
# 2) Config
# ─────────────────────────────────────────────────────────────────────────────
# Credentials are read from the environment (or inject them from your platform's
# secret store) so they never live in the notebook or its version history.
# SECURITY: a storage account key used to be hardcoded in this cell. Treat that
# key as compromised and rotate it in the Azure portal.
#
# The account name is used verbatim inside host names below, so it must not be
# wrapped in "<...>" placeholder brackets.
# NOTE(review): default assumes the real account is "ds562team9datalake" — confirm.
STORAGE_ACCOUNT = os.environ.get("AZURE_STORAGE_ACCOUNT", "ds562team9datalake")
STORAGE_KEY = os.environ.get("AZURE_STORAGE_KEY")
if not STORAGE_KEY:
    raise RuntimeError(
        "AZURE_STORAGE_KEY is not set; export the storage account key before running this notebook"
    )

# Power BI Push URL for your streaming dataset. The URL embeds a push key, so it
# is also taken from the environment; the placeholder default keeps the name
# defined until a real URL is supplied.
PBI_PUSH_URL = os.environ.get(
    "PBI_PUSH_URL",
    "https://api.powerbi.com/beta/myorg/datasets/<DATASET_ID>/rows?key=<YOUR_PUSH_KEY>",
)

# Register the account key with Spark so abfss:// paths on this account resolve.
spark.conf.set(
    f"fs.azure.account.key.{STORAGE_ACCOUNT}.dfs.core.windows.net",
    STORAGE_KEY,
)

# Paths to your cleaned Silver tables
PRICE_PATH = f"abfss://silver@{STORAGE_ACCOUNT}.dfs.core.windows.net/historical_crypto_clean/"
TWEET_PATH = f"abfss://silver@{STORAGE_ACCOUNT}.dfs.core.windows.net/twitter_cleaned/"
NEWS_PATH  = f"abfss://silver@{STORAGE_ACCOUNT}.dfs.core.windows.net/news_cleaned/"

# Model path in Gold
MODEL_PATH = f"abfss://gold@{STORAGE_ACCOUNT}.dfs.core.windows.net/model/best_hgb_model.pkl"

# ─────────────────────────────────────────────────────────────────────────────
# 3) Load cleaned tables
# ─────────────────────────────────────────────────────────────────────────────
def _read_silver_table(path):
    """Read a Silver parquet dataset, cast `date` to a date and derive `year` from it."""
    table = spark.read.parquet(path)
    table = table.withColumn("date", to_date(col("date")))
    return table.withColumn("year", year(col("date")))


# All three Silver tables get the same date/year normalisation.
df_price = _read_silver_table(PRICE_PATH)
df_tweet = _read_silver_table(TWEET_PATH)
df_news = _read_silver_table(NEWS_PATH)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 5) Join everything, fill gaps
# ─────────────────────────────────────────────────────────────────────────────
# NOTE(review): `tweet_agg` and `news_agg` are not defined in any visible cell
# (section 4 is missing from this notebook). They are presumably per-date
# aggregates of df_tweet / df_news exposing the columns listed below — restore
# that cell so the notebook survives Restart & Run All.

# Only the sentiment/count columns are gap-filled: a left join leaves them null
# on days with no tweets or news, and 0 is a meaningful value there. A blanket
# fillna(0) would also turn any null price into 0 and corrupt the return and
# moving-average features computed later.
GAP_FILL_COLS = ["tweet_sentiment", "news_sentiment", "tweet_count", "news_count"]

df = (
    df_price
    .join(tweet_agg, on="date", how="left")
    .join(news_agg, on="date", how="left")
    .fillna(0, subset=GAP_FILL_COLS)
    .orderBy("date")
)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 6) Feature engineering (must mirror training)
# ─────────────────────────────────────────────────────────────────────────────
# 6.1 1-day return
# NOTE: this window has no partitionBy, so Spark moves every row into a single
# partition to evaluate it. That is acceptable for one daily price series; add
# a partition key if the table ever holds more than one asset.
w = Window.orderBy("date")
prev_close = lag("close_price", 1).over(w)
df = df.withColumn("return_1d", (col("close_price") - prev_close) / prev_close)

# 6.2 rolling sentiment & volatility
# The frame covers the current row plus the (wsize - 1) preceding rows; the
# earliest rows therefore aggregate over fewer than wsize observations.
for wsize in (3, 7):
    w_roll = w.rowsBetween(-wsize + 1, 0)
    df = (
        df
        .withColumn(f"tweet_sent_roll{wsize}", avg("tweet_sentiment").over(w_roll))
        .withColumn(f"news_sent_roll{wsize}", avg("news_sentiment").over(w_roll))
        # Volatility is the rolling standard deviation of daily returns. The
        # previous avg() stand-in produced a different feature under the same
        # name, i.e. train/serve skew.
        # NOTE(review): stddev() is the sample standard deviation — confirm the
        # training pipeline used the same definition.
        .withColumn(f"volatility_{wsize}d", stddev("return_1d").over(w_roll))
    )

# 6.3 simple moving averages
for wsize in (5, 10):
    df = df.withColumn(f"sma_{wsize}", avg("close_price").over(w.rowsBetween(-wsize + 1, 0)))

# 6.4 14-day RSI is not computed here and is not part of the feature list used
# below; it could be derived in pandas after conversion if the model needs it.

# ─────────────────────────────────────────────────────────────────────────────
# 7) Convert to pandas and drop rows with nulls
# ─────────────────────────────────────────────────────────────────────────────
# Columns handed to the model, in this order.
# NOTE(review): the order presumably has to match the one used at training time — confirm.
feature_cols = [
    "tweet_sentiment", "news_sentiment", "tweet_count", "news_count",
    "tweet_sent_roll3", "tweet_sent_roll7", "news_sent_roll3", "news_sent_roll7",
    "sma_5", "sma_10", "volatility_3d",
]

# Drop incomplete rows in Spark *before* collecting so only usable rows are
# shipped to the driver, and sort explicitly so the pandas frame is in
# chronological order regardless of how Spark partitioned the data.
pdf = (
    df.select(["date"] + feature_cols)
      .dropna(subset=feature_cols)
      .orderBy("date")
      .toPandas()
)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 8) Load your model from ADLS
# ─────────────────────────────────────────────────────────────────────────────
# Download the pickled model from the "gold" container into memory and load it.
# The connection string embeds the account key, so never print or log CONN_STR.
CONN_STR = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={STORAGE_ACCOUNT};"
    f"AccountKey={STORAGE_KEY};"
    "EndpointSuffix=core.windows.net"
)
bsc = BlobServiceClient.from_connection_string(CONN_STR)
blob = bsc.get_blob_client(container="gold", blob="model/best_hgb_model.pkl")

# SECURITY: joblib.load unpickles the payload, which can execute arbitrary code.
# Only load artifacts written by your own training pipeline into a container
# with restricted write access.
with BytesIO(blob.download_blob().readall()) as model_buffer:
    model = joblib.load(model_buffer)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 9) Predict & map to signals
# ─────────────────────────────────────────────────────────────────────────────
# Predictions at or above +SIGNAL_THRESHOLD map to "buy", at or below
# -SIGNAL_THRESHOLD to "sell", and everything in between (including NaN) to "hold".
SIGNAL_THRESHOLD = 0.001

# Fail with an actionable message instead of an opaque estimator error when the
# feature frame has no complete rows to score.
if pdf.empty:
    raise ValueError("No complete feature rows to score; check the upstream joins and rolling windows")

preds = model.predict(pdf[feature_cols])

# Vectorised mapping; the first matching condition wins.
_pred_values = np.asarray(preds)
pdf["signal"] = np.select(
    [_pred_values >= SIGNAL_THRESHOLD, _pred_values <= -SIGNAL_THRESHOLD],
    ["buy", "sell"],
    default="hold",
)

In [None]:

# ─────────────────────────────────────────────────────────────────────────────
# 10) Push to Power BI
# ─────────────────────────────────────────────────────────────────────────────
# Power BI rejects POST-rows requests above 10,000 rows, so the history is sent
# in batches. The timeout keeps a stalled connection from hanging the notebook.
PBI_MAX_ROWS_PER_POST = 10_000
PBI_TIMEOUT_SECONDS = 30

# Fail before any network call if the push URL was never filled in.
if not PBI_PUSH_URL or "<" in PBI_PUSH_URL:
    raise ValueError("PBI_PUSH_URL still contains placeholder values; set the real push URL first")

# One row per scored date; zip over the two columns avoids the per-row Series
# construction that iterrows() performs.
payload = {
    "rows": [
        {"dateTime": day.isoformat(), "signal": signal}
        for day, signal in zip(pdf["date"], pdf["signal"])
    ]
}

resp = None  # stays None when there is nothing to push
for start in range(0, len(payload["rows"]), PBI_MAX_ROWS_PER_POST):
    batch = payload["rows"][start:start + PBI_MAX_ROWS_PER_POST]
    resp = requests.post(
        PBI_PUSH_URL,
        json={"rows": batch},
        headers={"Content-Type":"application/json"},
        timeout=PBI_TIMEOUT_SECONDS,
    )
    # Stop at the first rejected batch so a partial push is noticed immediately.
    resp.raise_for_status()
print("Pushed", len(payload["rows"]), "signals to Power BI")