In [None]:
import duckdb
import pandas as pd

# Open the DuckDB database file produced by the dbt project.
# read_only=True because this notebook only queries the data: it avoids taking
# a write lock on the file, and a wrong path fails immediately instead of
# silently creating a new, empty database.
con = duckdb.connect("../stock_dbt_project/dev.duckdb", read_only=True)

# List the tables available in the database (shown as the cell output).
con.execute("SHOW TABLES").fetchall()


In [None]:
# Pull every row of the features_stock_data table into a pandas DataFrame.
features_query = "SELECT * FROM features_stock_data"
df = con.execute(features_query).fetchdf()

In [None]:
# Parse the datetime column, then order rows by symbol and chronologically
# within each symbol.
df = (
    df
    .assign(datetime=pd.to_datetime(df['datetime']))
    .sort_values(by=['symbol', 'datetime'])
)

# First rows of the sorted frame (shown as the cell output)
df.head()


In [None]:
# (rows, columns) of the loaded and sorted frame
df.shape

In [None]:
# Number of rows available for each symbol
df['symbol'].value_counts()



In [None]:

# Count rows per symbol and keep only the symbols that have more than 300 rows.
symbol_counts = df['symbol'].value_counts()
valid_symbols = symbol_counts.loc[symbol_counts.gt(300)].index.tolist()

# Discard every row whose symbol did not pass the row-count threshold.
df = df.loc[df['symbol'].isin(valid_symbols)]

print("Symbols retained:", valid_symbols)


In [None]:
# Predictor columns selected from df to build the model inputs.
# The order here is the column order of the training matrix.
features = [
    'open',
    'high',
    'low',
    'volume',
    'rolling_avg_5',
    'rolling_std_5',
    'daily_range',
    'avg_volume_5',
    'daily_return',
    'volume_ratio',
    'rolling_volatility_10',
]

In [None]:
# Model inputs are the selected feature columns; the prediction target is 'close'.
X, y = df[features], df['close']

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Chronological hold-out: train on the earliest ~80% of rows (by timestamp)
# and evaluate on the most recent ~20%.
# A random shuffle would scatter neighbouring timestamps of the same symbol
# across both sets, letting the model train on rows that come *after* the ones
# it is tested on and making the test error look better than it really is.
split_position = max(int(len(df) * 0.8) - 1, 0)
split_time = df['datetime'].sort_values().iloc[split_position]
is_train = df['datetime'] <= split_time

# X and y were sliced from df, so they share its index and the mask aligns.
X_train, X_test = X.loc[is_train], X.loc[~is_train]
y_train, y_test = y.loc[is_train], y.loc[~is_train]

In [None]:
from dotenv import load_dotenv
import os



In [None]:
# Load variables from a local .env file (if one exists) into the process
# environment. load_dotenv was imported but never called, so a value defined
# only in .env was not loaded and os.getenv returned None for it.
load_dotenv()

# MLflow tracking server address; None when MLFLOW_TRACKING_URI is not set.
TRACKING_SERVER_HOST = os.getenv('MLFLOW_TRACKING_URI')
print(TRACKING_SERVER_HOST)

In [None]:
import mlflow
# Only point MLflow at the configured server when a value was actually found.
# Formatting None into an f-string yields the literal string "None", which
# would then be set as the tracking URI.
if TRACKING_SERVER_HOST:
    mlflow.set_tracking_uri(TRACKING_SERVER_HOST)
else:
    print("MLFLOW_TRACKING_URI is not set; keeping MLflow's current tracking URI.")


In [None]:
# Show the tracking URI MLflow is currently using, as a sanity check
print(mlflow.get_tracking_uri())

In [None]:
import boto3

# Create a boto3 session pinned to the us-east-1 region.
session = boto3.Session(region_name='us-east-1')

# Resolve credentials through the session. get_credentials() returns None when
# nothing is configured, so fail with a readable message instead of an
# AttributeError raised on None.
resolved_credentials = session.get_credentials()
if resolved_credentials is None:
    raise RuntimeError(
        "No AWS credentials found. Configure them (environment variables, "
        "~/.aws/credentials, or an IAM role) before running this notebook."
    )
credentials = resolved_credentials.get_frozen_credentials()

# Region the session is configured for.
region = session.region_name

# Credentials are deliberately never printed: cell outputs are saved with the
# notebook and would leak the secret key.

s3_client = session.client('s3')

In [None]:
# Make "Stock_Price_Prediction" the active experiment for subsequent runs.
mlflow.set_experiment(experiment_name="Stock_Price_Prediction")

In [None]:
import xgboost as xgb
import mlflow
import mlflow.xgboost

In [None]:
from mlflow.models.signature import infer_signature

In [None]:
# Hyperparameters live in one place so the values used to build the model and
# the values logged to MLflow cannot drift apart.
xgb_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 6,
}

with mlflow.start_run(run_name="xgboost_stock_model"):
    model = xgb.XGBRegressor(
        **xgb_params,
        random_state=42,
        tree_method="hist",
    )

    model.fit(X_train, y_train)

    # Predict on both splits: test predictions for evaluation, train
    # predictions to infer the model's output schema for the signature.
    preds = model.predict(X_test)
    train_preds = model.predict(X_train)

    # mean_squared_error returns the MSE; take the square root so the value
    # really is the RMSE it is printed and logged as.
    rmse = mean_squared_error(y_test, preds) ** 0.5
    signature = infer_signature(X_train, train_preds)
    print(f"Test RMSE: {rmse:.4f}")

    # Log parameters
    for param_name, param_value in xgb_params.items():
        mlflow.log_param(param_name, param_value)

    # Log metric
    mlflow.log_metric("rmse", rmse)

    # Log the fitted model and register it under "StockPricePredictor"
    mlflow.xgboost.log_model(
        model,
        artifact_path="models/xgboost_stock_model",
        signature=signature,
        input_example=X_train.iloc[:5],
        registered_model_name="StockPricePredictor",
    )

print("✅ Model training + MLflow logging complete!")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Isolate the AAPL rows in chronological order
aapl_df = df.loc[df['symbol'].eq('AAPL')].sort_values(by='datetime')

# Draw the close price as a line on a 12x5 figure
plt.figure(figsize=(12, 5))
ax = sns.lineplot(data=aapl_df, x='datetime', y='close')
ax.set(title="AAPL Closing Price Over Time", xlabel="Date", ylabel="Close Price")

# Slant the x tick labels so they do not overlap
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Registry URI pointing at version 1 of the "StockPricePredictor" model.
# NOTE(review): the version is pinned to 1, so re-running the training cell
# (which registers a new version) will not change which model is loaded here.
model_uri = "models:/StockPricePredictor/1"

# Load it through the generic pyfunc interface; this rebinds `model`.
model = mlflow.pyfunc.load_model(model_uri=model_uri)

In [None]:
import pandas as pd

# One hand-written row to score with the loaded model.
sample_row = {
    "open": 170.0,
    "high": 172.0,
    "low": 168.5,
    "volume": 15000000,
    "rolling_avg_5": 171.2,
    "rolling_std_5": 1.3,
    "daily_range": 3.5,
    "avg_volume_5": 14500000.0,
    "daily_return": 0.5,
    "volume_ratio": 1.05,
    "rolling_volatility_10": 1.2,
}

# Sector indicator columns: all 0 except Technology (assume AAPL).
# NOTE(review): these sector_* columns are not in the `features` list used for
# training in this notebook; confirm the loaded model version expects them.
sector_flags = dict.fromkeys(
    [
        "sector_Consumer_Discretionary",
        "sector_Consumer_Staples",
        "sector_Energy",
        "sector_Financials",
        "sector_Healthcare",
        "sector_Industrials",
        "sector_Materials",
        "sector_Communication_Services",
        "sector_Technology",
        "sector_Unknown",
    ],
    0,
)
sector_flags["sector_Technology"] = 1

sample_input = pd.DataFrame([{**sample_row, **sector_flags}])

# Score the single-row frame with the loaded model and print the first
# element of the result.
prediction = model.predict(sample_input)
print("✅ Prediction:", prediction[0])