In [34]:
import os
import sys
sys.path.append('../')
import logging
import numpy as np
import optuna
from joblib import dump, load

In [None]:
from util import load_pkl, get_pipline_rf, get_pipline_svr
logger = logging.getLogger('inference')
logger.setLevel(logging.DEBUG)  # Set the logging level

# Prevent adding multiple handlers if this cell is run multiple times
if not logger.handlers:
    # Create a handler that outputs to standard output
    handler = logging.StreamHandler(sys.stdout)

    # Define the log message format
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    handler.setFormatter(formatter)

    # Add the handler to the logger
    logger.addHandler(handler)

In [30]:
logger.debug("This is a debug message.")

2025-04-27 00:30:59,589 - inference - DEBUG - This is a debug message.


In [13]:
period = 128
data_dir = f'../../processed_data/processed_data_{period}'

In [14]:
df_train_X_all = load_pkl(f"{data_dir}/df_train_X_all.pkl")
df_train_y_all = load_pkl(f"{data_dir}/df_train_y_all.pkl")
df_test_X_all = load_pkl(f"{data_dir}/df_test_X_all.pkl")
df_test_y_all = load_pkl(f"{data_dir}/df_test_y_all.pkl")

In [15]:
# load the valid tickers.
with open(f'{data_dir}/valid_tickers.txt', 'r') as f:
    valid_tickers = f.readlines()

In [22]:
print(f'Data preparation finished, found {len(valid_tickers)} assets with enough data.')
feature_file_path = os.path.join(data_dir, 'sorted_features.txt')

with open(feature_file_path, 'r') as file:
  sorted_features = np.array(file.read().split('\n'))
  sorted_features = sorted_features[sorted_features != '']

Data preparation finished, found 746 assets with enough data.


In [23]:
for i in range(len(valid_tickers)):
  df_train_X_all[i] = df_train_X_all[i][sorted_features]
  df_test_X_all[i] = df_test_X_all[i][sorted_features]

In [25]:
# You can load it back into memory with the following code
# mysql_url = "mysql://root@192.168.2.34:3306/mysql"
postgres_url = "postgresql+psycopg2://postgres:example@192.168.2.34:5432/app_db"
n_columns = len(df_train_X_all[0].columns)
study_rf_name = f'study_rf_columns_{n_columns}_stocks_{len(valid_tickers)}_period_{period}'
study_rf = optuna.create_study(study_name=study_rf_name, storage=postgres_url, load_if_exists=True)
if study_rf.best_trial is None:
  logger.error('No best trial found')
  sys.exit(2)

study_svm_name = f'study_svm_columns_{n_columns}_stocks_{len(valid_tickers)}_period_{period}'
study_svm = optuna.create_study(study_name=study_svm_name, storage=postgres_url, load_if_exists=True)
if study_svm.best_trial is None:
  logger.error('No best trial found')
  sys.exit(2)

[I 2025-04-27 00:23:11,587] Using an existing study with name 'study_rf_columns_28_stocks_746_period_128' instead of creating a new one.
[I 2025-04-27 00:23:11,741] Using an existing study with name 'study_svm_columns_28_stocks_746_period_128' instead of creating a new one.


In [28]:
best_pipeline_rf = get_pipline_rf(study_rf.best_params)
best_pipeline_svr = get_pipline_svr(study_svm.best_params)

In [31]:
for filename in os.listdir(f'{data_dir}/models'):
  if filename.endswith('.pkl'):
    os.remove(os.path.join(f'{data_dir}/models', filename))

In [None]:

from joblib import dump, load

for i in range(len(valid_tickers)):
  stock_name = valid_tickers[i].strip()
  df_train_X = df_train_X_all[i]
  df_train_y = df_train_y_all[i]
  df_test_X = df_test_X_all[i]
  df_test_y = df_test_y_all[i]

  X_train = df_train_X.copy().values
  y_train = df_train_y.copy().values.ravel()
  X_test = df_test_X.copy().values
  y_test = df_test_y.copy().values.ravel()

  if not os.path.exists(f'{data_dir}/models'):
    os.makedirs(f'{data_dir}/models')


  best_pipeline_rf = get_pipline_rf(study_rf.best_params)
  best_pipeline_rf.fit(X_train, y_train)

  # save the model using joblib

  dump(best_pipeline_rf, f'{data_dir}/models/{stock_name}_rf.joblib')
  logger.info(f"Model for {stock_name} saved successfully.")

  best_pipeline_svr = get_pipline_svr(study_svm.best_params)
  best_pipeline_svr.fit(X_train, y_train)

  # save the model using joblib
  dump(best_pipeline_svr, f'{data_dir}/models/{stock_name}_svr.joblib')
  logger.info(f"Model for {stock_name} saved successfully.")





In [None]:
for i in range(len(valid_tickers)):
  stock_name = valid_tickers[i].strip()
  df_train_X = df_train_X_all[i]
  df_train_y = df_train_y_all[i]
  df_test_X = df_test_X_all[i]
  df_test_y = df_test_y_all[i]

  X_train = df_train_X.copy().values
  y_train = df_train_y.copy().values.ravel()
  X_test = df_test_X.copy().values
  y_test = df_test_y.copy().values.ravel()

  # load the model using joblib
  loaded_model_rf = load(f'{data_dir}/models/{stock_name}_rf.joblib')

  # make predictions using the loaded model
  predictions_rf = loaded_model_rf.predict(X_test)
  loaded_model_svr = load(f'{data_dir}/models/{stock_name}_svr.joblib')
  predictions_svr = loaded_model_svr.predict(X_test)
  logger.info(f"Predictions for {stock_name} made successfully.")

In [None]:
import requests
import joblib
from io import BytesIO

# Define the base URL for your Azure Blob Storage
base_url = 'https://stockmodels.blob.core.windows.net/models/'

for i in range(len(valid_tickers)):
    stock_name = valid_tickers[i].strip()
    df_train_X = df_train_X_all[i]
    df_train_y = df_train_y_all[i]
    df_test_X = df_test_X_all[i]
    df_test_y = df_test_y_all[i]

    X_train = df_train_X.copy().values
    y_train = df_train_y.copy().values.ravel()
    X_test = df_test_X.copy().values
    y_test = df_test_y.copy().values.ravel()

    # Construct the full URL for the model
    model_filename = f'{stock_name}_rf.joblib'
    model_url = f'{base_url}{model_filename}'

    try:
        # Send a GET request to download the model file
        response = requests.get(model_url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        # Load the model from the downloaded content
        model_bytes = BytesIO(response.content)
        loaded_model_rf = joblib.load(model_bytes)

        # Make predictions using the loaded model
        predictions_rf = loaded_model_rf.predict(X_test)
        logger.info(f"Predictions for {stock_name} made successfully.")
        # (Optional) You can add your evaluation metrics here
        # For example:
        # from sklearn.metrics import mean_squared_error
        # mse = mean_squared_error(y_test, predictions_rf)
        # print(f'MSE for {stock_name}: {mse}')

    except requests.exceptions.HTTPError as http_err:
        print(f'HTTP error occurred while downloading {model_filename}: {http_err}')
    except Exception as err:
        print(f'An error occurred while processing {model_filename}: {err}')

2025-04-27 11:21:06,227 - inference - INFO - Predictions for ADS.DE made successfully.
2025-04-27 11:21:06,312 - inference - INFO - Predictions for AIR.DE made successfully.
2025-04-27 11:21:06,407 - inference - INFO - Predictions for ALV.DE made successfully.
2025-04-27 11:21:06,494 - inference - INFO - Predictions for BAS.DE made successfully.
2025-04-27 11:21:06,573 - inference - INFO - Predictions for BAYN.DE made successfully.
2025-04-27 11:21:06,656 - inference - INFO - Predictions for BEI.DE made successfully.
2025-04-27 11:21:06,731 - inference - INFO - Predictions for BMW.DE made successfully.
2025-04-27 11:21:06,804 - inference - INFO - Predictions for BNR.DE made successfully.
2025-04-27 11:21:06,883 - inference - INFO - Predictions for CBK.DE made successfully.
2025-04-27 11:21:06,969 - inference - INFO - Predictions for CON.DE made successfully.
2025-04-27 11:21:07,056 - inference - INFO - Predictions for DBK.DE made successfully.
2025-04-27 11:21:07,138 - inference - INFO