In [1]:
# !pip install yfinance


import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
import numpy as np

import sklearn

import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials, space_eval
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
import optuna
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestRegressor

import pickle
# from mlflow import start_run, log_metric, log_param


In [5]:
pd.read_parquet('data/validation.parquet').head()

Unnamed: 0,SMA,EMA,MACD,RSI,UpperBB,LowerBB,Close
2023-07-01,1810.615143,1847.829035,13.199699,177.482886,1990.7629,1630.467385,1924.565918
2023-07-02,1820.360626,1856.363256,13.608082,167.460333,2006.004111,1634.717141,1937.438354
2023-07-03,1831.178204,1865.794294,14.15744,167.191118,2022.01046,1640.345949,1955.38916
2023-07-04,1845.483917,1872.54089,12.391575,214.2112,2021.624189,1669.343645,1936.633545
2023-07-05,1857.737329,1876.164425,8.773673,1075.936062,2014.157248,1701.317411,1910.588013


In [256]:
import pydantic

In [257]:
print(f'yfinance: {yf.__version__}')
print(f'pandas: {pd.__version__}')
print(f'numpy: {np.__version__}')
print(f'mlflow: {mlflow.__version__}')
print(f'optuna: {optuna.__version__}')
print(f'sklearn: {sklearn.__version__}')
print(f'prefect: {prefect.__version__}')
print(f'pydantic: {pydantic.__version__}')
# import sys
# print(sys.version_info)
# print(sys.version)

yfinance: 0.2.26
pandas: 1.5.3
numpy: 1.21.6
mlflow: 2.3.1
optuna: 3.1.1
sklearn: 1.2.2
prefect: 2.10.8
pydantic: 1.10.11


In [252]:
model_name = "rf-best-model-eth-prediction"
model_version = 1

client = mlflow.tracking.MlflowClient()
model_version_details = client.get_model_version(
    name=model_name,
    version=model_version
)

run_id = model_version_details.run_id
model_uri = f"runs:/{run_id}/model"

print(f"Model URI: {model_uri}")

Model URI: runs:/a13f37b4786a4ee886bbf494809f354c/model


### 1. Functions to load and save data

In [213]:
def load_data(symbol, start_date):
    """Load data from Yahoo Finance"""
    end_date = datetime.now() + timedelta(days=14)
    df = yf.download(symbol, start=start_date, end=end_date.strftime('%Y-%m-%d'))

    # Fill future dates with NaN
    future_dates = pd.date_range(start=df.index[-1]+timedelta(days=1), end=end_date)
    future_df = pd.DataFrame(index=future_dates)
    df = pd.concat([df, future_df], axis=0)

    return df

# def load_actual_future_data(symbol, start_date):
#     """Load data from Yahoo Finance from a specific date"""
#     today = datetime.today().strftime('%Y-%m-%d')  # Get the current date
#     df = yf.download(symbol, start=start_date, end=today)
#     return df


def split_data(df, validation_start_date, test_start_date, y_col_name):
    """Split data into train, validation and test sets."""
    train = df[df.index < validation_start_date]
    validation = df[(df.index >= validation_start_date) & (df.index < test_start_date)]
    test = df[df.index >= test_start_date]
    #test.drop(y_col_name, axis =1, inplace = True)
    return train, validation, test

def save_data(train, validation, test):
    """ Save datasets to parquet files """
    
    train.to_parquet('data/train.parquet')
    validation.to_parquet('data/validation.parquet')
    test.to_parquet('data/test.parquet')
    

def read_data(column_names):
    """Read train, validation and test data from parquet files."""
    train = pd.read_parquet('data/train.parquet')
    validation = pd.read_parquet('data/validation.parquet')
    test = pd.read_parquet('data/test.parquet')

    return train[column_names], validation[column_names], test[column_names]

In [15]:
SYMBOL = 'ETH-USD'
validation_start_date = '2023-07-01'
test_start_date = '2023-07-24'

asset_df = load_data(SYMBOL, '2022-07-01')
train_df, val_df, test_df = split_data(asset_df, validation_start_date, test_start_date)
save_data(train_df, val_df, test_df)
train_df, val_df, test_df = read_data()

[*********************100%***********************]  1 of 1 completed


In [10]:
val_df

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
2023-07-01,1933.323853,1942.701538,1910.848633,1924.565918,1924.565918,5136810000.0
2023-07-02,1924.44812,1958.160767,1895.906982,1937.438354,1937.438354,6343966000.0
2023-07-03,1937.883789,1974.775024,1934.688843,1955.38916,1955.38916,7858509000.0
2023-07-04,1955.52417,1966.365356,1932.611328,1936.633545,1936.633545,5683424000.0
2023-07-05,1936.796753,1942.432495,1897.124756,1910.588013,1910.588013,6034088000.0
2023-07-06,1910.417114,1956.012329,1847.850708,1848.636475,1848.636475,8905008000.0
2023-07-07,1847.512573,1876.963257,1832.025391,1870.602539,1870.602539,6468885000.0
2023-07-08,1871.002075,1872.501587,1844.641724,1865.539551,1865.539551,4299008000.0
2023-07-09,1865.594971,1878.668945,1857.748291,1863.009766,1863.009766,4392864000.0
2023-07-10,1863.240234,1905.460815,1848.777222,1880.556396,1880.556396,6336468000.0


# 2. Functions to prepare data

In [52]:
def fill_future_dates(df, column):
    """Fill future dates with a rolling average of the last known 30 days"""
    last_known_index = df[column].last_valid_index()
    for i in range(1, df.loc[last_known_index:].shape[0]):
        last_30_days_avg = df[column].iloc[-(30+i):-i].mean()
        df[column].iloc[-i] = last_30_days_avg
    return df


def add_EMA(df, column, span=20):
    """Add Exponential Moving Average (EMA)"""
    df['EMA'] = df[column].ewm(span=span).mean()
    return df


def add_MACD(df, column):
    """Add Moving Average Convergence Divergence (MACD)"""
    exp12 = df[column].ewm(span=12, adjust=False).mean()
    exp26 = df[column].ewm(span=26, adjust=False).mean()
    macd = exp12 - exp26
    signal = macd.ewm(span=9, adjust=False).mean()
    df['MACD'] = macd - signal
    return df


def add_SMA(df, column, window=20):
    """Add Simple Moving Average (SMA)"""
    df['SMA'] = df[column].rolling(window=window).mean()
    df = fill_future_dates(df, 'SMA')
    return df

def add_RSI(df, column, window=14):
    """Add Relative Strength Index (RSI)"""
    change = df[column].diff()
    gain = change.mask(change < 0, 0)
    loss = change.mask(change > 0, 0)
    average_gain = gain.rolling(window=14).mean()
    average_loss = loss.rolling(window=14).mean()
    rs = average_gain / average_loss
    df['RSI'] = 100 - (100 / (1 + rs))
    df = fill_future_dates(df, 'RSI')
    return df

def add_BollingerBands(df, column, window=20):
    """Add Bollinger Bands (upperBB, lowerBB)"""
    df['UpperBB'] = df[column].rolling(window=window).mean() + df[column].rolling(window=window).std() * 2
    df['LowerBB'] = df[column].rolling(window=window).mean() - df[column].rolling(window=window).std() * 2
    for band in ['UpperBB', 'LowerBB']:
        df = fill_future_dates(df, band)
    return df


#Main function to preprocess the data
def create_features(df, column):
    df = add_SMA(df, column)
    df = add_EMA(df, column)
    df = add_MACD(df, column)
    df = add_RSI(df, column)
    df = add_BollingerBands(df, column)
    return df[['SMA', 'EMA', 'MACD', 'RSI', 'UpperBB', 'LowerBB', 'Close']]



    
    

In [63]:
DEPENDENT_VARIABLE_NAME = 'Close'
SYMBOL = 'ETH-USD'
VALIDATION_START_DATE = '2023-07-01'
TEST_START_DATE = '2023-07-25'
COL_NAMES = ['SMA', 'EMA', 'MACD', 'RSI', 'UpperBB', 'LowerBB', 'Close']


asset_df = load_data(SYMBOL, '2022-07-01')
ta_df = create_features(asset_df, DEPENDENT_VARIABLE_NAME)
train_df, val_df, test_df = split_data(asset_df, VALIDATION_START_DATE, TEST_START_DATE, DEPENDENT_VARIABLE_NAME)
save_data(train_df, val_df, test_df)
train_df, val_df, test_df = read_data(COL_NAMES)


[*********************100%***********************]  1 of 1 completed


In [76]:
test_df.head(30)

Unnamed: 0,SMA,EMA,MACD,RSI,UpperBB,LowerBB,Close
2023-07-25,1860.138418,1888.837941,-7.793502,-741.94108,1979.921599,1740.355237,
2023-07-26,1862.650512,1888.837941,-6.234802,-774.999357,1981.286787,1744.014237,
2023-07-27,1865.183689,1888.837941,-4.987842,-809.755165,1982.308044,1748.059334,
2023-07-28,1867.95092,1888.837941,-3.990273,-848.98063,1983.550357,1752.351482,
2023-07-29,1870.820808,1888.837941,-3.192219,-889.847288,1984.578592,1757.063023,
2023-07-30,1873.956948,1888.837941,-2.553775,-932.69386,1985.767674,1762.146222,
2023-07-31,1877.329429,1888.837941,-2.04302,-978.943017,1986.996248,1767.66261,
2023-08-01,1880.602117,1888.837941,-1.634416,-1028.751166,1987.493857,1773.710376,
2023-08-02,1883.783343,1888.837941,-1.307533,-1083.579987,1987.345265,1780.221421,
2023-08-03,1886.803472,1888.837941,-1.046026,-1143.153335,1986.456748,1787.150197,


## 3. Training the model 

In [110]:

def split_xy(train, val, test):
    # convert to dict records
    train = train.dropna()
    X_train = train.drop('Close', axis=1)
    y_train = train['Close']

    X_val = val.drop('Close', axis=1)
    y_val = val['Close']
    
    X_test = test.drop('Close', axis=1)

    return X_train, y_train, X_val, y_val, X_test


In [118]:
TRACKING_URI = "http://127.0.0.1:5000"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment("random-forest-train")

def run_train(train, val, test):
    
    X_train, y_train, X_val, y_val, X_test = split_xy(train, val, test)
    
    mlflow.sklearn.autolog()
#     X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
#     X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    with mlflow.start_run():

        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
        
        return y_pred

train_df, val_df, test_df = read_data(COL_NAMES)
run_train(train_df, val_df, test_df)

array([1904.55849475, 1927.69925613, 1934.30352432, 1920.77654061,
       1914.57763446, 1854.6440021 , 1838.27801903, 1825.0013493 ,
       1825.61329157, 1851.30508625, 1834.78914378, 1850.35960033,
       1908.77229894, 1895.15378091, 1892.02902375, 1879.96942692,
       1876.18108963, 1852.73852838, 1854.23995874, 1857.89502477,
       1856.43396808, 1836.42067187, 1832.39161195])

# 4. Optimizing hyperparameters

In [184]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [185]:
# MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
# client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
# mlflow.set_tracking_uri(TRACKING_URI)

HPO_EXPERIMENT_NAME = "mlops_zoomcamp_eth_prediction_hpo"
mlflow.set_experiment(HPO_EXPERIMENT_NAME)


def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 50, 1),
        'max_depth': trial.suggest_int('max_depth', 1, 20, 1),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, 1),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4, 1),
        'random_state': 42,
        'n_jobs': -1
    }
    with mlflow.start_run():
        mlflow.log_params(params)
        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
        mlflow.sklearn.log_model(rf, "model")

    return rmse


def run_optimization(train, val, test, num_trials):    
    X_train, y_train, X_val, y_val, X_test = split_xy(train, val, test)
    mlflow.sklearn.autolog(disable=True)

    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=num_trials)
    

run_optimization(train_df, val_df, test_df, 20)


[32m[I 2023-07-26 23:56:15,711][0m A new study created in memory with name: no-name-72275950-cdcd-45c5-b19e-2d50f6ffb95f[0m
[32m[I 2023-07-26 23:56:19,401][0m Trial 0 finished with value: 40.42341730891297 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 40.42341730891297.[0m
[32m[I 2023-07-26 23:56:21,205][0m Trial 1 finished with value: 54.131999914545545 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 40.42341730891297.[0m
[32m[I 2023-07-26 23:56:23,149][0m Trial 2 finished with value: 33.84433348546697 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 33.84433348546697.[0m
[32m[I 2023-07-26 23:56:25,103][0m Trial 3 finished with value: 43.15768633352581 and parameters: {'n_estimators': 44, 'max_depth': 5, 'min_samples_split': 3, '

# 5. Model Registry

In [188]:
EXPERIMENT_NAME = "mlops_zoomcamp_eth_prediction"
mlflow.set_experiment(EXPERIMENT_NAME)
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state', 'n_jobs']
BEST_MODEL_NAME = "rf-best-model-eth-prediction"
STAGE = 'PRODUCTION'

mlflow.sklearn.autolog()



In [3]:
model.predict(X_train)

NameError: name 'model' is not defined

In [238]:

def train_and_log_model(X_train, y_train, X_val, y_val, X_test, params):
    
    with mlflow.start_run():
        for param in RF_PARAMS:
            params[param] = int(params[param])

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        # log the model
#         mlflow.sklearn.log_model(rf, "model")

        # evaluate model on the validation and test sets
        valid_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
        mlflow.log_metric("valid_rmse", valid_rmse)
#         test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
#         mlflow.log_metric("test_rmse", test_rmse)
        



def register_model(train, val, test, top_n):

#     MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
#     client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
    
    X_train, y_train, X_val, y_val, X_test = split_xy(train, val, test)
    
    data_dict = {
    "X_train": X_train,
    "y_train": y_train,
    "X_val": X_val,
    "y_val": y_val,
    "X_test": X_test,
    }

    # Iterate over the dictionary and save each item
    for name, data in data_dict.items():
        with open(f'data/{name}.pkl', 'wb') as f:
            pickle.dump(data, f)

    # Retrieve the top_n model runs and log the models
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )
    for run in runs:
        train_and_log_model(X_train, y_train, X_val, y_val, X_test, params=run.data.params)

    # Select the model with the lowest test RMSE
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    best_run = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.test_rmse ASC"]
    )[0]

    # Register the best model
    run_id = best_run.info.run_id
    model_uri = f"runs:/{run_id}/model"
    mlflow.register_model(model_uri, name=BEST_MODEL_NAME)
    model_version = 1
    client.transition_model_version_stage(
        name=BEST_MODEL_NAME,
        version=model_version,
        stage=STAGE,
        archive_existing_versions=False
    )
    return model_uri

def main():
#     train_df, val_df, test_df = read_data(COL_NAMES)
#     model_uri = register_model(train_df, val_df, test_df, 20)
    print(f"model_uri: {model_uri}")
#     model = mlflow.pyfunc.load_model(f"models:/{BEST_MODEL_NAME}/{STAGE}")
#     with open(f'data/X_test.pkl', 'rb') as f:
#         X_test = pickle.load(f)
#     y_pred = model.predict(X_test)
    

    try:
        # Try to load model
        model = mlflow.sklearn.load_model(model_uri= model_uri)
        print("Model successfully loaded.")
        
        
        # Save the model to a pickle file
#         with open(f'model/{BEST_MODEL_NAME}.pkl', 'wb') as f:
#             pickle.dump(model, f)
        
        print('Open X test')
        with open(f'data/X_test.pkl', 'rb') as f:
            X_test = pickle.load(f)
        
        future_actual_df = load_data(SYMBOL, TEST_START_DATE).dropna()#load_actual_future_data(SYMBOL, TEST_START_DATE)
        X_test = X_test[X_test .index.isin(future_actual_df.index)]
        #print(X_test.head(100))    
        print('predict with model')    
        y_pred = model.predict(X_test)
        print('load actual data')
        y_test = future_actual_df['Close']
        
        test_rmse = mean_squared_error(y_test, y_pred, squared=False)
        print(f"test_rmse from {TEST_START_DATE}: {test_rmse}")
        
            
    except Exception as e:
        print("Error while loading and saving the model:", str(e))
    
main()

    


#     loaded_model = mlflow.pyfunc.load_model(model_uri)

model_uri: runs:/3c22624c31fc429783fbdaaa5524e697/model
Model successfully loaded.
Open X test
[*********************100%***********************]  1 of 1 completed
predict with model
load actual data
test_rmse from 2023-07-25: 37.43493353472604


In [240]:
mlflow.pyfunc.load_model(f"models:/{BEST_MODEL_NAME}/{STAGE}").predict(X_test)

array([1821.57663371, 1826.37770287, 1835.26074518, 1835.26074518,
       1835.77904801, 1837.00488445, 1843.18858998, 1844.82237782,
       1844.82237782, 1844.82237782, 1843.59654137, 1865.95565699,
       1867.79899839, 1867.79899839, 1867.79899839])

In [228]:
with open(f'data/X_test.pkl', 'rb') as f:
    X_test = pickle.load(f)

In [229]:
X_test

Unnamed: 0,SMA,EMA,MACD,RSI,UpperBB,LowerBB
2023-07-25,1860.138418,1888.837941,-7.793502,-741.94108,1979.921599,1740.355237
2023-07-26,1862.650512,1888.837941,-6.234802,-774.999357,1981.286787,1744.014237
2023-07-27,1865.183689,1888.837941,-4.987842,-809.755165,1982.308044,1748.059334
2023-07-28,1867.95092,1888.837941,-3.990273,-848.98063,1983.550357,1752.351482
2023-07-29,1870.820808,1888.837941,-3.192219,-889.847288,1984.578592,1757.063023
2023-07-30,1873.956948,1888.837941,-2.553775,-932.69386,1985.767674,1762.146222
2023-07-31,1877.329429,1888.837941,-2.04302,-978.943017,1986.996248,1767.66261
2023-08-01,1880.602117,1888.837941,-1.634416,-1028.751166,1987.493857,1773.710376
2023-08-02,1883.783343,1888.837941,-1.307533,-1083.579987,1987.345265,1780.221421
2023-08-03,1886.803472,1888.837941,-1.046026,-1143.153335,1986.456748,1787.150197


In [237]:
load_data(SYMBOL, TEST_START_DATE)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
2023-07-27,1872.041992,1875.291748,1868.075684,1873.165039,1873.165039,5808109000.0
2023-07-28,,,,,,
2023-07-29,,,,,,
2023-07-30,,,,,,
2023-07-31,,,,,,
2023-08-01,,,,,,
2023-08-02,,,,,,
2023-08-03,,,,,,
2023-08-04,,,,,,
2023-08-05,,,,,,


In [211]:
X_test.join(eth_data[['Close']], how='inner')

Unnamed: 0,SMA,EMA,MACD,RSI,UpperBB,LowerBB,Close
2023-07-26,1862.650512,1888.837941,-6.234802,-774.999357,1981.286787,1744.014237,1857.763428


In [196]:
# from mlflow.tracking import MlflowClient


# MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
# client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# client.get_experiment(1)

runs = client.search_runs(
    experiment_ids='2',
    run_view_type=ViewType.ACTIVE_ONLY,
    order_by=["metrics.rmse ASC"]
)

for run in runs:
    print(f"run id: {run.info.run_id}")

run id: 2c036be1b5154428ba6f53acff02a819
run id: bacca0f405a8416cb22cc6405b02a166
run id: d6366d42d7ea4826baa96d57dfa36d59
run id: 65f9af5d9eb049389a728d9d7d4a2134
run id: 37327d31605649e48941c104af02d6c1
run id: 3061c26785b44540895c095fc05c98b3
run id: 7ae92e8b8b444834a70333dda11a50df
run id: 905555eb2e674c27a0095183233bfeaf
run id: 10eccddc482c40608e1dca7bb473677f
run id: c86d911c8e7545fab6577f8b869d9b12
run id: f7f37c3bcf7744569458cc04df90c70d
run id: 586c8e8e6e144af6a1fb2b0396789e0e
run id: cbcc6b7e5844471db0d0e55f9dd487e0
run id: fa5466112eb3455cb2e7d2e3491b28db
run id: cf5d644bb50a4a23afefad5e9620d115
run id: 770cc333b2874093a44f04da7089881f
run id: c4b8955c200d4d68bea17c5bc8708a75
run id: 83cd95163f904bd5ba9aa7af6e089784
run id: b41365b858824c81b7006d92fade8b27
run id: 0737679233734c8582c29287d243fc99
run id: db9a0ad5cc08455fb4d43fa9dcb18521
run id: 0ae4549a0e4546c79a1d30c0da44c506
run id: f57e9601a9bf449bbbeb6b03db218420
run id: 469d19363a004e419f2a66a4a2de18b0
run id: aee9d905

In [161]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

run_id = "3c22624c31fc429783fbdaaa5524e697"
model_uri = f"runs:/{run_id}/model"
#mlflow.register_model(model_uri=model_uri, name="rf_debug")

Successfully registered model 'rf_debug'.
2023/07/26 23:34:38 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: rf_debug, version 1
Created version '1' of model 'rf_debug'.


<ModelVersion: aliases=[], creation_timestamp=1690385678500, current_stage='None', description=None, last_updated_timestamp=1690385678500, name='rf_debug', run_id='6776a53920224fd199fe43f4ad8d327d', run_link=None, source='/Users/LouisReinaldo/Documents/mlops-zoomcamp-1/07-project/mlruns/1/6776a53920224fd199fe43f4ad8d327d/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [164]:
model_version = 1
new_stage = "Staging"
client.transition_model_version_stage(
    name="rf_debug",
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

<ModelVersion: aliases=[], creation_timestamp=1690385678500, current_stage='Staging', description=None, last_updated_timestamp=1690385753290, name='rf_debug', run_id='6776a53920224fd199fe43f4ad8d327d', run_link=None, source='/Users/LouisReinaldo/Documents/mlops-zoomcamp-1/07-project/mlruns/1/6776a53920224fd199fe43f4ad8d327d/artifacts/model', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [223]:
run_id = "3c22624c31fc429783fbdaaa5524e697"
model_uri = f"runs:/{run_id}/model"
model = mlflow.pyfunc.load_model(model_uri=model_uri)

In [226]:
model.predict(X_test)

array([1821.57663371, 1826.37770287, 1835.26074518, 1835.26074518,
       1835.77904801, 1837.00488445, 1843.18858998, 1844.82237782,
       1844.82237782, 1844.82237782, 1843.59654137, 1865.95565699,
       1867.79899839, 1867.79899839, 1867.79899839])

In [258]:
X_test.columns

Index(['SMA', 'EMA', 'MACD', 'RSI', 'UpperBB', 'LowerBB'], dtype='object')

In [259]:
future_actual_df = load_data(SYMBOL, TEST_START_DATE).dropna()

[*********************100%***********************]  1 of 1 completed


In [None]:
# !pip install evidently 
from evidently.report import Report
from evidently import ColumnMapping
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric, ColumnQuantileMetric

num_features = X_test.columns
column_mapping = ColumnMapping(
    prediction='prediction',
    numerical_features=num_features,
    target='Close'
)

report = Report(metrics = [
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
    ColumnQuantileMetric('SMA', quantile = 0.5)
])

reference_data = pd.read_parquet('data/X_val.parquet')
reference_data['Close'] = pd.read_parquet('data/y_val.parquet')
current_data = pd.read_parquet('data/X_test.parquet')


report.run(reference_data = reference_data, current_data = current_data,
column_mapping=column_mapping)

result = report.as_dict()

In [148]:
client.get_registered_model(BEST_MODEL_NAME)

<RegisteredModel: aliases={}, creation_timestamp=1690377963153, description='', last_updated_timestamp=1690384266597, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1690384266575, current_stage='None', description='', last_updated_timestamp=1690384266575, name='rf-best-model-eth-prediction', run_id='734108e025f74bc1a4ba50a73ff992c9', run_link='', source='mlflow-artifacts:/392589530136770371/734108e025f74bc1a4ba50a73ff992c9/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='7'>,
 <ModelVersion: aliases=[], creation_timestamp=1690379447562, current_stage='Production', description='', last_updated_timestamp=1690384266597, name='rf-best-model-eth-prediction', run_id='734108e025f74bc1a4ba50a73ff992c9', run_link='', source='mlflow-artifacts:/392589530136770371/734108e025f74bc1a4ba50a73ff992c9/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='4'>], name='rf-best-model-eth-prediction', tags={}>

In [150]:
#from pprint import pprint

client = MlflowClient()
for rm in client.search_registered_models():
    pprint(dict(rm), indent=4)
    
BEST_MODEL_NAME = "rf-best-model-eth-prediction"
STAGE = 'PRODUCTION'

model = mlflow.pyfunc.load_model(model_uri=f"models:/{BEST_MODEL_NAME}/{STAGE}")

{   'aliases': {},
    'creation_timestamp': 1690377892929,
    'description': '',
    'last_updated_timestamp': 1690377892937,
    'latest_versions': [   <ModelVersion: aliases=[], creation_timestamp=1690377892937, current_stage='None', description='', last_updated_timestamp=1690377892937, name='BestModel', run_id='0fb5e7d9d69346acb476cabb8f95343b', run_link='', source='runs:/0fb5e7d9d69346acb476cabb8f95343b/model', status='READY', status_message='', tags={}, user_id='', version='1'>],
    'name': 'BestModel',
    'tags': {}}
{   'aliases': {},
    'creation_timestamp': 1690377963153,
    'description': '',
    'last_updated_timestamp': 1690384497484,
    'latest_versions': [   <ModelVersion: aliases=[], creation_timestamp=1690384497450, current_stage='None', description='', last_updated_timestamp=1690384497450, name='rf-best-model-eth-prediction', run_id='734108e025f74bc1a4ba50a73ff992c9', run_link='', source='mlflow-artifacts:/392589530136770371/734108e025f74bc1a4ba50a73ff992c9/arti

MlflowException: The following failures occurred while downloading one or more artifacts from http://127.0.0.1:5000/api/2.0/mlflow-artifacts/artifacts/392589530136770371/734108e025f74bc1a4ba50a73ff992c9/artifacts/model: {'': 'MlflowException("API request to http://127.0.0.1:5000/api/2.0/mlflow-artifacts/artifacts/392589530136770371/734108e025f74bc1a4ba50a73ff992c9/artifacts/model/ failed with exception HTTPConnectionPool(host=\'127.0.0.1\', port=5000): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/392589530136770371/734108e025f74bc1a4ba50a73ff992c9/artifacts/model/ (Caused by ResponseError(\'too many 500 error responses\'))")'}

In [None]:
source='mlflow-artifacts:/392589530136770371/734108e025f74bc1a4ba50a73ff992c9/artifacts/model'

In [80]:
# train_df = train_df.dropna(axis = 1)
# X_train = train_df.drop('Close', axis=1)
# y_train = train_df['Close']

# X_val = val_df.drop('Close', axis=1)
# y_val = val_df['Close']

# # Define the objective function for Hyperopt
# def objective(space):
#     with mlflow.start_run():
#         clf = xgb.XGBRegressor(n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
#                                reg_alpha = int(space['reg_alpha']),min_child_weight=space['min_child_weight'],
#                                colsample_bytree=space['colsample_bytree'])
#         evaluation = [(X_train, y_train), (X_val, y_val)]
        
#         clf.fit(X_train, y_train, eval_set=evaluation, eval_metric="rmse", early_stopping_rounds=10,verbose=False)
        
#         pred = clf.predict(X_val)
#         mse = mean_squared_error(y_val, pred)
#         rmse = np.sqrt(mse)

#         # Log the rmse in MLflow
#         log_param("rmse", rmse)

#         return{'loss':rmse, 'status': STATUS_OK }

# # Define the search space for Hyperopt
# space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
#        'gamma': hp.uniform ('gamma', 1,9),
#        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
#        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
#        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
#        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
#        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
#        'learning_rate': hp.loguniform('learning_rate', -5, 0),
#        }

# # Run the optimizer
# trials = Trials()
# best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# print(best)

  0%|                                                                       | 0/100 [00:00<?, ?trial/s, best loss=?]



job exception: feature_names mismatch: ['EMA', 'MACD'] ['SMA', 'EMA', 'MACD', 'RSI', 'UpperBB', 'LowerBB']
training data did not have the following fields: SMA, UpperBB, RSI, LowerBB



  0%|                                                                       | 0/100 [00:00<?, ?trial/s, best loss=?]


ValueError: feature_names mismatch: ['EMA', 'MACD'] ['SMA', 'EMA', 'MACD', 'RSI', 'UpperBB', 'LowerBB']
training data did not have the following fields: SMA, UpperBB, RSI, LowerBB