In [3]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import sklearn as sk
import os
import mlflow
import xgboost as xgb



In [5]:
db_url = os.environ["DATABASE_URL"].rstrip("/")
engine = create_engine(f"{db_url}/credits_total_kz")
mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
mlflow.set_experiment(os.getenv('MLFLOW_EXPERIMENT', 'credits_total_kz'))

KeyboardInterrupt: 

In [None]:
def load_df():
    df = pd.read_sql('select * from macro_kz', engine)
    df = df.sort_values('month')

    df['lag_1'] = df['target_x'].shift(1)
    df['lag_2'] = df['target_x'].shift(2)
    df['lag_3'] = df['target_x'].shift(3)
    df['rolling_mean_3'] = df['target_x'].shift(1).rolling(3).mean()
    df['diff_1'] = df['target_x'].diff(1)
    df['pct_change_1'] = df['target_x'].pct_change(1)
    df['month_sin'] = np.sin(2*np.pi*df['month'].dt.month/12)
    df['month_cos'] = np.cos(2*np.pi*df['month'].dt.month/12)
    df['growth_1'] = df['target_x'].pct_change(1)
    df['growth_3'] = df['target_x'].pct_change(3)

    df['target_y'] = df['target_x'].pct_change(1).shift(-1)

    df = df.dropna()

    return df

In [None]:
def split_df(df):
    df.sort_values('month')
    x = df.drop(columns=['target_y'])
    y = df['target_y']

    x['year'] = x['month'].dt.year
    x['month_num'] = x['month'].dt.month
    x = x.drop(columns=['month'])
    
    val_months = 10
    x_tr, x_val = x.iloc[:-val_months], x.iloc[-val_months:]
    y_tr, y_val = y[:-val_months], y[-val_months:]

    return x_tr, x_val, y_tr, y_val

In [None]:
def get_metrics(y_true, y_pred):
    rmse = np.sqrt(sk.metrics.mean_squared_error(y_true, y_pred))
    r2 = sk.metrics.r2_score(y_true, y_pred)
    return rmse, r2

In [None]:
def train(model_name, estimator, x_tr, x_val, y_tr, y_val):
    pipe = sk.pipeline.Pipeline([
        ('scaler', sk.preprocessing.StandardScaler(with_mean=True)),
        ('model', estimator)
    ])

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param('model_name', model_name)

        pipe.fit(x_tr, y_tr)
        pred = pipe.predict(x_val)

        rmse, r2 = get_metrics(y_val, pred)
        mlflow.log_metrics({'rmse': rmse, 'r2': r2})
        mlflow.sklearn.log_model(pipe, 'model')
        return rmse, r2
   

In [None]:

def train(model_name, estimator, param_grid, x_tr, x_val, y_tr, y_val):
    pipe = sk.pipeline.Pipeline([
        ('scaler', sk.preprocessing.StandardScaler(with_mean=True)),
        ('model', estimator)
    ])

    grid = sk.model_selection.GridSearchCV(estimator=pipe, param_grid=param_grid, scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1)

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param('model_name', model_name)
        mlflow.log_params({f'grid_{k}': v for k, v in param_grid.items()})

        grid.fit(x_tr, y_tr)
        best_pipe = grid.best_estimator_
        best_params = grid.best_params_

        mlflow.log_params({f'best_{k}': v for k, v in best_params.items()})

        pred = best_pipe.predict(x_val)
        rmse, r2 = get_metrics(y_val, pred)

        mlflow.log_metrics({'rmse': rmse, 'r2': r2})
        mlflow.sklearn.log_model(best_pipe, 'model')
        return rmse, r2


In [None]:
models = [
    ('Ridge', sk.linear_model.Ridge(alpha=1.0, random_state=42)),
    ('ElasticNet', sk.linear_model.ElasticNet(alpha=0.01, l1_ratio=0.2, random_state=42)),
    ('RandomForest', sk.ensemble.RandomForestRegressor(n_estimators=300,max_depth=3, min_samples_leaf=5, random_state=42, n_jobs=-1)),
    ('GradientBoosting', sk.ensemble.GradientBoostingRegressor(n_estimators=200, max_depth=2, learning_rate=0.05, random_state=42)),
    ('HistGradientBoosting', sk.ensemble.HistGradientBoostingRegressor(random_state=42)),
    ('XGBoost', xgb.XGBRegressor(random_state=42)),
]

param_grids = {
    'Ridge': {
        'model__alpha': [0.01, 0.1, 1.0, 10.0, 30.0]
    },

    'ElasticNet': {
        'model__alpha': [0.001, 0.01, 0.1, 1.0],
        'model__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
    },

    'RandomForest': {
        'model__n_estimators': [300, 700],
        'model__max_depth': [None, 6, 12],
        'model__min_samples_leaf': [1, 3, 5]
    },

    'GradientBoosting': {
        'model__n_estimators': [200, 500],
        'model__learning_rate': [0.03, 0.1],
        'model__max_depth': [2, 3, 4]
    },

    'HistGradientBoosting': {
        'model__max_iter': [300, 800],
        'model__learning_rate': [0.03, 0.1],
        'model__max_depth': [None, 6, 10]
    },

    'XGBoost': {
        'model__n_estimators': [300, 700],
        'model__learning_rate': [0.03, 0.1],
        'model__max_depth': [4, 6, 8],
        'model__subsample': [0.8, 1.0]
    }
}


results = {}

x_tr, x_val, y_tr, y_val = split_df(load_df())

mask_tr = ~np.isnan(np.asarray(y_tr, dtype=float))
x_tr = x_tr[mask_tr]
y_tr = np.asarray(y_tr, dtype=float)[mask_tr]
mask_val = ~np.isnan(np.asarray(y_val, dtype=float))
x_val = x_val[mask_val]
y_val = np.asarray(y_val, dtype=float)[mask_val]

for name, est in models:
    results[name] = train(name, est, param_grids[name], x_tr, x_val, y_tr, y_val)

results

  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run Ridge at: http://localhost:5000/#/experiments/2/runs/acb2b8af55024b3a8134a9488d4f6cc5
üß™ View experiment at: http://localhost:5000/#/experiments/2


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run ElasticNet at: http://localhost:5000/#/experiments/2/runs/6d8db4d242fe4b2fb5b26b430bddd8e0
üß™ View experiment at: http://localhost:5000/#/experiments/2


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run RandomForest at: http://localhost:5000/#/experiments/2/runs/f39f2196794849b0b2914222f7222a05
üß™ View experiment at: http://localhost:5000/#/experiments/2


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run GradientBoosting at: http://localhost:5000/#/experiments/2/runs/1d2f14bed3be491eb594ac01750379f9
üß™ View experiment at: http://localhost:5000/#/experiments/2


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run HistGradientBoosting at: http://localhost:5000/#/experiments/2/runs/83db2847658f4fb28107c12a7a573d67
üß™ View experiment at: http://localhost:5000/#/experiments/2


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run XGBoost at: http://localhost:5000/#/experiments/2/runs/fb6c33f5e073444db65a53980947d7f0
üß™ View experiment at: http://localhost:5000/#/experiments/2
{'Ridge': (np.float64(0.12308902728520693), 0.467454123425185), 'ElasticNet': (np.float64(0.15462142541059032), 0.15965486108080162), 'RandomForest': (np.float64(0.09556415866434831), 0.6789974899390521), 'GradientBoosting': (np.float64(0.08243633375153292), 0.7611332587481536), 'HistGradientBoosting': (np.float64(0.1708193684074594), -0.025634380578977156), 'XGBoost': (np.float64(0.07330340759204164), 0.8111283934034209)}
(np.float64(0.3045570482045022), -2.2602856782083185)
