In [58]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import sklearn as sk
import os
os.environ["MLFLOW_ENABLE_LOGGED_MODELS"] = "false"
import mlflow
print(mlflow.get_tracking_uri())



http://localhost:5000


In [59]:
os.environ["DATABASE_URL"] = "postgresql+psycopg2://postgres:qwerty123@localhost:5432"
os.environ["MLFLOW_TRACKING_URI"] = "http://localhost:5000"
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "http://localhost:5000"))
mlflow.set_experiment(os.getenv("MLFLOW_EXPERIMENT", "credits_total_kz"))

<Experiment: artifact_location='/mlflow/artifacts/1', creation_time=1770560189708, experiment_id='1', last_update_time=1770560189708, lifecycle_stage='active', name='credits_total_kz_rf', tags={}>

In [60]:
def load_df():
    engine = create_engine(os.getenv("DATABASE_URL")  + "/credits_total_kz")
    df = pd.read_sql("select * from macro_kz", engine)
    df = df.sort_values("month")

    df["lag_1"] = df["target_x"].shift(1)
    df["lag_2"] = df["target_x"].shift(2)
    df["lag_3"] = df["target_x"].shift(3)
    df["rolling_mean_3"] = df["target_x"].shift(1).rolling(3).mean()
    df["diff_1"] = df["target_x"].diff(1)
    df["pct_change_1"] = df["target_x"].pct_change(1)
    df["month_sin"] = np.sin(2*np.pi*df["month"].dt.month/12)
    df["month_cos"] = np.cos(2*np.pi*df["month"].dt.month/12)
    df["growth_1"] = df["target_x"].pct_change(1)
    df["growth_3"] = df["target_x"].pct_change(3)

    df["target_y"] = df["target_x"].pct_change(1).shift(-1)

    df = df.dropna()

    return df

In [None]:
from sklearn.model_selection import TimeSeriesSplit

def split_df(df, val_months=10, n_splits=4, mode="holdout"):
    df = df.sort_values("month").reset_index(drop=True)

    X = df.drop(columns=["target_y"]).copy()
    y = df["target_y"].astype(float).to_numpy()

    # time features
    X["year"] = X["month"].dt.year
    X["month_num"] = X["month"].dt.month
    X = X.drop(columns=["month"])

    if mode == "holdout":
        X_tr, X_val = X.iloc[:-val_months], X.iloc[-val_months:]
        y_tr, y_val = y[:-val_months], y[-val_months:]
        return X_tr, X_val, y_tr, y_val

    if mode == "tscv":
        tscv = TimeSeriesSplit(n_splits=n_splits)
        folds = []
        for tr_idx, val_idx in tscv.split(X):
            folds.append((X.iloc[tr_idx], X.iloc[val_idx], y[tr_idx], y[val_idx]))
        return folds

    raise ValueError("mode must be 'holdout' or 'tscv'")


In [62]:
def get_metrics(y_true, y_pred):
    rmse = np.sqrt(sk.metrics.mean_squared_error(y_true, y_pred))
    mape = sk.metrics.mean_absolute_percentage_error(y_true, y_pred) * 100
    r2 = sk.metrics.r2_score(y_true, y_pred)
    return rmse, mape, r2

In [None]:
import os
import mlflow

def train(model_name, estimator, x_tr, x_val, y_tr, y_val):
    pipe = sk.pipeline.Pipeline([
        ("scaler", sk.preprocessing.StandardScaler(with_mean=True)),
        ("model", estimator)
    ])

    with mlflow.start_run(run_name=f"{model_name}_holdout"):
        mlflow.log_param("model_name", model_name)

        pipe.fit(x_tr, y_tr)
        pred = pipe.predict(x_val)

        rmse, mape, r2 = get_metrics(y_val, pred)
        mlflow.log_metrics({"rmse": rmse, "mape": mape, "r2": r2})
        mlflow.sklearn.log_model(pipe, "model")
        return rmse, mape, r2
   

In [64]:
df = load_df()
print(df["target_x"].autocorr(lag=1))
print(df["target_x"].autocorr(lag=3))


0.7018741236721782
0.6034530005996804


In [None]:
models = [
    ("Ridge", sk.linear_model.Ridge(alpha=1.0, random_state=42)),
    ("ElasticNet", sk.linear_model.ElasticNet(alpha=0.01, l1_ratio=0.2, random_state=42)),
    ("RandomForest", sk.ensemble.RandomForestRegressor(n_estimators=300,max_depth=3, min_samples_leaf=5, random_state=42, n_jobs=-1)),
    ("GradientBoosting", sk.ensemble.GradientBoostingRegressor(n_estimators=200, max_depth=2, learning_rate=0.05, random_state=42)),
    ("HistGradientBoosting", sk.ensemble.HistGradientBoostingRegressor(random_state=42)),
    ("XGBoost", sk.ensemble.GradientBoostingRegressor(random_state=42)),
    ("LightGBM", sk.ensemble.GradientBoostingRegressor(random_state=42)),
]
results = {}

x_tr, x_val, y_tr, y_val = split_df(load_df(), mode="holdout")

mask_tr = ~np.isnan(np.asarray(y_tr, dtype=float))
x_tr = x_tr[mask_tr]
y_tr = np.asarray(y_tr, dtype=float)[mask_tr]
mask_val = ~np.isnan(np.asarray(y_val, dtype=float))
x_val = x_val[mask_val]
y_val = np.asarray(y_val, dtype=float)[mask_val]

for name, est in models:
    results[name] = train(name, est, x_tr, x_val, y_tr, y_val)

print(results)
y_pred_naive = x_val["growth_1"].values

print(get_metrics(y_val, y_pred_naive))

  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run Ridge at: http://localhost:5000/#/experiments/1/runs/aef5f7a4c8f649f1960aa798b86f5dd2
üß™ View experiment at: http://localhost:5000/#/experiments/1


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run ElasticNet at: http://localhost:5000/#/experiments/1/runs/c71a7c8c4ab64b178a9e7b7b13496439
üß™ View experiment at: http://localhost:5000/#/experiments/1


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run RandomForest at: http://localhost:5000/#/experiments/1/runs/bc12a656b0f04f87b1a6f4f369877ce4
üß™ View experiment at: http://localhost:5000/#/experiments/1


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run GradientBoosting at: http://localhost:5000/#/experiments/1/runs/3158498699bb40388e7db42e95c4882e
üß™ View experiment at: http://localhost:5000/#/experiments/1


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run HistGradientBoosting at: http://localhost:5000/#/experiments/1/runs/dae5de9bfadb4ab2bdf721761e678637
üß™ View experiment at: http://localhost:5000/#/experiments/1


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run XGBoost at: http://localhost:5000/#/experiments/1/runs/2ccd6151043b4c49b210cdfd35ebff24
üß™ View experiment at: http://localhost:5000/#/experiments/1


  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÉ View run LightGBM at: http://localhost:5000/#/experiments/1/runs/3615447bdd264996a4bcd6853c38d2ce
üß™ View experiment at: http://localhost:5000/#/experiments/1
{'Ridge': (np.float64(0.10898807700698689), 298.6406246397259, 0.5824809200201629), 'ElasticNet': (np.float64(0.09729108872401634), 158.94428473280632, 0.6672910576858395), 'RandomForest': (np.float64(0.13198767716716972), 185.55254306656803, 0.3876705903999831), 'GradientBoosting': (np.float64(0.10039177103792368), 229.8504206957078, 0.6457461501679383), 'HistGradientBoosting': (np.float64(0.1708193684074594), 123.16305374163396, -0.025634380578977156), 'XGBoost': (np.float64(0.08290726205258034), 198.25939821180708, 0.7583963487511186), 'LightGBM': (np.float64(0.08290726205258034), 198.25939821180708, 0.7583963487511186)}
(np.float64(0.3045570482045022), 425.4327755546961, -2.2602856782083185)
