In [1]:
from os.path import join
import subprocess

from itertools import product
from tqdm import tqdm
import yaml
from addict import Dict
import psutil
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from scipy.stats import norm
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np

In [2]:
def rmse_fn(true_y, pred_y):
    """Return the root-mean-square error between targets and predictions.

    Both inputs are converted with ``np.asarray`` and flattened, so lists,
    tuples and arrays of differing but compatible layout (e.g. ``(n, 1)``
    against ``(n,)``) are all accepted and compared element by element.

    Args:
        true_y: Array-like of observed target values.
        pred_y: Array-like of predicted values with the same number of elements.

    Returns:
        The RMSE as a NumPy float scalar.
    """
    # Flatten first so an (n, 1) column never broadcasts against an (n,) vector
    # into an (n, n) matrix of pairwise differences.
    residual = np.asarray(true_y).ravel() - np.asarray(pred_y).ravel()
    return np.sqrt(np.mean(residual ** 2))

def msll_fn(true_y, pred_y, pred_std_y):
    """Return the mean negative Gaussian log-density of the targets.

    Each target is scored under a normal distribution centred on its
    prediction with the given predictive standard deviation; the negated
    average of those log-densities is returned (lower is better).

    NOTE(review): despite the name, no baseline/standardisation term is
    subtracted here -- this is the plain mean negative log predictive density.

    Args:
        true_y: Array-like of observed target values.
        pred_y: Array-like of predictive means.
        pred_std_y: Array-like of predictive standard deviations.

    Returns:
        The mean negative log-density as a NumPy float scalar.
    """
    # Coerce and flatten so lists and mixed (n, 1)/(n,) layouts line up 1:1.
    observed = np.asarray(true_y).ravel()
    mean = np.asarray(pred_y).ravel()
    std = np.asarray(pred_std_y).ravel()
    return -norm.logpdf(observed, loc=mean, scale=std).mean()

In [3]:
# Number of boosting stages passed to GradientBoostingRegressor(n_estimators=...).
N_ESTIMATORS = 1000

# Resolve paths relative to the git checkout so the notebook does not depend on
# the directory it is launched from.
REPO_ROOT = subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode().strip()
CONFIG_ROOT = join(REPO_ROOT, "config")

# Read the data configuration inside a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
# NOTE(review): yaml.FullLoader is kept to preserve how values are parsed;
# consider yaml.safe_load if data.yaml only uses plain YAML types.
with open(join(CONFIG_ROOT, "data.yaml"), "r") as config_file:
    DATA_CONFIG = Dict(yaml.load(config_file, Loader=yaml.FullLoader))

# Experiment directory: <artifacts_path>/<sorted features>_<start_date>_<end_date>
EXP_PATH = join(
    DATA_CONFIG.artifacts_path,
    "_".join(sorted(DATA_CONFIG.features) + [DATA_CONFIG.start_date, DATA_CONFIG.end_date]),
)

In [4]:
# Empty results table: one "rmse" column, keyed by a (seed, fold) MultiIndex.
# Rows are added later, one per evaluated (seed, fold) pair.
result_df = (
    pd.DataFrame(columns=["rmse", "seed", "fold"])
    .set_index(["seed", "fold"])
)

def load_data(seed, fold):
    """Load one (seed, fold) train/test split from ``EXP_PATH`` as NumPy arrays.

    Reads ``train_x.csv``, ``train_y.csv``, ``test_x.csv`` and ``test_y.csv``
    from ``EXP_PATH/seed_<seed>/fold_<fold>/``. Rows whose target contains a
    missing value are dropped, and the feature rows with the same index labels
    are kept so features and targets stay aligned.

    Args:
        seed: Seed identifier used in the ``seed_<seed>`` directory name.
        fold: Fold identifier used in the ``fold_<fold>`` directory name.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)``. Feature arrays are
        always 2-D ``(n_samples, n_features)``; a target read from a
        single-column file is returned 1-D ``(n_samples,)``, otherwise it is
        left 2-D.
    """
    split_dir = join(EXP_PATH, f"seed_{seed}", f"fold_{fold}")

    def read_split(name):
        return pd.read_csv(join(split_dir, f"{name}.csv"))

    train_x, train_y, test_x, test_y = map(read_split, ["train_x", "train_y", "test_x", "test_y"])

    if "time" in train_x:
        # Encode timestamps as a float feature. "int64" is spelled out because a
        # bare `int` maps to a 32-bit integer on some platforms.
        # NOTE(review): presumably nanoseconds since the epoch, with 1e18 chosen
        # to bring the values to order one -- confirm the datetime resolution.
        for features in (train_x, test_x):
            features["time"] = pd.to_datetime(features["time"]).astype("int64") / 1e18

    # Drop samples without a target, then select the matching feature rows.
    train_y = train_y.dropna()
    test_y = test_y.dropna()
    train_x = train_x.loc[train_y.index]
    test_x = test_x.loc[test_y.index]

    def target_values(frame):
        # Flatten only a single-column target. A blanket squeeze() would also
        # collapse the sample axis when just one row is left.
        values = frame.values
        return values[:, 0] if values.shape[1] == 1 else values

    # Features are deliberately NOT squeezed: with a single feature column that
    # would produce a 1-D array, which scikit-learn estimators reject as X.
    return train_x.values, target_values(train_y), test_x.values, target_values(test_y)
    
seeds = list(range(DATA_CONFIG.n_seeds))
folds = list(range(DATA_CONFIG.n_folds))

# itertools.product has no len(), so tqdm cannot infer how many iterations to
# expect; pass the total explicitly to get a percentage and ETA rather than a
# bare iteration counter.
pbar = tqdm(product(seeds, folds), total=len(seeds) * len(folds))
for seed, fold in pbar:
    # Show the current seed and fold in the progress bar, updated in place.
    pbar.set_description(f"seed: {seed}, fold: {fold}")

    # Seed the regressor with the split's seed so each run is reproducible.
    # `model` stays a top-level name: the final cell reads its class name to
    # build the metrics file name.
    model = GradientBoostingRegressor(n_estimators=N_ESTIMATORS, random_state=seed)
    train_x, train_y, test_x, test_y = load_data(seed, fold)

    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)

    # One row per (seed, fold) pair in the results table.
    result_df.loc[(seed, fold), "rmse"] = rmse_fn(test_y, pred_y)

seed: 2, fold: 3: : 12it [01:23,  6.99s/it]


In [5]:
# Write the per-(seed, fold) metrics into the experiment directory, in a file
# named after the class of the last fitted model.
result_df.to_csv(
    join(EXP_PATH, f"metrics_{type(model).__name__}.csv")
)