In [1]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

import time 
import numpy as np
from sklearn.metrics import r2_score
import pandas as pd

from incremental_learning.benchmark.generate_dataset_pipeline import generate_dataset
from incremental_learning.job import train, evaluate

2022-11-30 14:16:37,188 [I] incremental_learning >> Found data_frame_analyzer binary at /Users/valeriy/Documents/workspace/valeriy42/ml-cpp/cmake-build-relwithdebinfo/bin/data_frame_analyzer/data_frame_analyzer


In [2]:
# from datasets import load_dataset
# dataset = load_dataset("inria-soda/tabular-benchmark", data_files="reg_cat/house_sales.csv")



In [3]:
CONFIG_DEFAULT = {
    "train_prop": 0.70,
    "val_test_prop": 0.3,
    "max_val_samples": 50000,
    "max_test_samples": 50000,
}
benchmarks = [
    {
       
        "task": "regression",
        "dataset_size": "medium",
        "categorical": False,
        "datasets": "house_sales",
    }
]


In [4]:
def update_config(data_transform_config: dict, benchmark: dict) -> dict:
     # Use the appropriate model config
    model_config = { "path_to_dir": "data",}
    dataset_size=benchmark["dataset_size"],
    categorical=benchmark["categorical"],
    regression=benchmark["task"] == "regression"

    if dataset_size == "medium":
        data_transform_config["max_train_samples"] = 10000
    elif dataset_size == "large":
        data_transform_config["max_train_samples"] = 50000


    if categorical:
        data_transform_config["data__categorical"] =  True
    else:
        data_transform_config["data__categorical"] = False

    if regression:
        data_transform_config["regression"] = True
        data_transform_config["data__regression"] = True
    else:
        data_transform_config["regression"] = False
        data_transform_config["data__regression"] = False

    data_transform_config["data__keyword"] = benchmark['datasets']

    config = {
        "program": "run_experiment.py",
        "metric": {
            "name": "mean_test_score",
            "goal": "minimize"  # RMSE
        } if regression else {
            "name": "mean_test_score",
            "goal": "maximize"  # accuracy
        },
        "parameters": dict(model_config, **data_transform_config)
    }
    return config


In [5]:
data_transform_config = {
    "data__method_name": "real_data",
    "n_iter": 1
    # "n_iter":  "auto"
}

for benchmark in benchmarks:
    print(benchmark)
    print(
        update_config(
            data_transform_config=data_transform_config,
            benchmark=benchmark,
        )
    )



{'task': 'regression', 'dataset_size': 'medium', 'categorical': False, 'datasets': 'house_sales'}
{'program': 'run_experiment.py', 'metric': {'name': 'mean_test_score', 'goal': 'minimize'}, 'parameters': {'path_to_dir': 'data', 'data__method_name': 'real_data', 'n_iter': 1, 'data__categorical': True, 'regression': True, 'data__regression': True, 'data__keyword': 'house_sales'}}


In [6]:
config = update_config(
    data_transform_config=data_transform_config, benchmark=benchmarks[0]
)
config["parameters"] = {**config["parameters"], **CONFIG_DEFAULT}
print(config)
train_scores = []
val_scores = []
test_scores = []
r2_train_scores = []
r2_val_scores = []
r2_test_scores = []
times = []
if config["parameters"]["n_iter"] == "auto":
    (
        x_train,
        x_val,
        x_test,
        y_train,
        y_val,
        y_test,
        categorical_indicator,
    ) = generate_dataset(config["parameters"], np.random.RandomState(0))
    if x_test.shape[0] > 6000:
        n_iter = 1
    elif x_test.shape[0] > 3000:
        n_iter = 2
    elif x_test.shape[0] > 1000:
        n_iter = 3
    else:
        n_iter = 5
else:
    n_iter = config['parameters']["n_iter"]
for i in range(n_iter):
    # if config["log_training"]: #FIXME
    #    config["model__wandb_run"] = run
    rng = np.random.RandomState(i)
    print(rng.randn(1))
    # TODO: separate numeric and categorical features
    t = time.time()
    (
        x_train,
        x_val,
        x_test,
        y_train,
        y_val,
        y_test,
        categorical_indicator,
    ) = generate_dataset(config['parameters'], rng)
    data_generation_time = time.time() - t
    print("Data generation time:", data_generation_time)
    # print(y_train)
    print(x_train.shape)
    x_train_df = pd.DataFrame(x_train, columns=['f' + str(col_idx) for col_idx in range(x_train.shape[1])])
    categorical_fields = [field for field, indicator in zip(x_train_df.columns, categorical_indicator) if indicator]
    x_train_df['target'] = y_train
    train_config = {
        'job_id': benchmarks[0]['datasets'],
        'rows': (x_train.shape[0] + x_val.shape[0]),
        'cols': x_train.shape[1]+1,
        'memory_limit': 50000000,
        'threads': 8,
        'results_field': 'ml',
        'categorical_fields': categorical_fields,
        'analysis': {
            'name': benchmark['task'],
            'parameters': {
                'randomize_seed': rng.randint(100000000),
                'dependent_variable': 'target'
            }
        }
    }
    job = train(train_config['job_id'], x_train_df, config=train_config)
    job.wait_to_complete()

    predictions = job.get_predictions()
    print("R2 score train: ", r2_score(predictions, y_train))

    evaluate_job = evaluate(dataset_name=train_config['job_id'], dataset=x_train_df, original_job=job, config=train_config)
    evaluate_job.wait_to_complete()
    print("R2 score evaluate: ", r2_score(evaluate_job.get_predictions(), y_train))



0,1
stderr,output
"/Users/valeriy/Documents/workspace/valeriy42/ml-cpp/cmake-build-relwithdebinfo/bin/data_frame_analyzer/data_frame_analyzer --input /var/folders/_j/gcj6z4b950bdzpw7_fzrmpf40000gn/T/tmpvk926tdq --config /var/folders/_j/gcj6z4b950bdzpw7_fzrmpf40000gn/T/tmpt3b3ghlk --output /var/folders/_j/gcj6z4b950bdzpw7_fzrmpf40000gn/T/tmp4ibl_ngk --validElasticLicenseKeyConfirmed true --restore /var/folders/_j/gcj6z4b950bdzpw7_fzrmpf40000gn/T/tmpzz0tns09; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi The default interactive shell is now zsh. To update your account to use zsh, please run `chsh -s /bin/zsh`. For more details, please visit https://support.apple.com/kb/HT208050. Elastic-MBP:jupyter valeriy$ /Users/valeriy/Documents/workspace/valeriy42/ml-cpp/cmake-build-relwithdebinfo/bin/data_frame_analyzer/data_frame_analyzer --input /var/folders/_j/gcj6z4b950bdzpw7_fzrmpf40000gn/T/tmpvk926tdq --config /var/folders/_j/gcj6z4b950bdzpw7_fzrmpf40000gn/T/tmpt3b3ghlk --output /var/folders/_j/gcj6z4b950bdzpw7_fzrmpf40000gn/T/tmp4ibl_ngk --validElasticLicenseKeyConfirmed true --restore /var/folders/_j/gcj6z4b950bdzpw7_fzrmpf40000gn/T/tmpzz0tns09; if [ $? -eq 0 ]; then echo ""Success""; else echo ""Failure""; fi 2022-11-30 13:17:19,279103 UTC [49566] DEBUG /Users/valeriy/Documents/workspace/valeriy42/ml-cpp/bin/data_frame_analyzer/Main.cc@153 data_frame_analyzer (64 bit): Version based on 8.7.0-SNAPSHOT (Build DEVELOPMENT BUILD by valeriy) Copyright (c) 2022 Elasticsearch BV 2022-11-30 13:17:19,283416 UTC [49566] DEBUG /Users/valeriy/Documents/workspace/valeriy42/ml-cpp/lib/seccomp/CSystemCallFilter_MacOSX.cc@107 macOS sandbox initialized 2022-11-30 13:17:19,420402 UTC [49566] DEBUG /Users/valeriy/Documents/workspace/valeriy42/ml-cpp/lib/api/CDataFrameAnalyzer.cc@116 Received 16641 rows 2022-11-30 13:17:19,665159 UTC [49566] INFO /Users/valeriy/Documents/workspace/valeriy42/ml-cpp/bin/data_frame_analyzer/Main.cc@261 [{""name"":""E_DFTPMEstimatedPeakMemoryUsage"",""description"":""The upfront estimate of the peak memory training the predictive model would use"",""value"":1382586} ,{""name"":""E_DFTPMPeakMemoryUsage"",""description"":""The peak memory training the predictive model used"",""value"":3622930} ,{""name"":""E_DFTPMTimeToTrain"",""description"":""The time it took to train the predictive model"",""value"":92} ] 2022-11-30 13:17:19,665192 UTC [49566] DEBUG /Users/valeriy/Documents/workspace/valeriy42/ml-cpp/bin/data_frame_analyzer/Main.cc@266 ML data frame analyzer exiting Success Elastic-MBP:jupyter valeriy$",",{""row_results"":{""checksum"":0,""results"":{""ml"":{""target_prediction"":14.0570077896 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""target_prediction"":12.7662162780 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""target_prediction"":13.5055694580 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""target_prediction"":12.7506341934 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""target_prediction"":13.2866392135 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""target_prediction"":13.3751373291 ,{""row_results"":{""checksum"":0,""results"":{""ml"":{""target_prediction"":13.6291961669 ,{""model_metadata"":{""total_feature_importance"":[],""hyperparameters"":[],""train_pr ,{""compressed_data_summarization"":{""doc_num"":0,""data_summarization"":""H4sIAAAAAAA ]"


Job succeeded
R2 score evaluate:  0.9128884662658696
