In [1]:
import sys
sys.path.append("../")
import numpy as np
import time
import pandas as pd
import pickle
import math
from typing import Tuple


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.sparse import hstack

ANALYSIS_POSTFIX = "mined_no_drift_2024-09-09"

experiment_config = {
    "RS" : 42,
    "ANALYSIS_POSTFIX": ANALYSIS_POSTFIX,
    "FEATURE_MODE" : "CODE", # CODE_MODEL
}

t_models = ["lr", "svm", "lgbm", "catboost"]

In [2]:
def step_two(experiment_config, 
             X_train,
             y_train,
             model,
             X_val=None,
             y_val=None,
             save=False): 
    
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    
    training_start_time = time.time()
    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor(max_depth=10, silent=True)
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)
    training_end_time = time.time()
    time_training = training_end_time - training_start_time

    
    if save:
        with open(f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl','wb') as f:
            pickle.dump(reg, f)
        return f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
    
    else:
        inference_start_time = time.time()
        y_pred = reg.predict(X_val)
        inference_end_time = time.time()
        time_inference = inference_end_time - inference_start_time

        y_pred[y_pred<0] = 0
        mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
        rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
        return {"pred": y_pred, "mae": mae, "rmse": rmse, "time_training" : time_training, "time_inference" : time_inference}
    

def cv_step_2(experiment_config:dict, cv_df:pd.DataFrame) -> Tuple:

    t_models = ["lr", "svm", "lgbm", "catboost"]

    results = {}

    FEATURE_MODE = experiment_config["FEATURE_MODE"]

    for test_fold in range(cv_df.fold.max()+1):
        print(test_fold)

        # Prepare the input data
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])

        if FEATURE_MODE=="CODE_MODEL":
            X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold!=test_fold, "model_set"], sparse=True).sparse.to_coo().tocsr()
            X_train = hstack([X_train_column_sparse, X_train_tfidf])
        elif FEATURE_MODE=="CODE":
            X_train = X_train_tfidf
            
        y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]
        
        X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
        if FEATURE_MODE=="CODE_MODEL":
            X_val_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold==test_fold, "model_set"], sparse=True).sparse.to_coo().tocsr()
            X_val = hstack([X_val_column_sparse, X_val_tfidf])
        elif FEATURE_MODE=="CODE":
            X_val = X_val_tfidf
            
        y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

        results[test_fold] = {}
        for model in t_models:
            print(model)
            preds_df = step_two(experiment_config=experiment_config,
                                X_train=X_train,
                                y_train=y_train,
                                X_val=X_val,
                                y_val=y_val,
                                model=model)
            cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
            results[test_fold][model] = preds_df

    cv_df = cv_df.reset_index(drop=True)

    return cv_df

def full_step_2(cv_df:pd.DataFrame,
                experiment_config:dict,
                t_models:list = ["lr", "svm", "lgbm", "catboost"]) -> None:
    
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    # TRAIN ON ALL PREDICTIONS AT ONCE

    FEATURE_MODE = experiment_config["FEATURE_MODE"]

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.model_set!="ensemble", "input_sequence"])
    if FEATURE_MODE=="CODE_MODEL":
        X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.model_set!="ensemble", "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_train = hstack([X_train_column_sparse, X_train_tfidf])
    elif FEATURE_MODE=="CODE":
        X_train = X_train_tfidf
        
    y_train = cv_df.loc[cv_df.model_set!="ensemble", "rouge"]
        
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
        pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL) 
        
    for model in t_models:
        print(model)
        preds_df = step_two(experiment_config=experiment_config,
                            X_train=X_train,
                            y_train=y_train,
                            model=model,
                            save=True)
        
def pred_perf(experiment_config,
              X,
              model): 

    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]

    with open(f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl','rb') as f:
            reg = pickle.load(f)

    y_pred = reg.predict(X)
    y_pred[y_pred<0] = 0
    return y_pred

def meta_predict(experiment_config:dict, 
                 test_df: pd.DataFrame,
                 base_models_names: list,
                 t_models:list = ["lr", "svm", "lgbm", "catboost"]) -> pd.DataFrame:

    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]
    
    for model_i, model_set in enumerate(base_models_names):

        set_df = test_df.copy()
        set_df["model_set"] = model_set
        # Prepare the input data
        with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "rb") as file:
            vectorizer = pickle.load(file)

        if model_i==0:
            meta_preds_df = set_df.copy()
        else: 
            meta_preds_df = pd.concat([meta_preds_df, set_df])
            
    X_test_tfidf = vectorizer.transform(meta_preds_df.loc[:, "input_sequence"])
    if FEATURE_MODE=="CODE_MODEL":
        X_test_column_sparse = pd.get_dummies(meta_preds_df.loc[:, "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_test = hstack([X_test_column_sparse, X_test_tfidf])
    elif FEATURE_MODE=="CODE":
        X_test = X_test_tfidf

    for model in t_models:
        print(model)
        meta_preds_df[f"{model}_preds"] = pred_perf(experiment_config=experiment_config, 
                                                    X=X_test,
                                                    model=model)

    meta_preds_df = meta_preds_df.reset_index(drop=True)
    return meta_preds_df

In [3]:
with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/cv_results.pickle", "rb") as handle:
    cv_predictions = pickle.load(handle)

with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/test_results.pickle", "rb") as handle:
    test_predictions = pickle.load(handle)


In [4]:
COLUMNS_TEST = ['question_id', 'parent_answer_post_id', 'prob', 'input_sequence',
       'output_sequence', 'id', 'snippet_len', 'intent_len', 'snippet_token_n',
       'intent_token_n', 'cluster', 'input_ids', 'attention_mask', 'labels',
       'prediction', 'rouge', 'model_set']

COLUMNS_CV = COLUMNS_TEST.copy()
COLUMNS_CV.append("fold")

#### Preprocessing

In [5]:
cv_predictions = cv_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_CV]
test_predictions = test_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_TEST]

# Code Only

We have 9 base lerner settings models that we compare learning of 1, splitting to two meta models,  all together. 

In [6]:
MODELS_LIST = [0, 1, 2, 5, 10, 'cluster_[0]', 'cluster_[3]', 'cluster_[0, 3]']
MODE = ["ONE-BY-ONE", "TWO-MODELS", "ALL"]

In [7]:
results_cv_df = pd.DataFrame()

for model_base in MODELS_LIST:

    temp_df =  cv_predictions.loc[cv_predictions.model_set==model_base]
    temp_df = cv_step_2(experiment_config=experiment_config,
              cv_df=temp_df)

    for model_meta in t_models:
        for cluster in sorted(temp_df.cluster.unique()):

            print(cluster)
            cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]


            mae = mean_absolute_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_perf_hat"])
            
            rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_perf_hat"]))
            
            t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": cluster, "rmse": rmse, "mae": mae}, index=[0])


            results_cv_df = pd.concat([results_cv_df, t_res], axis=0)
        


    for model_meta in t_models:


        mae = mean_absolute_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_perf_hat"])
        
        rmse = math.sqrt(mean_squared_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_perf_hat"]))
        
        t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": "full", "rmse": rmse, "mae": mae,}, index=[0])


        results_cv_df = pd.concat([results_cv_df, t_res], axis=0)

results_cv_df = results_cv_df.sort_values(["model_meta", "cluster"])


0
lr
svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 137
[LightGBM] [Info] Start training from score 0.130477
catboost
Learning rate set to 0.052224


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


0:	learn: 0.1367718	total: 55.2ms	remaining: 55.1s
1:	learn: 0.1365993	total: 61.5ms	remaining: 30.7s
2:	learn: 0.1364633	total: 67.6ms	remaining: 22.5s
3:	learn: 0.1362696	total: 73.1ms	remaining: 18.2s
4:	learn: 0.1361340	total: 79.3ms	remaining: 15.8s
5:	learn: 0.1359809	total: 85ms	remaining: 14.1s
6:	learn: 0.1358420	total: 90.6ms	remaining: 12.9s
7:	learn: 0.1356787	total: 97ms	remaining: 12s
8:	learn: 0.1355912	total: 102ms	remaining: 11.3s
9:	learn: 0.1354412	total: 108ms	remaining: 10.7s
10:	learn: 0.1353112	total: 114ms	remaining: 10.2s
11:	learn: 0.1352077	total: 119ms	remaining: 9.84s
12:	learn: 0.1350857	total: 126ms	remaining: 9.54s
13:	learn: 0.1349548	total: 132ms	remaining: 9.27s
14:	learn: 0.1349132	total: 138ms	remaining: 9.03s
15:	learn: 0.1348129	total: 143ms	remaining: 8.8s
16:	learn: 0.1346944	total: 149ms	remaining: 8.61s
17:	learn: 0.1346767	total: 154ms	remaining: 8.4s
18:	learn: 0.1346483	total: 161ms	remaining: 8.29s
19:	learn: 0.1346243	total: 166ms	remaini

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.041770 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 137
[LightGBM] [Info] Start training from score 0.128817
catboost
Learning rate set to 0.052226
0:	learn: 0.1379647	total: 7.73ms	remaining: 7.72s
1:	learn: 0.1377920	total: 13.5ms	remaining: 6.73s
2:	learn: 0.1376220	total: 19.5ms	remaining: 6.48s
3:	learn: 0.1374591	total: 25ms	remaining: 6.23s
4:	learn: 0.1373101	total: 31.1ms	remaining: 6.2s
5:	learn: 0.1371604	total: 36.7ms	remaining: 6.09s
6:	learn: 0.1370541	total: 42.6ms	remaining: 6.04s
7:	learn: 0.1369163	total: 48.3ms	remaining: 5.98s
8:	learn: 0.1368403	total: 53.9ms	remaining: 5.93s
9:	learn: 0.1366991	total: 59.4ms	remaining: 5.88s
10:	learn: 0.1365975	total: 64.9ms	remaining: 5.83s
11:	learn: 0.1364813	total: 70.3ms	remaining: 5.79s

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 137
[LightGBM] [Info] Start training from score 0.269834
catboost
Learning rate set to 0.052224
0:	learn: 0.1585684	total: 8.51ms	remaining: 8.5s
1:	learn: 0.1584701	total: 14.2ms	remaining: 7.09s
2:	learn: 0.1583648	total: 19.7ms	remaining: 6.55s
3:	learn: 0.1582166	total: 25.4ms	remaining: 6.32s
4:	learn: 0.1581965	total: 31.2ms	remaining: 6.2s
5:	learn: 0.1581211	total: 36.6ms	remaining: 6.07s
6:	learn: 0.1580299	total: 42.3ms	remaining: 5.99s
7:	learn: 0.1579520	total: 48ms	remaining: 5.96s
8:	learn: 0.1578671	total: 53.9ms	remaining: 5.93s
9:	learn: 0.1577826	total: 59.7ms	remaining: 5.91s
10:	learn: 0.1576835	total: 65.3ms	remaining

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.000066 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 137
[LightGBM] [Info] Start training from score 0.269614
catboost
Learning rate set to 0.052226
0:	learn: 0.1634687	total: 8.29ms	remaining: 8.28s
1:	learn: 0.1633535	total: 14.2ms	remaining: 7.11s
2:	learn: 0.1632700	total: 20.2ms	remaining: 6.71s
3:	learn: 0.1631448	total: 25.8ms	remaining: 6.42s
4:	learn: 0.1630468	total: 31.6ms	remaining: 6.29s
5:	learn: 0.1629687	total: 37.5ms	remaining: 6.22s
6:	learn: 0.1629025	total: 43.2ms	remaining: 6.13s
7:	learn: 0.1628160	total: 48.7ms	remaining: 6.03s
8:	learn: 0.1627318	total: 54ms	remaining: 5.95s
9:	learn: 0.1626994	total: 59.9ms	remaining: 5.93s
10:	learn: 0.1625959	total: 65.7ms	rem

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.157531 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 137
[LightGBM] [Info] Start training from score 0.279299
catboost
Learning rate set to 0.052224
0:	learn: 0.1608962	total: 7.41ms	remaining: 7.4s
1:	learn: 0.1607899	total: 13.2ms	remaining: 6.6s
2:	learn: 0.1607043	total: 18.6ms	remaining: 6.19s
3:	learn: 0.1606156	total: 24.3ms	remaining: 6.04s
4:	learn: 0.1605414	total: 29.9ms	remaining: 5.95s
5:	learn: 0.1604736	total: 35.3ms	remaining: 5.85s
6:	learn: 0.1603947	total: 40.8ms	remaining: 5.79s
7:	learn: 0.1602871	total: 46.6ms	remaining: 5.77s
8:	learn: 0.1602035	total: 51.7ms	remaining: 5.69s
9:	learn: 0.1601150	total: 57.2ms	remaining: 5.66s
10:	learn: 0.1600092	total: 63.1ms	remaini

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


23:	learn: 0.1593217	total: 134ms	remaining: 5.44s
24:	learn: 0.1592596	total: 139ms	remaining: 5.43s
25:	learn: 0.1592219	total: 144ms	remaining: 5.41s
26:	learn: 0.1591797	total: 150ms	remaining: 5.4s
27:	learn: 0.1591352	total: 155ms	remaining: 5.38s
28:	learn: 0.1590990	total: 160ms	remaining: 5.37s
29:	learn: 0.1590446	total: 166ms	remaining: 5.36s
30:	learn: 0.1589944	total: 172ms	remaining: 5.37s
31:	learn: 0.1589741	total: 177ms	remaining: 5.35s
32:	learn: 0.1589313	total: 182ms	remaining: 5.34s
33:	learn: 0.1588720	total: 187ms	remaining: 5.33s
34:	learn: 0.1588413	total: 193ms	remaining: 5.31s
35:	learn: 0.1587896	total: 198ms	remaining: 5.31s
36:	learn: 0.1587440	total: 204ms	remaining: 5.3s
37:	learn: 0.1586908	total: 209ms	remaining: 5.29s
38:	learn: 0.1586573	total: 214ms	remaining: 5.28s
39:	learn: 0.1586210	total: 220ms	remaining: 5.27s
40:	learn: 0.1585807	total: 225ms	remaining: 5.27s
41:	learn: 0.1585376	total: 231ms	remaining: 5.26s
42:	learn: 0.1585060	total: 236ms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 137
[LightGBM] [Info] Start training from score 0.278963
catboost
Learning rate set to 0.052226
0:	learn: 0.1652550	total: 7.44ms	remaining: 7.43s
1:	learn: 0.1651335	total: 13.4ms	remaining: 6.67s
2:	learn: 0.1650511	total: 19.2ms	remaining: 6.38s
3:	learn: 0.1649443	total: 25ms	remaining: 6.24s
4:	learn: 0.1648540	total: 30.8ms	remaining: 6.14s
5:	learn: 0.1647585	total: 36.6ms	remaining: 6.07s
6:	learn: 0.1646771	total: 42.7ms	remaining: 6.05s
7:	learn: 0.1645718	total: 48.2ms	remaining: 5.98s
8:	learn: 0.1644905	total: 54.1ms	remaining: 5.96s
9:	learn: 0.1644089	total: 59.8ms	remaining: 5.92s
10:	learn: 0.1643274	total: 65.3ms	remaining: 5.87s
11:	learn: 0.1643092	total: 70.9ms	remaining: 5.83

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 137
[LightGBM] [Info] Start training from score 0.283733
catboost
Learning rate set to 0.052224
0:	learn: 0.1636997	total: 7.41ms	remaining: 7.4s
1:	learn: 0.1635727	total: 12.9ms	remaining: 6.42s
2:	learn: 0.1634759	total: 18.3ms	remaining: 6.08s
3:	learn: 0.1633451	total: 23.8ms	remaining: 5.92s
4:	learn: 0.1632496	total: 30.1ms	remaining: 5.99s
5:	learn: 0.1631581	total: 35.8ms	remaining: 5.94s
6:	learn: 0.1630818	total: 41.3ms	remaining: 5.85s
7:	learn: 0.1630041	total: 46.9ms	remaining: 5.82s
8:	learn: 0.1629364	total: 53.2ms	remaining: 5.86s
9:	learn: 0.1628446	total: 58.9ms	remaining: 5.83s
10:	learn: 0.1627557	total: 64.5ms	remaining: 5.8s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


11:	learn: 0.1626723	total: 91.7ms	remaining: 7.55s
12:	learn: 0.1626175	total: 97.4ms	remaining: 7.4s
13:	learn: 0.1625367	total: 103ms	remaining: 7.25s
14:	learn: 0.1624848	total: 108ms	remaining: 7.11s
15:	learn: 0.1624335	total: 114ms	remaining: 6.99s
16:	learn: 0.1623686	total: 119ms	remaining: 6.88s
17:	learn: 0.1622944	total: 124ms	remaining: 6.79s
18:	learn: 0.1622432	total: 131ms	remaining: 6.75s
19:	learn: 0.1622020	total: 136ms	remaining: 6.68s
20:	learn: 0.1621560	total: 142ms	remaining: 6.62s
21:	learn: 0.1620950	total: 148ms	remaining: 6.58s
22:	learn: 0.1620377	total: 154ms	remaining: 6.53s
23:	learn: 0.1619785	total: 159ms	remaining: 6.48s
24:	learn: 0.1619168	total: 165ms	remaining: 6.43s
25:	learn: 0.1618674	total: 171ms	remaining: 6.4s
26:	learn: 0.1618288	total: 177ms	remaining: 6.36s
27:	learn: 0.1617715	total: 182ms	remaining: 6.33s
28:	learn: 0.1617308	total: 188ms	remaining: 6.29s
29:	learn: 0.1616859	total: 194ms	remaining: 6.27s
30:	learn: 0.1616715	total: 199

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 137
[LightGBM] [Info] Start training from score 0.283892
catboost
Learning rate set to 0.052226
0:	learn: 0.1647735	total: 7.47ms	remaining: 7.46s
1:	learn: 0.1646554	total: 13.2ms	remaining: 6.59s
2:	learn: 0.1645642	total: 18.7ms	remaining: 6.22s
3:	learn: 0.1644516	total: 24.2ms	remaining: 6.01s
4:	learn: 0.1643779	total: 29.6ms	remaining: 5.9s
5:	learn: 0.1642744	total: 35.3ms	remaining: 5.85s
6:	learn: 0.1641924	total: 40.7ms	remaining: 5.77s
7:	learn: 0.1640886	total: 46ms	remaining: 5.71s
8:	learn: 0.1639968	total: 51.4ms	remaining: 5.66s
9:	learn: 0.1639086	total: 56.8ms	remaining: 5.63s
10:	learn: 0.1638347	total: 62.2ms	remaining: 5.59s
11:	learn: 0.1637185	total: 67.6ms	remaining: 5.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 137
[LightGBM] [Info] Start training from score 0.283675
catboost
Learning rate set to 0.052224
0:	learn: 0.1650314	total: 7.49ms	remaining: 7.48s
1:	learn: 0.1649378	total: 13.4ms	remaining: 6.68s
2:	learn: 0.1648624	total: 19ms	remaining: 6.33s
3:	learn: 0.1647548	total: 24.7ms	remaining: 6.14s
4:	learn: 0.1646741	total: 30.6ms	remaining: 6.09s
5:	learn: 0.1645791	total: 36.4ms	remaining: 6.03s
6:	learn: 0.1645014	total: 42ms	remaining: 5.96s
7:	learn: 0.1644236	total: 47.7ms	remaining: 5.91s
8:	learn: 0.1643545	total: 53.5ms	remaining: 5.89s
9:	learn: 0.1642369	total: 58.8ms	remaining: 5.82s
10:	learn: 0.1641612	total: 64.3ms	remaining

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


16:	learn: 0.1637864	total: 97.6ms	remaining: 5.64s
17:	learn: 0.1637294	total: 103ms	remaining: 5.64s
18:	learn: 0.1636700	total: 109ms	remaining: 5.62s
19:	learn: 0.1636211	total: 115ms	remaining: 5.62s
20:	learn: 0.1635718	total: 120ms	remaining: 5.6s
21:	learn: 0.1635543	total: 126ms	remaining: 5.59s
22:	learn: 0.1634983	total: 131ms	remaining: 5.58s
23:	learn: 0.1634467	total: 137ms	remaining: 5.58s
24:	learn: 0.1633817	total: 143ms	remaining: 5.57s
25:	learn: 0.1633401	total: 149ms	remaining: 5.58s
26:	learn: 0.1633029	total: 155ms	remaining: 5.58s
27:	learn: 0.1632514	total: 161ms	remaining: 5.58s
28:	learn: 0.1632133	total: 166ms	remaining: 5.56s
29:	learn: 0.1631303	total: 172ms	remaining: 5.55s
30:	learn: 0.1630667	total: 177ms	remaining: 5.54s
31:	learn: 0.1630244	total: 183ms	remaining: 5.53s
32:	learn: 0.1629782	total: 188ms	remaining: 5.51s
33:	learn: 0.1629254	total: 194ms	remaining: 5.5s
34:	learn: 0.1628735	total: 199ms	remaining: 5.5s
35:	learn: 0.1628133	total: 205ms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003185 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 137
[LightGBM] [Info] Start training from score 0.284032
catboost
Learning rate set to 0.052226
0:	learn: 0.1644722	total: 7.97ms	remaining: 7.97s
1:	learn: 0.1643412	total: 14.4ms	remaining: 7.16s
2:	learn: 0.1642635	total: 20.2ms	remaining: 6.7s
3:	learn: 0.1641426	total: 25.9ms	remaining: 6.44s
4:	learn: 0.1640664	total: 31.4ms	remaining: 6.25s
5:	learn: 0.1639579	total: 37ms	remaining: 6.13s
6:	learn: 0.1638654	total: 42.7ms	remaining: 6.06s
7:	learn: 0.1637981	total: 48.6ms	remaining: 6.02s
8:	learn: 0.1637115	total: 53.9ms	remaining: 5.94s
9:	learn: 0.1636243	total: 59.3ms	remaining: 5.87s
10:	learn: 0.1635729	total: 64.5ms	rema

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.128121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 137
[LightGBM] [Info] Start training from score 0.197260
catboost


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


Learning rate set to 0.052224
0:	learn: 0.1604323	total: 7.55ms	remaining: 7.54s
1:	learn: 0.1602421	total: 13.3ms	remaining: 6.64s
2:	learn: 0.1600779	total: 18.9ms	remaining: 6.29s
3:	learn: 0.1599347	total: 24.7ms	remaining: 6.14s
4:	learn: 0.1598946	total: 30.3ms	remaining: 6.03s
5:	learn: 0.1597467	total: 35.7ms	remaining: 5.91s
6:	learn: 0.1596162	total: 41.5ms	remaining: 5.89s
7:	learn: 0.1595133	total: 47.4ms	remaining: 5.87s
8:	learn: 0.1594002	total: 53ms	remaining: 5.83s
9:	learn: 0.1592791	total: 58.7ms	remaining: 5.81s
10:	learn: 0.1591372	total: 64.3ms	remaining: 5.79s
11:	learn: 0.1590304	total: 70.6ms	remaining: 5.81s
12:	learn: 0.1589969	total: 76ms	remaining: 5.77s
13:	learn: 0.1589613	total: 81.3ms	remaining: 5.72s
14:	learn: 0.1588660	total: 86.5ms	remaining: 5.68s
15:	learn: 0.1587887	total: 91.9ms	remaining: 5.65s
16:	learn: 0.1587391	total: 97.5ms	remaining: 5.64s
17:	learn: 0.1586802	total: 103ms	remaining: 5.63s
18:	learn: 0.1586274	total: 109ms	remaining: 5.61

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.009599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 137
[LightGBM] [Info] Start training from score 0.196306
catboost
Learning rate set to 0.052226
0:	learn: 0.1607780	total: 7.28ms	remaining: 7.27s
1:	learn: 0.1605729	total: 13ms	remaining: 6.49s
2:	learn: 0.1604261	total: 18.9ms	remaining: 6.26s
3:	learn: 0.1603029	total: 24.5ms	remaining: 6.1s
4:	learn: 0.1601420	total: 29.9ms	remaining: 5.95s
5:	learn: 0.1599974	total: 35.4ms	remaining: 5.86s
6:	learn: 0.1599269	total: 40.7ms	remaining: 5.78s
7:	learn: 0.1598900	total: 46.1ms	remaining: 5.72s
8:	learn: 0.1598164	total: 52.1ms	remaining: 5.74s
9:	learn: 0.1597920	total: 57.9ms	remaining: 5.73s
10:	learn: 0.1596810	total: 63.7ms	r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003078 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 137
[LightGBM] [Info] Start training from score 0.249255
catboost
Learning rate set to 0.052224
0:	learn: 0.1484304	total: 7.01ms	remaining: 7.01s
1:	learn: 0.1483164	total: 12.5ms	remaining: 6.26s
2:	learn: 0.1482078	total: 17.9ms	remaining: 5.94s
3:	learn: 0.1480693	total: 23.2ms	remaining: 5.78s
4:	learn: 0.1479665	total: 28.7ms	remaining: 5.72s
5:	learn: 0.1478952	total: 34.1ms	remaining: 5.64s
6:	learn: 0.1478064	total: 39.9ms	remaining: 5.66s
7:	learn: 0.1476980	total: 45.5ms	remaining: 5.64s
8:	learn: 0.1476307	total: 51.2ms	remaining: 5.64s
9:	learn: 0.1475184	total: 56.8ms	remaining: 5.62s
10:	learn: 0.1474144	total: 62.6ms	remai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


24:	learn: 0.1465765	total: 140ms	remaining: 5.44s
25:	learn: 0.1465323	total: 146ms	remaining: 5.45s
26:	learn: 0.1464676	total: 151ms	remaining: 5.45s
27:	learn: 0.1464294	total: 157ms	remaining: 5.44s
28:	learn: 0.1463889	total: 162ms	remaining: 5.44s
29:	learn: 0.1463717	total: 168ms	remaining: 5.43s
30:	learn: 0.1462961	total: 173ms	remaining: 5.42s
31:	learn: 0.1462847	total: 179ms	remaining: 5.41s
32:	learn: 0.1462298	total: 185ms	remaining: 5.41s
33:	learn: 0.1461763	total: 190ms	remaining: 5.41s
34:	learn: 0.1461235	total: 196ms	remaining: 5.4s
35:	learn: 0.1460896	total: 201ms	remaining: 5.39s
36:	learn: 0.1460716	total: 207ms	remaining: 5.38s
37:	learn: 0.1460252	total: 212ms	remaining: 5.37s
38:	learn: 0.1459896	total: 217ms	remaining: 5.36s
39:	learn: 0.1459765	total: 223ms	remaining: 5.34s
40:	learn: 0.1459520	total: 228ms	remaining: 5.33s
41:	learn: 0.1459300	total: 233ms	remaining: 5.33s
42:	learn: 0.1458978	total: 239ms	remaining: 5.31s
43:	learn: 0.1458696	total: 244m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 137
[LightGBM] [Info] Start training from score 0.249333
catboost
Learning rate set to 0.052226
0:	learn: 0.1515832	total: 7.04ms	remaining: 7.04s
1:	learn: 0.1514599	total: 12.8ms	remaining: 6.4s
2:	learn: 0.1513778	total: 18.7ms	remaining: 6.22s
3:	learn: 0.1512502	total: 24.4ms	remaining: 6.08s
4:	learn: 0.1511528	total: 29.8ms	remaining: 5.92s
5:	learn: 0.1510710	total: 35ms	remaining: 5.8s
6:	learn: 0.1509908	total: 40.4ms	remaining: 5.73s
7:	learn: 0.1508859	total: 45.8ms	remaining: 5.67s
8:	learn: 0.1508005	total: 51.2ms	remaining: 5.63s
9:	learn: 0.1507305	total: 56.8ms	remaining: 5.62s
10:	learn: 0.1506403	total: 62.5ms	remaining: 5.62s
11:	learn: 0.1505600	total: 68.4ms	remaining: 5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003249 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2548
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 137
[LightGBM] [Info] Start training from score 0.270370
catboost
Learning rate set to 0.052224
0:	learn: 0.1592903	total: 6.86ms	remaining: 6.86s
1:	learn: 0.1591582	total: 12.3ms	remaining: 6.13s
2:	learn: 0.1590533	total: 17.7ms	remaining: 5.9s
3:	learn: 0.1589375	total: 23ms	remaining: 5.72s
4:	learn: 0.1588243	total: 28.8ms	remaining: 5.74s
5:	learn: 0.1587430	total: 34.6ms	remaining: 5.74s
6:	learn: 0.1586618	total: 40.3ms	remaining: 5.71s
7:	learn: 0.1585684	total: 46.1ms	remaining: 5.72s
8:	learn: 0.1585093	total: 51.7ms	remaining: 5.69s
9:	learn: 0.1584304	total: 57.2ms	remaining: 5.66s
10:	learn: 0.1583565	total: 63.1ms	remaining: 5.67s
11:	learn: 0.1582436	total: 68.6ms	remaining: 5.65s
12:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


13:	learn: 0.1581268	total: 79.8ms	remaining: 5.62s
14:	learn: 0.1580780	total: 85.3ms	remaining: 5.6s
15:	learn: 0.1580194	total: 91ms	remaining: 5.6s
16:	learn: 0.1579794	total: 96.3ms	remaining: 5.57s
17:	learn: 0.1579543	total: 102ms	remaining: 5.55s
18:	learn: 0.1579007	total: 107ms	remaining: 5.53s
19:	learn: 0.1578879	total: 112ms	remaining: 5.5s
20:	learn: 0.1578236	total: 118ms	remaining: 5.48s
21:	learn: 0.1577764	total: 123ms	remaining: 5.46s
22:	learn: 0.1577578	total: 128ms	remaining: 5.46s
23:	learn: 0.1577144	total: 134ms	remaining: 5.46s
24:	learn: 0.1576698	total: 140ms	remaining: 5.46s
25:	learn: 0.1576198	total: 145ms	remaining: 5.45s
26:	learn: 0.1575749	total: 151ms	remaining: 5.44s
27:	learn: 0.1575230	total: 157ms	remaining: 5.44s
28:	learn: 0.1574959	total: 163ms	remaining: 5.44s
29:	learn: 0.1574766	total: 168ms	remaining: 5.43s
30:	learn: 0.1574317	total: 174ms	remaining: 5.42s
31:	learn: 0.1574157	total: 179ms	remaining: 5.42s
32:	learn: 0.1573634	total: 185m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.091505 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2487
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 137
[LightGBM] [Info] Start training from score 0.270058
catboost
Learning rate set to 0.052226
0:	learn: 0.1625356	total: 7.08ms	remaining: 7.07s
1:	learn: 0.1624014	total: 12.9ms	remaining: 6.42s
2:	learn: 0.1622749	total: 18.6ms	remaining: 6.19s
3:	learn: 0.1621410	total: 24.3ms	remaining: 6.04s
4:	learn: 0.1620477	total: 29.9ms	remaining: 5.94s
5:	learn: 0.1619785	total: 35.2ms	remaining: 5.84s
6:	learn: 0.1618966	total: 40.7ms	remaining: 5.78s
7:	learn: 0.1618750	total: 46.3ms	remaining: 5.74s
8:	learn: 0.1618576	total: 51.8ms	remaining: 5.71s
9:	learn: 0.1617617	total: 57.6ms	remaining: 5.71s
10:	learn: 0.1616672	total: 63.4ms	r

In [8]:
print("MAE")
display(results_cv_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe())

MAE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,8.0,0.136224,0.009917,0.119762,0.134172,0.137363,0.138562,0.153295
1,catboost,1,8.0,0.128335,0.00832,0.111877,0.124057,0.13182,0.134226,0.135463
2,catboost,2,8.0,0.121364,0.008774,0.1025,0.118518,0.125172,0.126679,0.128958
3,catboost,3,8.0,0.10873,0.008166,0.091259,0.108557,0.11188,0.113468,0.115176
4,catboost,full,8.0,0.125618,0.007792,0.108553,0.124039,0.12887,0.130864,0.131092
5,lgbm,0,8.0,0.136895,0.010643,0.11845,0.134586,0.138632,0.139967,0.154445
6,lgbm,1,8.0,0.12829,0.008184,0.111699,0.12496,0.131633,0.133762,0.135322
7,lgbm,2,8.0,0.121284,0.009362,0.100687,0.119318,0.125207,0.126331,0.128996
8,lgbm,3,8.0,0.109599,0.008146,0.091563,0.110043,0.112599,0.113667,0.115793
9,lgbm,full,8.0,0.125841,0.008039,0.107918,0.124949,0.129014,0.130772,0.131566


In [9]:
print("RMSE")
display(results_cv_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe())

RMSE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,8.0,0.167769,0.014481,0.146193,0.163787,0.169459,0.17107,0.194607
1,catboost,1,8.0,0.159824,0.009918,0.139287,0.157077,0.163379,0.166263,0.168843
2,catboost,2,8.0,0.152412,0.011434,0.128535,0.149768,0.157771,0.159674,0.16078
3,catboost,3,8.0,0.138009,0.010653,0.117105,0.13603,0.142632,0.14496,0.145775
4,catboost,full,8.0,0.157066,0.009724,0.135606,0.156927,0.160654,0.163041,0.164033
5,lgbm,0,8.0,0.168116,0.015345,0.14451,0.164083,0.170499,0.171471,0.196142
6,lgbm,1,8.0,0.159849,0.010075,0.138849,0.157456,0.163823,0.166281,0.168393
7,lgbm,2,8.0,0.152192,0.01197,0.126823,0.150667,0.157452,0.159229,0.160987
8,lgbm,3,8.0,0.138399,0.010865,0.115813,0.137351,0.142885,0.145017,0.146398
9,lgbm,full,8.0,0.157165,0.01012,0.134574,0.157561,0.160911,0.162988,0.164188


In [10]:
results_test_df = pd.DataFrame()


for model_base in MODELS_LIST:

    cv_temp_df =  cv_predictions.loc[cv_predictions.model_set==model_base]
    temp_df =  test_predictions.loc[test_predictions.model_set==model_base]
    full_step_2(experiment_config=experiment_config,
                            cv_df=cv_temp_df,
                            t_models=t_models)
    temp_df = meta_predict(experiment_config=experiment_config,
                            test_df=temp_df,
                            base_models_names=[model_base],
                            t_models=t_models)

    for model_meta in t_models:
        for cluster in sorted(temp_df.cluster.unique()):

            print(cluster)
            cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]


            mae = mean_absolute_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_preds"])
            
            rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_preds"]))
            
            t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": cluster, "rmse": rmse, "mae": mae}, index=[0])

            results_test_df = pd.concat([results_test_df, t_res], axis=0)
        
    for model_meta in t_models:


        mae = mean_absolute_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_preds"])
        
        rmse = math.sqrt(mean_squared_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_preds"]))
        
        t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": "full", "rmse": rmse, "mae": mae,}, index=[0])

        results_test_df = pd.concat([results_test_df, t_res], axis=0)


results_test_df = results_test_df.sort_values(["model_meta", "cluster"])


lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4408
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 219
[LightGBM] [Info] Start training from score 0.130373
catboost
Learning rate set to 0.055681
0:	learn: 0.1374716	total: 8.88ms	remaining: 8.87s
1:	learn: 0.1372441	total: 16.1ms	remaining: 8.04s
2:	learn: 0.1371071	total: 23ms	remaining: 7.64s
3:	learn: 0.1369195	total: 29.7ms	remaining: 7.38s
4:	learn: 0.1367959	total: 36.3ms	remaining: 7.22s
5:	learn: 0.1366091	total: 42.9ms	remaining: 7.11s
6:	learn: 0.1364451	total: 49.5ms	remaining: 7.03s
7:	learn: 0.1363480	total: 56.5ms	remaining: 7.01s
8:	learn: 0.1362049	total: 63.4ms	remaining: 6.98s
9:	learn: 0.1360789	total: 70.1ms	remaining: 6.94s
10:	learn: 0.1359911	total: 76.8ms	

In [11]:
print("MAE")
display(results_test_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe())

MAE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,8.0,0.132662,0.008924,0.119197,0.129715,0.132969,0.133471,0.150981
1,catboost,1,8.0,0.126923,0.009509,0.108011,0.124531,0.129911,0.133397,0.135626
2,catboost,2,8.0,0.122449,0.011715,0.098147,0.118868,0.126723,0.130243,0.132317
3,catboost,3,8.0,0.10564,0.007677,0.08996,0.103436,0.109605,0.110702,0.110967
4,catboost,full,8.0,0.123543,0.008488,0.105069,0.122284,0.126577,0.129312,0.129578
5,lgbm,0,8.0,0.131936,0.010041,0.116099,0.129422,0.131354,0.134089,0.151915
6,lgbm,1,8.0,0.126405,0.00894,0.109315,0.123574,0.129065,0.132146,0.135824
7,lgbm,2,8.0,0.124397,0.01176,0.100518,0.120946,0.129569,0.132381,0.133763
8,lgbm,3,8.0,0.10445,0.007537,0.088491,0.103016,0.107948,0.108841,0.110206
9,lgbm,full,8.0,0.12336,0.008295,0.105347,0.121984,0.126199,0.128531,0.130213


In [12]:
print("RMSE")
display(results_test_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe())

RMSE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,8.0,0.163204,0.013271,0.145527,0.159043,0.162588,0.163627,0.192039
1,catboost,1,8.0,0.158178,0.012432,0.132725,0.154895,0.162572,0.16616,0.16927
2,catboost,2,8.0,0.153548,0.015705,0.121934,0.148432,0.160391,0.163774,0.16633
3,catboost,3,8.0,0.132716,0.009992,0.11263,0.13074,0.137377,0.13889,0.140683
4,catboost,full,8.0,0.154397,0.011054,0.129966,0.153191,0.158723,0.16085,0.162807
5,lgbm,0,8.0,0.161925,0.014327,0.142221,0.156875,0.161137,0.164641,0.19156
6,lgbm,1,8.0,0.157567,0.011838,0.133947,0.154668,0.161171,0.16486,0.168784
7,lgbm,2,8.0,0.156519,0.015635,0.126178,0.151791,0.163796,0.166912,0.168642
8,lgbm,3,8.0,0.130864,0.01032,0.109717,0.129654,0.135331,0.136595,0.140007
9,lgbm,full,8.0,0.154198,0.010938,0.130282,0.153378,0.157632,0.160571,0.163048
