In [1]:
import sys
sys.path.append("../")
import numpy as np
import time
import pandas as pd
import pickle
import math
from typing import Tuple


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.sparse import hstack

ANALYSIS_POSTFIX = "mined_sudden_2024-08-26"

experiment_config = {
    "RS" : 42,
    "ANALYSIS_POSTFIX": ANALYSIS_POSTFIX,
    "FEATURE_MODE" : "CODE", # CODE_MODEL
}

In [2]:
def step_two(experiment_config, 
             X_train,
             y_train,
             model,
             X_val=None,
             y_val=None,
             save=False): 
    
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    
    training_start_time = time.time()
    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor(max_depth=10, silent=True)
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)
    training_end_time = time.time()
    time_training = training_end_time - training_start_time

    
    if save:
        with open(f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl','wb') as f:
            pickle.dump(reg, f)
        return f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
    
    else:
        inference_start_time = time.time()
        y_pred = reg.predict(X_val)
        inference_end_time = time.time()
        time_inference = inference_end_time - inference_start_time

        y_pred[y_pred<0] = 0
        mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
        rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
        return {"pred": y_pred, "mae": mae, "rmse": rmse, "time_training" : time_training, "time_inference" : time_inference}
    

def cv_step_2(experiment_config:dict, cv_df:pd.DataFrame) -> Tuple:

    t_models = ["lr", "svm", "lgbm", "catboost"]

    results = {}

    FEATURE_MODE = experiment_config["FEATURE_MODE"]

    for test_fold in range(cv_df.fold.max()+1):
        print(test_fold)

        # Prepare the input data
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])

        if FEATURE_MODE=="CODE_MODEL":
            X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold!=test_fold, "model_set"], sparse=True).sparse.to_coo().tocsr()
            X_train = hstack([X_train_column_sparse, X_train_tfidf])
        elif FEATURE_MODE=="CODE":
            X_train = X_train_tfidf
            
        y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]
        
        X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
        if FEATURE_MODE=="CODE_MODEL":
            X_val_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold==test_fold, "model_set"], sparse=True).sparse.to_coo().tocsr()
            X_val = hstack([X_val_column_sparse, X_val_tfidf])
        elif FEATURE_MODE=="CODE":
            X_val = X_val_tfidf
            
        y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

        results[test_fold] = {}
        for model in t_models:
            print(model)
            preds_df = step_two(experiment_config=experiment_config,
                                X_train=X_train,
                                y_train=y_train,
                                X_val=X_val,
                                y_val=y_val,
                                model=model)
            cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
            results[test_fold][model] = preds_df

    cv_df = cv_df.reset_index(drop=True)

    return cv_df

def full_step_2(cv_df:pd.DataFrame,
                experiment_config:dict) -> None:
    
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    # TRAIN ON ALL PREDICTIONS AT ONCE

    t_models = ["lr", "svm", "lgbm", "catboost"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.model_set!="ensemble", "input_sequence"])
    if FEATURE_MODE=="CODE_MODEL":
        X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.model_set!="ensemble", "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_train = hstack([X_train_column_sparse, X_train_tfidf])
    elif FEATURE_MODE=="CODE":
        X_train = X_train_tfidf
        
    y_train = cv_df.loc[cv_df.model_set!="ensemble", "rouge"]
        
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
        pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL) 
        
    for model in t_models:
        print(model)
        preds_df = step_two(experiment_config=experiment_config,
                            X_train=X_train,
                            y_train=y_train,
                            model=model,
                            save=True)
        
def pred_perf(experiment_config,
              X,
              model): 

    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]

    with open(f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl','rb') as f:
            reg = pickle.load(f)

    y_pred = reg.predict(X)
    y_pred[y_pred<0] = 0
    return y_pred

def meta_predict(experiment_config:dict, 
                 test_df: pd.DataFrame,
                 base_models_names: list,
                 t_models:list = ["svm", "catboost"]) -> pd.DataFrame:

    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]
    
    for model_i, model_set in enumerate(base_models_names):

        set_df = test_df.copy()
        set_df["model_set"] = model_set
        # Prepare the input data
        with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "rb") as file:
            vectorizer = pickle.load(file)

        if model_i==0:
            meta_preds_df = set_df.copy()
        else: 
            meta_preds_df = pd.concat([meta_preds_df, set_df])
            
    X_test_tfidf = vectorizer.transform(meta_preds_df.loc[:, "input_sequence"])
    if FEATURE_MODE=="CODE_MODEL":
        X_test_column_sparse = pd.get_dummies(meta_preds_df.loc[:, "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_test = hstack([X_test_column_sparse, X_test_tfidf])
    elif FEATURE_MODE=="CODE":
        X_test = X_test_tfidf

    for model in t_models:
        print(model)
        meta_preds_df[f"{model}_preds"] = pred_perf(experiment_config=experiment_config, 
                                                    X=X_test,
                                                    model=model)

    meta_preds_df = meta_preds_df.reset_index(drop=True)
    return meta_preds_df

In [3]:
with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/cv_results.pickle", "rb") as handle:
    cv_predictions = pickle.load(handle)

with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/test_results.pickle", "rb") as handle:
    test_predictions = pickle.load(handle)


In [4]:
COLUMNS_TEST = ['question_id', 'parent_answer_post_id', 'prob', 'input_sequence',
       'output_sequence', 'id', 'snippet_len', 'intent_len', 'snippet_token_n',
       'intent_token_n', 'cluster', 'input_ids', 'attention_mask', 'labels',
       'prediction', 'rouge', 'model_set']

COLUMNS_CV = COLUMNS_TEST.copy()
COLUMNS_CV.append("fold")

#### Preprocessing

In [5]:
cv_predictions = cv_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_CV]
test_predictions = test_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_TEST]

# Code Only

We have 9 base lerner settings models that we compare learning of 1, splitting to two meta models,  all together. 

In [6]:
MODELS_LIST = [0, 1, 2, 5, 10, 'cluster_[1]', 'cluster_[4]', 'cluster_[3]', 'cluster_[0, 1, 4]']
MODE = ["ONE-BY-ONE", "TWO-MODELS", "ALL"]

In [7]:
results_cv_df = pd.DataFrame()

t_models = ["lr", "svm", "lgbm", "catboost"]

for model_base in MODELS_LIST:

    temp_df =  cv_predictions.loc[cv_predictions.model_set==model_base]
    temp_df = cv_step_2(experiment_config=experiment_config,
              cv_df=temp_df)

    for model_meta in t_models:
        for cluster in sorted(temp_df.cluster.unique()):

            print(cluster)
            cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]


            mae = mean_absolute_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_perf_hat"])
            
            rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_perf_hat"]))
            
            t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": cluster, "rmse": rmse, "mae": mae}, index=[0])


            results_cv_df = pd.concat([results_cv_df, t_res], axis=0)
        


    for model_meta in t_models:


        mae = mean_absolute_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_perf_hat"])
        
        rmse = math.sqrt(mean_squared_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_perf_hat"]))
        
        t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": "full", "rmse": rmse, "mae": mae,}, index=[0])


        results_cv_df = pd.concat([results_cv_df, t_res], axis=0)

results_cv_df = results_cv_df.sort_values(["model_meta", "cluster"])


0
lr
svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.113686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.125930
catboost
Learning rate set to 0.052224
0:	learn: 0.1358222	total: 56.1ms	remaining: 56s
1:	learn: 0.1356581	total: 62.4ms	remaining: 31.1s
2:	learn: 0.1355014	total: 68.3ms	remaining: 22.7s
3:	learn: 0.1353616	total: 74.8ms	remaining: 18.6s
4:	learn: 0.1352198	total: 80.8ms	remaining: 16.1s
5:	learn: 0.1350561	total: 86.4ms	remaining: 14.3s
6:	learn: 0.1349760	total: 92.6ms	remaining: 13.1s
7:	learn: 0.1348568	total: 98ms	remaining: 12.1s
8:	learn: 0.1347464	total: 104ms	remaining: 11.4s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


9:	learn: 0.1346470	total: 109ms	remaining: 10.8s
10:	learn: 0.1345521	total: 115ms	remaining: 10.3s
11:	learn: 0.1344898	total: 121ms	remaining: 9.95s
12:	learn: 0.1344055	total: 126ms	remaining: 9.59s
13:	learn: 0.1343192	total: 132ms	remaining: 9.27s
14:	learn: 0.1342655	total: 137ms	remaining: 8.99s
15:	learn: 0.1341750	total: 142ms	remaining: 8.75s
16:	learn: 0.1340632	total: 148ms	remaining: 8.54s
17:	learn: 0.1340416	total: 153ms	remaining: 8.36s
18:	learn: 0.1339812	total: 159ms	remaining: 8.22s
19:	learn: 0.1339590	total: 165ms	remaining: 8.07s
20:	learn: 0.1338644	total: 170ms	remaining: 7.92s
21:	learn: 0.1337826	total: 176ms	remaining: 7.81s
22:	learn: 0.1337359	total: 182ms	remaining: 7.73s
23:	learn: 0.1337133	total: 187ms	remaining: 7.62s
24:	learn: 0.1336781	total: 193ms	remaining: 7.51s
25:	learn: 0.1336040	total: 199ms	remaining: 7.46s
26:	learn: 0.1335776	total: 205ms	remaining: 7.38s
27:	learn: 0.1335173	total: 210ms	remaining: 7.31s
28:	learn: 0.1334442	total: 217m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.051424 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.128259
catboost
Learning rate set to 0.052226
0:	learn: 0.1354522	total: 8.21ms	remaining: 8.2s
1:	learn: 0.1353413	total: 13.8ms	remaining: 6.88s
2:	learn: 0.1352304	total: 19.2ms	remaining: 6.37s
3:	learn: 0.1351174	total: 24.8ms	remaining: 6.16s
4:	learn: 0.1350268	total: 30.3ms	remaining: 6.02s
5:	learn: 0.1349477	total: 35.6ms	remaining: 5.9s
6:	learn: 0.1348705	total: 41ms	remaining: 5.82s
7:	learn: 0.1348488	total: 46.4ms	remaining: 5.75s
8:	learn: 0.1347808	total: 51.8ms	remaining: 5.7s
9:	learn: 0.1346646	total: 57.2ms	remaining: 5.67s
10:	learn: 0.1346370	total: 62.6ms	remaining: 5.63s
11:	learn: 0.1345402	total: 68.1ms	remaining: 5.6s
12

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.917449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.270421
catboost
Learning rate set to 0.052224
0:	learn: 0.1622040	total: 8.12ms	remaining: 8.11s
1:	learn: 0.1620564	total: 13.6ms	remaining: 6.81s
2:	learn: 0.1619225	total: 19.3ms	remaining: 6.42s
3:	learn: 0.1617985	total: 24.9ms	remaining: 6.19s
4:	learn: 0.1616766	total: 30.3ms	remaining: 6.03s
5:	learn: 0.1615065	total: 36.2ms	remaining: 6s
6:	learn: 0.1614535	total: 42.1ms	remaining: 5.98s
7:	learn: 0.1613345	total: 47.6ms	remaining: 5.9s
8:	learn: 0.1612310	total: 53ms	remaining: 5.83s
9:	learn: 0.1611365	total: 58.5ms	remaining: 5.79s
10:	learn: 0.1610600	total: 63.8ms	remaining: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


14:	learn: 0.1607512	total: 85.4ms	remaining: 5.61s
15:	learn: 0.1607053	total: 91.4ms	remaining: 5.62s
16:	learn: 0.1606196	total: 96.8ms	remaining: 5.6s
17:	learn: 0.1605618	total: 102ms	remaining: 5.58s
18:	learn: 0.1605035	total: 108ms	remaining: 5.57s
19:	learn: 0.1604728	total: 114ms	remaining: 5.58s
20:	learn: 0.1604070	total: 119ms	remaining: 5.56s
21:	learn: 0.1603636	total: 125ms	remaining: 5.57s
22:	learn: 0.1603045	total: 131ms	remaining: 5.55s
23:	learn: 0.1602610	total: 136ms	remaining: 5.54s
24:	learn: 0.1602071	total: 142ms	remaining: 5.53s
25:	learn: 0.1601394	total: 147ms	remaining: 5.52s
26:	learn: 0.1600871	total: 153ms	remaining: 5.5s
27:	learn: 0.1600465	total: 159ms	remaining: 5.5s
28:	learn: 0.1599784	total: 164ms	remaining: 5.51s
29:	learn: 0.1599474	total: 170ms	remaining: 5.5s
30:	learn: 0.1599153	total: 175ms	remaining: 5.48s
31:	learn: 0.1599044	total: 181ms	remaining: 5.47s
32:	learn: 0.1598620	total: 186ms	remaining: 5.46s
33:	learn: 0.1598097	total: 192m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.036319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.271194
catboost
Learning rate set to 0.052226
0:	learn: 0.1622103	total: 7.44ms	remaining: 7.43s
1:	learn: 0.1620595	total: 12.9ms	remaining: 6.46s
2:	learn: 0.1619539	total: 18.4ms	remaining: 6.11s
3:	learn: 0.1618694	total: 23.9ms	remaining: 5.94s
4:	learn: 0.1617807	total: 29.1ms	remaining: 5.8s
5:	learn: 0.1617159	total: 34.5ms	remaining: 5.72s
6:	learn: 0.1616192	total: 39.9ms	remaining: 5.66s
7:	learn: 0.1615431	total: 45.2ms	remaining: 5.6s
8:	learn: 0.1614639	total: 50.5ms	remaining: 5.56s
9:	learn: 0.1613781	total: 56ms	remaining: 5.54s
10:	learn: 0.1613241	total: 61.5ms	re

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.061107 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.280744
catboost
Learning rate set to 0.052224
0:	learn: 0.1633455	total: 7.72ms	remaining: 7.72s
1:	learn: 0.1632697	total: 13.4ms	remaining: 6.7s
2:	learn: 0.1631424	total: 19.1ms	remaining: 6.35s
3:	learn: 0.1630199	total: 24.9ms	remaining: 6.2s
4:	learn: 0.1629183	total: 30.2ms	remaining: 6.01s
5:	learn: 0.1628125	total: 35.5ms	remaining: 5.87s
6:	learn: 0.1627484	total: 40.7ms	remaining: 5.77s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


7:	learn: 0.1626621	total: 46.3ms	remaining: 5.75s
8:	learn: 0.1625660	total: 51.8ms	remaining: 5.7s
9:	learn: 0.1624828	total: 57.2ms	remaining: 5.66s
10:	learn: 0.1624090	total: 62.6ms	remaining: 5.63s
11:	learn: 0.1623441	total: 68.1ms	remaining: 5.61s
12:	learn: 0.1622910	total: 73.6ms	remaining: 5.58s
13:	learn: 0.1621864	total: 79.2ms	remaining: 5.58s
14:	learn: 0.1621472	total: 84.7ms	remaining: 5.56s
15:	learn: 0.1621073	total: 90.2ms	remaining: 5.54s
16:	learn: 0.1620402	total: 95.5ms	remaining: 5.52s
17:	learn: 0.1619996	total: 101ms	remaining: 5.51s
18:	learn: 0.1619279	total: 107ms	remaining: 5.5s
19:	learn: 0.1618834	total: 112ms	remaining: 5.5s
20:	learn: 0.1618405	total: 118ms	remaining: 5.48s
21:	learn: 0.1618009	total: 123ms	remaining: 5.46s
22:	learn: 0.1617536	total: 129ms	remaining: 5.46s
23:	learn: 0.1617018	total: 134ms	remaining: 5.46s
24:	learn: 0.1616515	total: 139ms	remaining: 5.44s
25:	learn: 0.1616019	total: 145ms	remaining: 5.44s
26:	learn: 0.1615700	total:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.280123
catboost
Learning rate set to 0.052226
0:	learn: 0.1633249	total: 7.84ms	remaining: 7.83s
1:	learn: 0.1631895	total: 15.6ms	remaining: 7.78s
2:	learn: 0.1630932	total: 21.2ms	remaining: 7.03s
3:	learn: 0.1630386	total: 26.5ms	remaining: 6.61s
4:	learn: 0.1629522	total: 32.2ms	remaining: 6.41s
5:	learn: 0.1628884	total: 37.6ms	remaining: 6.24s
6:	learn: 0.1627972	total: 43.1ms	remaining: 6.11s
7:	learn: 0.1627298	total: 49.3ms	remaining: 6.12s
8:	learn: 0.1626597	total: 55.1ms	remaining: 6.07s
9:	learn: 0.1625592	total: 60.7ms	remaining: 6.01s
10:	learn: 0.1624823	total: 66.2ms	r

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.286097
catboost
Learning rate set to 0.052224
0:	learn: 0.1647460	total: 8.62ms	remaining: 8.61s
1:	learn: 0.1646564	total: 14.5ms	remaining: 7.23s
2:	learn: 0.1645277	total: 20ms	remaining: 6.65s
3:	learn: 0.1644480	total: 26.3ms	remaining: 6.56s
4:	learn: 0.1643577	total: 32.1ms	remaining: 6.38s
5:	learn: 0.1642612	total: 37.7ms	remaining: 6.24s
6:	learn: 0.1641977	total: 43.4ms	remaining: 6.16s
7:	learn: 0.1641131	total: 49ms	remaining: 6.08s
8:	learn: 0.1640114	total: 54.5ms	remaining: 6s
9:	learn: 0.1639356	total: 59.8ms	remaining: 5.92s
10:	learn: 0.1638489	total: 65.2ms	remaining: 5.86s
11:	learn: 0.1638037	total: 70.5ms	remaining: 5.81s
12:	lea

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


13:	learn: 0.1636803	total: 81.6ms	remaining: 5.75s
14:	learn: 0.1636071	total: 87.1ms	remaining: 5.72s
15:	learn: 0.1635638	total: 92.5ms	remaining: 5.69s
16:	learn: 0.1634850	total: 98ms	remaining: 5.67s
17:	learn: 0.1634345	total: 103ms	remaining: 5.64s
18:	learn: 0.1633815	total: 109ms	remaining: 5.62s
19:	learn: 0.1633390	total: 114ms	remaining: 5.6s
20:	learn: 0.1633019	total: 120ms	remaining: 5.58s
21:	learn: 0.1632680	total: 125ms	remaining: 5.57s
22:	learn: 0.1632137	total: 131ms	remaining: 5.56s
23:	learn: 0.1631581	total: 136ms	remaining: 5.54s
24:	learn: 0.1631064	total: 141ms	remaining: 5.52s
25:	learn: 0.1630590	total: 147ms	remaining: 5.5s
26:	learn: 0.1630186	total: 152ms	remaining: 5.49s
27:	learn: 0.1629579	total: 158ms	remaining: 5.48s
28:	learn: 0.1629041	total: 163ms	remaining: 5.47s
29:	learn: 0.1628640	total: 169ms	remaining: 5.46s
30:	learn: 0.1628270	total: 174ms	remaining: 5.45s
31:	learn: 0.1628046	total: 180ms	remaining: 5.44s
32:	learn: 0.1627530	total: 185

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.285399
catboost
Learning rate set to 0.052226
0:	learn: 0.1628445	total: 7.3ms	remaining: 7.29s
1:	learn: 0.1627473	total: 13.2ms	remaining: 6.57s
2:	learn: 0.1626680	total: 18.6ms	remaining: 6.18s
3:	learn: 0.1626438	total: 24ms	remaining: 5.97s
4:	learn: 0.1625893	total: 29.6ms	remaining: 5.88s
5:	learn: 0.1624957	total: 35ms	remaining: 5.8s
6:	learn: 0.1624119	total: 40.3ms	remaining: 5.72s
7:	learn: 0.1623668	total: 45.6ms	remaining: 5.65s
8:	learn: 0.1622924	total: 50.8ms	remaining: 5.6s
9:	learn: 0.1622197	total: 56.3ms	remaining: 5.57s
10:	learn: 0.1621441	total: 61.6ms	remaining: 5.54s
11:	learn: 0.1620739	total: 66.9ms	remaining: 5.51

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.148517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.284876
catboost
Learning rate set to 0.052224
0:	learn: 0.1647540	total: 7.84ms	remaining: 7.84s
1:	learn: 0.1646261	total: 13.3ms	remaining: 6.66s
2:	learn: 0.1644735	total: 19ms	remaining: 6.3s
3:	learn: 0.1643509	total: 24.3ms	remaining: 6.05s
4:	learn: 0.1642464	total: 29.7ms	remaining: 5.91s
5:	learn: 0.1641630	total: 35.1ms	remaining: 5.82s
6:	learn: 0.1640953	total: 40.4ms	remaining: 5.74s
7:	learn: 0.1640151	total: 45.8ms	remaining: 5.68s
8:	learn: 0.1639112	total: 51.3ms	remaining: 5.64s
9:	learn: 0.1638367	total: 56.6ms	remaining: 5.61s
10:	learn: 0.1637188	total: 62.1ms	remainin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


20:	learn: 0.1631107	total: 117ms	remaining: 5.43s
21:	learn: 0.1630628	total: 122ms	remaining: 5.43s
22:	learn: 0.1630224	total: 128ms	remaining: 5.42s
23:	learn: 0.1629780	total: 134ms	remaining: 5.43s
24:	learn: 0.1629392	total: 139ms	remaining: 5.42s
25:	learn: 0.1629007	total: 144ms	remaining: 5.41s
26:	learn: 0.1628662	total: 150ms	remaining: 5.39s
27:	learn: 0.1628242	total: 155ms	remaining: 5.38s
28:	learn: 0.1627734	total: 160ms	remaining: 5.37s
29:	learn: 0.1627332	total: 166ms	remaining: 5.36s
30:	learn: 0.1626977	total: 171ms	remaining: 5.34s
31:	learn: 0.1626598	total: 176ms	remaining: 5.33s
32:	learn: 0.1626250	total: 182ms	remaining: 5.32s
33:	learn: 0.1625870	total: 187ms	remaining: 5.31s
34:	learn: 0.1625434	total: 192ms	remaining: 5.29s
35:	learn: 0.1625257	total: 197ms	remaining: 5.28s
36:	learn: 0.1624858	total: 203ms	remaining: 5.27s
37:	learn: 0.1624437	total: 208ms	remaining: 5.26s
38:	learn: 0.1624164	total: 213ms	remaining: 5.25s
39:	learn: 0.1624043	total: 218

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.990786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.284865
catboost
Learning rate set to 0.052226
0:	learn: 0.1622992	total: 7.47ms	remaining: 7.46s
1:	learn: 0.1621842	total: 13.1ms	remaining: 6.55s
2:	learn: 0.1621032	total: 18.5ms	remaining: 6.15s
3:	learn: 0.1620250	total: 23.9ms	remaining: 5.95s
4:	learn: 0.1619246	total: 29.3ms	remaining: 5.82s
5:	learn: 0.1618797	total: 34.7ms	remaining: 5.74s
6:	learn: 0.1617997	total: 40.1ms	remaining: 5.69s
7:	learn: 0.1617291	total: 45.4ms	remaining: 5.63s
8:	learn: 0.1616757	total: 51ms	remaining: 5.61s
9:	learn: 0.1615741	total: 56.3ms	remaining: 5.58s
10:	learn: 0.1615021	total: 61.7ms	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.068999 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.145822
catboost
Learning rate set to 0.052224
0:	learn: 0.1443853	total: 7.32ms	remaining: 7.31s
1:	learn: 0.1442011	total: 13.1ms	remaining: 6.51s
2:	learn: 0.1440323	total: 18.5ms	remaining: 6.13s
3:	learn: 0.1438655	total: 24.1ms	remaining: 6s
4:	learn: 0.1437240	total: 29.6ms	remaining: 5.88s
5:	learn: 0.1435589	total: 35.1ms	remaining: 5.81s
6:	learn: 0.1434668	total: 40.3ms	remaining: 5.72s
7:	learn: 0.1433495	total: 45.6ms	remaining: 5.66s
8:	learn: 0.1432327	total: 51.2ms	remaining: 5.63s
9:	learn: 0.1431524	total: 56.4ms	remaining: 5.59s
10:	learn: 0.1430331	total: 62.4ms	remaining: 5.61s
11:	learn: 0.1429470	total: 67.7ms	remaining: 5.58s
12:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


14:	learn: 0.1426977	total: 83.9ms	remaining: 5.51s
15:	learn: 0.1425775	total: 89.4ms	remaining: 5.5s
16:	learn: 0.1424739	total: 94.7ms	remaining: 5.48s
17:	learn: 0.1423884	total: 100ms	remaining: 5.47s
18:	learn: 0.1423326	total: 106ms	remaining: 5.46s
19:	learn: 0.1422673	total: 111ms	remaining: 5.44s
20:	learn: 0.1421743	total: 116ms	remaining: 5.43s
21:	learn: 0.1421224	total: 122ms	remaining: 5.42s
22:	learn: 0.1420597	total: 127ms	remaining: 5.4s
23:	learn: 0.1420113	total: 133ms	remaining: 5.39s
24:	learn: 0.1419879	total: 138ms	remaining: 5.38s
25:	learn: 0.1419075	total: 143ms	remaining: 5.37s
26:	learn: 0.1418496	total: 149ms	remaining: 5.36s
27:	learn: 0.1417856	total: 155ms	remaining: 5.37s
28:	learn: 0.1417263	total: 160ms	remaining: 5.36s
29:	learn: 0.1417009	total: 166ms	remaining: 5.35s
30:	learn: 0.1416779	total: 171ms	remaining: 5.34s
31:	learn: 0.1416233	total: 176ms	remaining: 5.34s
32:	learn: 0.1415549	total: 182ms	remaining: 5.33s
33:	learn: 0.1415039	total: 18

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.001196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.147037
catboost
Learning rate set to 0.052226
0:	learn: 0.1431683	total: 7.59ms	remaining: 7.58s
1:	learn: 0.1430417	total: 13.2ms	remaining: 6.58s
2:	learn: 0.1429299	total: 18.5ms	remaining: 6.15s
3:	learn: 0.1428325	total: 23.8ms	remaining: 5.93s
4:	learn: 0.1427233	total: 29.2ms	remaining: 5.82s
5:	learn: 0.1426356	total: 34.7ms	remaining: 5.75s
6:	learn: 0.1425205	total: 40.2ms	remaining: 5.71s
7:	learn: 0.1424228	total: 45.5ms	remaining: 5.65s
8:	learn: 0.1423383	total: 50.9ms	remaining: 5.61s
9:	learn: 0.1422357	total: 56.3ms	remaining: 5.58s
10:	learn: 0.1421613	total: 61.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.585673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.172752
catboost
Learning rate set to 0.052224
0:	learn: 0.1511842	total: 7.05ms	remaining: 7.05s
1:	learn: 0.1509194	total: 12.6ms	remaining: 6.28s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


2:	learn: 0.1506913	total: 18.1ms	remaining: 6.02s
3:	learn: 0.1505024	total: 23.6ms	remaining: 5.88s
4:	learn: 0.1503357	total: 28.9ms	remaining: 5.75s
5:	learn: 0.1501624	total: 34.3ms	remaining: 5.69s
6:	learn: 0.1499989	total: 39.9ms	remaining: 5.67s
7:	learn: 0.1498453	total: 45.4ms	remaining: 5.62s
8:	learn: 0.1496717	total: 50.9ms	remaining: 5.6s
9:	learn: 0.1495748	total: 56.2ms	remaining: 5.57s
10:	learn: 0.1494304	total: 61.5ms	remaining: 5.53s
11:	learn: 0.1492902	total: 67ms	remaining: 5.51s
12:	learn: 0.1491746	total: 72.4ms	remaining: 5.5s
13:	learn: 0.1490176	total: 77.9ms	remaining: 5.49s
14:	learn: 0.1489104	total: 83.2ms	remaining: 5.46s
15:	learn: 0.1487883	total: 88.6ms	remaining: 5.45s
16:	learn: 0.1486866	total: 94.3ms	remaining: 5.45s
17:	learn: 0.1486700	total: 99.6ms	remaining: 5.44s
18:	learn: 0.1485729	total: 105ms	remaining: 5.42s
19:	learn: 0.1484725	total: 110ms	remaining: 5.41s
20:	learn: 0.1483691	total: 116ms	remaining: 5.4s
21:	learn: 0.1483150	total: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.788045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.173461
catboost
Learning rate set to 0.052226
0:	learn: 0.1504195	total: 7.49ms	remaining: 7.49s
1:	learn: 0.1502089	total: 13.1ms	remaining: 6.52s
2:	learn: 0.1500686	total: 18.4ms	remaining: 6.13s
3:	learn: 0.1499005	total: 23.9ms	remaining: 5.95s
4:	learn: 0.1497579	total: 29.4ms	remaining: 5.84s
5:	learn: 0.1496393	total: 34.7ms	remaining: 5.75s
6:	learn: 0.1494883	total: 40.2ms	remaining: 5.71s
7:	learn: 0.1493316	total: 45.7ms	remaining: 5.66s
8:	learn: 0.1492129	total: 51ms	remaining: 5.61s
9:	learn: 0.1490708	total: 56.4ms	remaining: 5.58s
10:	learn: 0.1489549	total: 61.7ms	rem

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.322377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.262215
catboost
Learning rate set to 0.052224
0:	learn: 0.1673214	total: 7.74ms	remaining: 7.73s
1:	learn: 0.1671612	total: 14.9ms	remaining: 7.43s
2:	learn: 0.1669556	total: 20.2ms	remaining: 6.72s
3:	learn: 0.1667533	total: 25.9ms	remaining: 6.46s
4:	learn: 0.1665896	total: 31.4ms	remaining: 6.25s
5:	learn: 0.1664427	total: 36.6ms	remaining: 6.07s
6:	learn: 0.1663285	total: 41.9ms	remaining: 5.94s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


7:	learn: 0.1661819	total: 47.4ms	remaining: 5.88s
8:	learn: 0.1660387	total: 53.1ms	remaining: 5.85s
9:	learn: 0.1659427	total: 58.5ms	remaining: 5.79s
10:	learn: 0.1658327	total: 64ms	remaining: 5.76s
11:	learn: 0.1658134	total: 69.7ms	remaining: 5.73s
12:	learn: 0.1657068	total: 75ms	remaining: 5.7s
13:	learn: 0.1656130	total: 80.5ms	remaining: 5.67s
14:	learn: 0.1655015	total: 85.9ms	remaining: 5.64s
15:	learn: 0.1654168	total: 91.2ms	remaining: 5.61s
16:	learn: 0.1652863	total: 96.7ms	remaining: 5.59s
17:	learn: 0.1652068	total: 102ms	remaining: 5.57s
18:	learn: 0.1651085	total: 107ms	remaining: 5.54s
19:	learn: 0.1650390	total: 113ms	remaining: 5.53s
20:	learn: 0.1650200	total: 118ms	remaining: 5.51s
21:	learn: 0.1649313	total: 123ms	remaining: 5.49s
22:	learn: 0.1648627	total: 129ms	remaining: 5.48s
23:	learn: 0.1647944	total: 134ms	remaining: 5.46s
24:	learn: 0.1647263	total: 139ms	remaining: 5.44s
25:	learn: 0.1646657	total: 145ms	remaining: 5.44s
26:	learn: 0.1646072	total: 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.573290 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.260857
catboost
Learning rate set to 0.052226
0:	learn: 0.1642474	total: 7.71ms	remaining: 7.71s
1:	learn: 0.1641001	total: 13.4ms	remaining: 6.67s
2:	learn: 0.1640108	total: 18.8ms	remaining: 6.25s
3:	learn: 0.1639309	total: 24.3ms	remaining: 6.04s
4:	learn: 0.1637785	total: 29.6ms	remaining: 5.9s
5:	learn: 0.1636625	total: 35.1ms	remaining: 5.81s
6:	learn: 0.1635363	total: 40.4ms	remaining: 5.73s
7:	learn: 0.1634223	total: 45.7ms	remaining: 5.67s
8:	learn: 0.1633138	total: 51.7ms	remaining: 5.69s
9:	learn: 0.1631823	total: 57.1ms	remaining: 5.65s
10:	learn: 0.1630862	total: 62.5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.967213 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.240867
catboost
Learning rate set to 0.052224
0:	learn: 0.1628246	total: 7.21ms	remaining: 7.2s
1:	learn: 0.1627444	total: 12.7ms	remaining: 6.34s
2:	learn: 0.1625821	total: 18ms	remaining: 5.98s
3:	learn: 0.1624773	total: 23.2ms	remaining: 5.78s
4:	learn: 0.1623439	total: 28.7ms	remaining: 5.71s
5:	learn: 0.1622622	total: 34.1ms	remaining: 5.65s
6:	learn: 0.1621856	total: 39.9ms	remaining: 5.66s
7:	learn: 0.1621189	total: 45.2ms	remaining: 5.61s
8:	learn: 0.1620351	total: 50.6ms	remaining: 5.57s
9:	learn: 0.1619383	total: 56.1ms	remaining: 5.55s
10:	learn: 0.1618873	total: 61.6ms	remainin

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


18:	learn: 0.1612994	total: 105ms	remaining: 5.42s
19:	learn: 0.1612333	total: 110ms	remaining: 5.41s
20:	learn: 0.1611748	total: 116ms	remaining: 5.4s
21:	learn: 0.1611151	total: 121ms	remaining: 5.39s
22:	learn: 0.1610568	total: 127ms	remaining: 5.38s
23:	learn: 0.1610132	total: 132ms	remaining: 5.37s
24:	learn: 0.1609707	total: 138ms	remaining: 5.36s
25:	learn: 0.1609201	total: 143ms	remaining: 5.36s
26:	learn: 0.1608764	total: 148ms	remaining: 5.34s
27:	learn: 0.1608244	total: 154ms	remaining: 5.34s
28:	learn: 0.1607710	total: 159ms	remaining: 5.33s
29:	learn: 0.1607213	total: 164ms	remaining: 5.31s
30:	learn: 0.1607049	total: 170ms	remaining: 5.31s
31:	learn: 0.1606748	total: 175ms	remaining: 5.3s
32:	learn: 0.1606168	total: 181ms	remaining: 5.29s
33:	learn: 0.1605790	total: 186ms	remaining: 5.28s
34:	learn: 0.1605330	total: 191ms	remaining: 5.28s
35:	learn: 0.1604894	total: 197ms	remaining: 5.27s
36:	learn: 0.1604434	total: 202ms	remaining: 5.26s
37:	learn: 0.1604040	total: 207ms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.054026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.244362
catboost
Learning rate set to 0.052226
0:	learn: 0.1626861	total: 7.85ms	remaining: 7.84s
1:	learn: 0.1625799	total: 13.3ms	remaining: 6.64s
2:	learn: 0.1624954	total: 18.7ms	remaining: 6.22s
3:	learn: 0.1623893	total: 24.2ms	remaining: 6.02s
4:	learn: 0.1622601	total: 29.7ms	remaining: 5.91s
5:	learn: 0.1621959	total: 35.1ms	remaining: 5.81s
6:	learn: 0.1620828	total: 40.6ms	remaining: 5.76s
7:	learn: 0.1619700	total: 45.9ms	remaining: 5.69s
8:	learn: 0.1618896	total: 51.2ms	remaining: 5.63s
9:	learn: 0.1617762	total: 56.6ms	remaining: 5.6s
10:	learn: 0.1617051	total: 61.9ms	remaining: 5.57s
11:	learn: 0.1616305	total: 67.3ms	remaining: 5.5

In [8]:
print("MAE")
display(results_cv_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe())

MAE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.118702,0.013585,0.094472,0.110511,0.124659,0.128642,0.131838
1,catboost,1,9.0,0.124568,0.018065,0.091059,0.114208,0.129422,0.135969,0.1436
2,catboost,2,9.0,0.123986,0.010899,0.105216,0.112683,0.130097,0.131667,0.132326
3,catboost,3,9.0,0.128527,0.007383,0.114903,0.123806,0.131789,0.133732,0.136306
4,catboost,4,9.0,0.129966,0.012629,0.101976,0.128909,0.132864,0.13607,0.144533
5,catboost,full,9.0,0.125645,0.009843,0.107373,0.117284,0.131591,0.132126,0.132697
6,lgbm,0,9.0,0.117401,0.013617,0.09257,0.110821,0.124408,0.127109,0.130118
7,lgbm,1,9.0,0.121329,0.015229,0.092765,0.115476,0.125191,0.130848,0.140001
8,lgbm,2,9.0,0.123957,0.010929,0.104743,0.113392,0.130213,0.131641,0.132044
9,lgbm,3,9.0,0.128401,0.006966,0.115004,0.123805,0.130787,0.132393,0.13623


In [9]:
print("RMSE")
display(results_cv_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe())

RMSE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.146854,0.017553,0.112115,0.142296,0.152608,0.158619,0.164319
1,catboost,1,9.0,0.15634,0.02336,0.112536,0.138759,0.168495,0.169469,0.18086
2,catboost,2,9.0,0.153934,0.013872,0.13064,0.139461,0.161766,0.164074,0.164443
3,catboost,3,9.0,0.160873,0.008764,0.144274,0.156638,0.163208,0.167278,0.17085
4,catboost,4,9.0,0.16073,0.015719,0.125793,0.158634,0.163668,0.167928,0.178489
5,catboost,full,9.0,0.156539,0.012165,0.133906,0.146998,0.164123,0.164427,0.165436
6,lgbm,0,9.0,0.14581,0.016009,0.112056,0.144008,0.151588,0.157188,0.159157
7,lgbm,1,9.0,0.153347,0.021138,0.111786,0.141441,0.162259,0.165784,0.177934
8,lgbm,2,9.0,0.153793,0.013755,0.130136,0.140757,0.16163,0.163766,0.16414
9,lgbm,3,9.0,0.16026,0.008563,0.143615,0.155861,0.162463,0.165851,0.170334


In [11]:
results_test_df = pd.DataFrame()

t_models = ["lr", "svm", "lgbm", "catboost"]

for model_base in MODELS_LIST:

    cv_temp_df =  cv_predictions.loc[cv_predictions.model_set==model_base]
    temp_df =  test_predictions.loc[test_predictions.model_set==model_base]
    full_step_2(experiment_config=experiment_config,
                            cv_df=cv_temp_df)
    temp_df = meta_predict(experiment_config=experiment_config,
                            test_df=temp_df,
                            base_models_names=[model_base],
                            t_models=t_models)

    for model_meta in t_models:
        for cluster in sorted(temp_df.cluster.unique()):

            print(cluster)
            cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]


            mae = mean_absolute_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_preds"])
            
            rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_preds"]))
            
            t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": cluster, "rmse": rmse, "mae": mae}, index=[0])

            results_test_df = pd.concat([results_test_df, t_res], axis=0)
        
    for model_meta in t_models:


        mae = mean_absolute_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_preds"])
        
        rmse = math.sqrt(mean_squared_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_preds"]))
        
        t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": "full", "rmse": rmse, "mae": mae,}, index=[0])

        results_test_df = pd.concat([results_test_df, t_res], axis=0)


results_test_df = results_test_df.sort_values(["model_meta", "cluster"])


lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.070208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4443
[LightGBM] [Info] Number of data points in the train set: 7000, number of used features: 207
[LightGBM] [Info] Start training from score 0.126339
catboost
Learning rate set to 0.055681
0:	learn: 0.1355712	total: 9.17ms	remaining: 9.16s
1:	learn: 0.1354194	total: 16.7ms	remaining: 8.34s
2:	learn: 0.1352577	total: 23.5ms	remaining: 7.8s
3:	learn: 0.1351338	total: 30.5ms	remaining: 7.6s
4:	learn: 0.1349898	total: 37.3ms	remaining: 7.42s
5:	learn: 0.1348265	total: 44.1ms	remaining: 7.31s
6:	learn: 0.1347318	total: 50.8ms	remaining: 7.2s
7:	learn: 0.1346181	total: 57.4ms	remaining: 7.11s
8:	learn: 0.1345409	total: 64ms	remaining: 7.04s
9:	learn: 0.1344137	total: 70.7ms	remaining: 7s
10:	learn: 0.1343242	total: 77.4ms	remain

In [12]:
print("MAE")
display(results_test_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe())

MAE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.099943,0.012294,0.082306,0.093802,0.099587,0.110344,0.114731
1,catboost,1,9.0,0.125461,0.022718,0.091342,0.11076,0.126465,0.137644,0.164268
2,catboost,2,9.0,0.121976,0.01002,0.101762,0.116272,0.121297,0.129501,0.132367
3,catboost,3,9.0,0.128567,0.008831,0.109071,0.125337,0.129826,0.135653,0.137317
4,catboost,4,9.0,0.12578,0.010436,0.101919,0.126765,0.128117,0.130919,0.138723
5,catboost,full,9.0,0.125402,0.009876,0.102227,0.127252,0.128291,0.129254,0.136591
6,lgbm,0,9.0,0.101503,0.011873,0.085727,0.092795,0.098006,0.113473,0.115499
7,lgbm,1,9.0,0.12813,0.017924,0.102463,0.126106,0.126733,0.136146,0.159502
8,lgbm,2,9.0,0.122381,0.009781,0.101824,0.118742,0.123296,0.129533,0.133236
9,lgbm,3,9.0,0.128404,0.008755,0.109629,0.127575,0.129813,0.133205,0.137177


In [13]:
print("RMSE")
display(results_test_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe())

RMSE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.120759,0.014657,0.097468,0.105388,0.123114,0.132817,0.135928
1,catboost,1,9.0,0.150873,0.021973,0.119166,0.139747,0.147317,0.15646,0.196427
2,catboost,2,9.0,0.14908,0.014553,0.116408,0.14721,0.149288,0.159773,0.16277
3,catboost,3,9.0,0.160826,0.010634,0.136926,0.158415,0.161251,0.168358,0.17199
4,catboost,4,9.0,0.155109,0.0132,0.124062,0.156147,0.157498,0.15959,0.173144
5,catboost,full,9.0,0.154675,0.012724,0.123981,0.15666,0.158203,0.158757,0.170306
6,lgbm,0,9.0,0.123899,0.011081,0.103186,0.120836,0.123776,0.133226,0.139913
7,lgbm,1,9.0,0.155761,0.020467,0.129412,0.144882,0.150646,0.171278,0.19459
8,lgbm,2,9.0,0.149613,0.014395,0.117437,0.147992,0.150723,0.159092,0.165071
9,lgbm,3,9.0,0.159787,0.010295,0.138624,0.158627,0.159997,0.167016,0.170951
