In [1]:
import sys
sys.path.append("../")
import numpy as np
import time
import pandas as pd
import pickle
import math
from typing import Tuple


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.sparse import hstack

ANALYSIS_POSTFIX = "mined_sudden_2024-08-26"

experiment_config = {
    "RS" : 42,
    "ANALYSIS_POSTFIX": ANALYSIS_POSTFIX,
    "FEATURE_MODE" : "CODE", # CODE_MODEL
}

In [2]:
def step_two(experiment_config, 
             X_train,
             y_train,
             model,
             X_val=None,
             y_val=None,
             save=False): 
    
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    
    training_start_time = time.time()
    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor(max_depth=10, silent=True)
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)
    training_end_time = time.time()
    time_training = training_end_time - training_start_time

    
    if save:
        with open(f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl','wb') as f:
            pickle.dump(reg, f)
        return f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
    
    else:
        inference_start_time = time.time()
        y_pred = reg.predict(X_val)
        inference_end_time = time.time()
        time_inference = inference_end_time - inference_start_time

        y_pred[y_pred<0] = 0
        mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
        rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
        return {"pred": y_pred, "mae": mae, "rmse": rmse, "time_training" : time_training, "time_inference" : time_inference}
    

def cv_step_2(experiment_config:dict, cv_df:pd.DataFrame) -> Tuple:

    t_models = ["lr", "svm", "lgbm", "catboost"]

    results = {}

    FEATURE_MODE = experiment_config["FEATURE_MODE"]

    for test_fold in range(cv_df.fold.max()+1):
        print(test_fold)

        # Prepare the input data
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])

        if FEATURE_MODE=="CODE_MODEL":
            X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold!=test_fold, "model_set"], sparse=True).sparse.to_coo().tocsr()
            X_train = hstack([X_train_column_sparse, X_train_tfidf])
        elif FEATURE_MODE=="CODE":
            X_train = X_train_tfidf
            
        y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]
        
        X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
        if FEATURE_MODE=="CODE_MODEL":
            X_val_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold==test_fold, "model_set"], sparse=True).sparse.to_coo().tocsr()
            X_val = hstack([X_val_column_sparse, X_val_tfidf])
        elif FEATURE_MODE=="CODE":
            X_val = X_val_tfidf
            
        y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

        results[test_fold] = {}
        for model in t_models:
            print(model)
            preds_df = step_two(experiment_config=experiment_config,
                                X_train=X_train,
                                y_train=y_train,
                                X_val=X_val,
                                y_val=y_val,
                                model=model)
            cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
            results[test_fold][model] = preds_df

    cv_df = cv_df.reset_index(drop=True)

    return cv_df

def full_step_2(cv_df:pd.DataFrame,
                experiment_config:dict) -> None:
    
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    # TRAIN ON ALL PREDICTIONS AT ONCE

    t_models = ["lr", "svm", "lgbm", "catboost"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.model_set!="ensemble", "input_sequence"])
    if FEATURE_MODE=="CODE_MODEL":
        X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.model_set!="ensemble", "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_train = hstack([X_train_column_sparse, X_train_tfidf])
    elif FEATURE_MODE=="CODE":
        X_train = X_train_tfidf
        
    y_train = cv_df.loc[cv_df.model_set!="ensemble", "rouge"]
        
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
        pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL) 
        
    for model in t_models:
        print(model)
        preds_df = step_two(experiment_config=experiment_config,
                            X_train=X_train,
                            y_train=y_train,
                            model=model,
                            save=True)
        
def pred_perf(experiment_config,
              X,
              model): 

    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]

    with open(f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl','rb') as f:
            reg = pickle.load(f)

    y_pred = reg.predict(X)
    y_pred[y_pred<0] = 0
    return y_pred

def meta_predict(experiment_config:dict, 
                 test_df: pd.DataFrame,
                 base_models_names: list,
                 t_models:list = ["svm", "catboost"]) -> pd.DataFrame:

    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]
    
    for model_i, model_set in enumerate(base_models_names):

        set_df = test_df.copy()
        set_df["model_set"] = model_set
        # Prepare the input data
        with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "rb") as file:
            vectorizer = pickle.load(file)

        if model_i==0:
            meta_preds_df = set_df.copy()
        else: 
            meta_preds_df = pd.concat([meta_preds_df, set_df])
            
    X_test_tfidf = vectorizer.transform(meta_preds_df.loc[:, "input_sequence"])
    if FEATURE_MODE=="CODE_MODEL":
        X_test_column_sparse = pd.get_dummies(meta_preds_df.loc[:, "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_test = hstack([X_test_column_sparse, X_test_tfidf])
    elif FEATURE_MODE=="CODE":
        X_test = X_test_tfidf

    for model in t_models:
        print(model)
        meta_preds_df[f"{model}_preds"] = pred_perf(experiment_config=experiment_config, 
                                                    X=X_test,
                                                    model=model)

    meta_preds_df = meta_preds_df.reset_index(drop=True)
    return meta_preds_df

In [3]:
with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/cv_results.pickle", "rb") as handle:
    cv_predictions = pickle.load(handle)

with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/test_results.pickle", "rb") as handle:
    test_predictions = pickle.load(handle)


In [4]:
COLUMNS_TEST = ['question_id', 'parent_answer_post_id', 'prob', 'input_sequence',
       'output_sequence', 'id', 'snippet_len', 'intent_len', 'snippet_token_n',
       'intent_token_n', 'cluster', 'input_ids', 'attention_mask', 'labels',
       'prediction', 'rouge', 'model_set']

COLUMNS_CV = COLUMNS_TEST.copy()
COLUMNS_CV.append("fold")

#### Preprocessing

In [5]:
cv_predictions = cv_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_CV]
test_predictions = test_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_TEST]

# Code Only

We have 9 base lerner settings models that we compare learning of 1, splitting to two meta models,  all together. 

In [6]:
MODELS_LIST = [0, 1, 2, 5, 10, 'cluster_[1]', 'cluster_[4]', 'cluster_[3]', 'cluster_[0, 1, 4]']
MODE = ["ONE-BY-ONE", "TWO-MODELS", "ALL"]

In [7]:
results_cv_df = pd.DataFrame()

t_models = ["lr", "svm", "lgbm", "catboost"]

for model_base in MODELS_LIST:

    temp_df =  cv_predictions.loc[cv_predictions.model_set==model_base]
    temp_df = cv_step_2(experiment_config=experiment_config,
              cv_df=temp_df)

    for model_meta in t_models:
        for cluster in sorted(temp_df.cluster.unique()):

            print(cluster)
            cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]


            mae = mean_absolute_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_perf_hat"])
            
            rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_perf_hat"]))
            
            t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": cluster, "rmse": rmse, "mae": mae}, index=[0])


            results_cv_df = pd.concat([results_cv_df, t_res], axis=0)
        


    for model_meta in t_models:


        mae = mean_absolute_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_perf_hat"])
        
        rmse = math.sqrt(mean_squared_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_perf_hat"]))
        
        t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": "full", "rmse": rmse, "mae": mae,}, index=[0])


        results_cv_df = pd.concat([results_cv_df, t_res], axis=0)

results_cv_df = results_cv_df.sort_values(["model_meta", "cluster"])


0
lr
svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.112004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.125930
catboost
Learning rate set to 0.052224


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


0:	learn: 0.1358222	total: 54.6ms	remaining: 54.5s
1:	learn: 0.1356581	total: 60.3ms	remaining: 30.1s
2:	learn: 0.1355014	total: 65.7ms	remaining: 21.8s
3:	learn: 0.1353616	total: 71ms	remaining: 17.7s
4:	learn: 0.1352198	total: 76.5ms	remaining: 15.2s
5:	learn: 0.1350561	total: 82.1ms	remaining: 13.6s
6:	learn: 0.1349760	total: 87.5ms	remaining: 12.4s
7:	learn: 0.1348568	total: 92.7ms	remaining: 11.5s
8:	learn: 0.1347464	total: 97.7ms	remaining: 10.8s
9:	learn: 0.1346470	total: 103ms	remaining: 10.2s
10:	learn: 0.1345521	total: 109ms	remaining: 9.77s
11:	learn: 0.1344898	total: 114ms	remaining: 9.39s
12:	learn: 0.1344055	total: 119ms	remaining: 9.07s
13:	learn: 0.1343192	total: 125ms	remaining: 8.81s
14:	learn: 0.1342655	total: 130ms	remaining: 8.56s
15:	learn: 0.1341750	total: 136ms	remaining: 8.36s
16:	learn: 0.1340632	total: 141ms	remaining: 8.17s
17:	learn: 0.1340416	total: 147ms	remaining: 8s
18:	learn: 0.1339812	total: 152ms	remaining: 7.85s
19:	learn: 0.1339590	total: 158ms	rem

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.035206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.128259
catboost
Learning rate set to 0.052226
0:	learn: 0.1354522	total: 7.2ms	remaining: 7.19s
1:	learn: 0.1353413	total: 12.7ms	remaining: 6.33s
2:	learn: 0.1352304	total: 18.3ms	remaining: 6.07s
3:	learn: 0.1351174	total: 23.5ms	remaining: 5.86s
4:	learn: 0.1350268	total: 28.9ms	remaining: 5.75s
5:	learn: 0.1349477	total: 34.4ms	remaining: 5.7s
6:	learn: 0.1348705	total: 39.8ms	remaining: 5.64s
7:	learn: 0.1348488	total: 45.1ms	remaining: 5.6s
8:	learn: 0.1347808	total: 50.9ms	remaining: 5.61s
9:	learn: 0.1346646	total: 56.3ms	remaining: 5.57s
10:	learn: 0.1346370	total: 61.7ms	remaining: 5.54s
11:	learn: 0.1345402	total: 66.9ms	remaining: 5.51s

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.033739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.270421
catboost
Learning rate set to 0.052224


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


0:	learn: 0.1622040	total: 7.23ms	remaining: 7.23s
1:	learn: 0.1620564	total: 12.9ms	remaining: 6.44s
2:	learn: 0.1619225	total: 18.7ms	remaining: 6.21s
3:	learn: 0.1617985	total: 24.1ms	remaining: 5.99s
4:	learn: 0.1616766	total: 29.6ms	remaining: 5.88s
5:	learn: 0.1615065	total: 34.9ms	remaining: 5.79s
6:	learn: 0.1614535	total: 40.4ms	remaining: 5.73s
7:	learn: 0.1613345	total: 45.7ms	remaining: 5.67s
8:	learn: 0.1612310	total: 51.2ms	remaining: 5.63s
9:	learn: 0.1611365	total: 56.7ms	remaining: 5.61s
10:	learn: 0.1610600	total: 62ms	remaining: 5.57s
11:	learn: 0.1609792	total: 67.3ms	remaining: 5.54s
12:	learn: 0.1609035	total: 72.6ms	remaining: 5.51s
13:	learn: 0.1607941	total: 78.1ms	remaining: 5.5s
14:	learn: 0.1607512	total: 83.5ms	remaining: 5.48s
15:	learn: 0.1607053	total: 89.1ms	remaining: 5.48s
16:	learn: 0.1606196	total: 94.6ms	remaining: 5.47s
17:	learn: 0.1605618	total: 99.9ms	remaining: 5.45s
18:	learn: 0.1605035	total: 105ms	remaining: 5.44s
19:	learn: 0.1604728	total

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.046183 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.271194
catboost
Learning rate set to 0.052226
0:	learn: 0.1622103	total: 7.76ms	remaining: 7.75s
1:	learn: 0.1620595	total: 13.3ms	remaining: 6.64s
2:	learn: 0.1619539	total: 18.8ms	remaining: 6.25s
3:	learn: 0.1618694	total: 24.2ms	remaining: 6.02s
4:	learn: 0.1617807	total: 29.6ms	remaining: 5.89s
5:	learn: 0.1617159	total: 34.9ms	remaining: 5.78s
6:	learn: 0.1616192	total: 40.1ms	remaining: 5.7s
7:	learn: 0.1615431	total: 45.4ms	remaining: 5.63s
8:	learn: 0.1614639	total: 50.6ms	remaining: 5.58s
9:	learn: 0.1613781	total: 55.9ms	remaining: 5.54s
10:	learn: 0.1613241	total: 61.4ms	remaining: 5.52s
11:	learn: 0.1612361	total: 66.7ms	remaining

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.088205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.280744
catboost
Learning rate set to 0.052224
0:	learn: 0.1633455	total: 7.49ms	remaining: 7.48s
1:	learn: 0.1632697	total: 13ms	remaining: 6.49s
2:	learn: 0.1631424	total: 18.5ms	remaining: 6.14s
3:	learn: 0.1630199	total: 23.8ms	remaining: 5.93s
4:	learn: 0.1629183	total: 29.2ms	remaining: 5.81s
5:	learn: 0.1628125	total: 34.5ms	remaining: 5.71s
6:	learn: 0.1627484	total: 39.9ms	remaining: 5.66s
7:	learn: 0.1626621	total: 45.5ms	remaining: 5.64s
8:	learn: 0.1625660	total: 51.1ms	remaining: 5.62s
9:	learn: 0.1624828	total: 56.5ms	remaining: 5.6s
10:	learn: 0.1624090	total: 62.3ms	remaining: 5.6s
11:	learn: 0.1623441	total: 67.6ms	remaining: 5.57s
12:	

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


17:	learn: 0.1619996	total: 100ms	remaining: 5.48s
18:	learn: 0.1619279	total: 106ms	remaining: 5.47s
19:	learn: 0.1618834	total: 111ms	remaining: 5.46s
20:	learn: 0.1618405	total: 117ms	remaining: 5.44s
21:	learn: 0.1618009	total: 122ms	remaining: 5.43s
22:	learn: 0.1617536	total: 128ms	remaining: 5.42s
23:	learn: 0.1617018	total: 133ms	remaining: 5.41s
24:	learn: 0.1616515	total: 139ms	remaining: 5.4s
25:	learn: 0.1616019	total: 144ms	remaining: 5.39s
26:	learn: 0.1615700	total: 149ms	remaining: 5.38s
27:	learn: 0.1615324	total: 155ms	remaining: 5.37s
28:	learn: 0.1615042	total: 160ms	remaining: 5.36s
29:	learn: 0.1614699	total: 165ms	remaining: 5.35s
30:	learn: 0.1614335	total: 171ms	remaining: 5.34s
31:	learn: 0.1614166	total: 176ms	remaining: 5.33s
32:	learn: 0.1613722	total: 181ms	remaining: 5.32s
33:	learn: 0.1613445	total: 187ms	remaining: 5.31s
34:	learn: 0.1613069	total: 192ms	remaining: 5.3s
35:	learn: 0.1612852	total: 198ms	remaining: 5.29s
36:	learn: 0.1612657	total: 203ms

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.484973 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.280123
catboost
Learning rate set to 0.052226
0:	learn: 0.1633249	total: 6.91ms	remaining: 6.9s
1:	learn: 0.1631895	total: 12.3ms	remaining: 6.16s
2:	learn: 0.1630932	total: 17.8ms	remaining: 5.92s
3:	learn: 0.1630386	total: 23.2ms	remaining: 5.78s
4:	learn: 0.1629522	total: 28.6ms	remaining: 5.69s
5:	learn: 0.1628884	total: 33.9ms	remaining: 5.62s
6:	learn: 0.1627972	total: 39.3ms	remaining: 5.57s
7:	learn: 0.1627298	total: 44.7ms	remaining: 5.54s
8:	learn: 0.1626597	total: 50ms	remaining: 5.51s
9:	learn: 0.1625592	total: 55.4ms	remaining: 5.48s
10:	learn: 0.1624823	total: 60.8ms	rema

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.989710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.286097
catboost


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


Learning rate set to 0.052224
0:	learn: 0.1647460	total: 7.62ms	remaining: 7.62s
1:	learn: 0.1646564	total: 13.3ms	remaining: 6.61s
2:	learn: 0.1645277	total: 18.7ms	remaining: 6.22s
3:	learn: 0.1644480	total: 24.2ms	remaining: 6.02s
4:	learn: 0.1643577	total: 29.6ms	remaining: 5.88s
5:	learn: 0.1642612	total: 34.9ms	remaining: 5.79s
6:	learn: 0.1641977	total: 40.5ms	remaining: 5.75s
7:	learn: 0.1641131	total: 46ms	remaining: 5.7s
8:	learn: 0.1640114	total: 51.5ms	remaining: 5.68s
9:	learn: 0.1639356	total: 57.1ms	remaining: 5.66s
10:	learn: 0.1638489	total: 62.7ms	remaining: 5.64s
11:	learn: 0.1638037	total: 68.2ms	remaining: 5.62s
12:	learn: 0.1637586	total: 73.6ms	remaining: 5.59s
13:	learn: 0.1636803	total: 79ms	remaining: 5.57s
14:	learn: 0.1636071	total: 84.6ms	remaining: 5.55s
15:	learn: 0.1635638	total: 89.8ms	remaining: 5.53s
16:	learn: 0.1634850	total: 95.2ms	remaining: 5.5s
17:	learn: 0.1634345	total: 100ms	remaining: 5.48s
18:	learn: 0.1633815	total: 106ms	remaining: 5.47s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012317 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.285399
catboost
Learning rate set to 0.052226
0:	learn: 0.1628445	total: 7.14ms	remaining: 7.13s
1:	learn: 0.1627473	total: 12.7ms	remaining: 6.33s
2:	learn: 0.1626680	total: 18.1ms	remaining: 6s
3:	learn: 0.1626438	total: 24.2ms	remaining: 6.03s
4:	learn: 0.1625893	total: 29.8ms	remaining: 5.92s
5:	learn: 0.1624957	total: 35.1ms	remaining: 5.82s
6:	learn: 0.1624119	total: 40.6ms	remaining: 5.76s
7:	learn: 0.1623668	total: 46.2ms	remaining: 5.73s
8:	learn: 0.1622924	total: 51.4ms	remaining: 5.66s
9:	learn: 0.1622197	total: 57.1ms	remaining: 5.66s
10:	learn: 0.1621441	total: 62.7ms	remaining: 5.64s
11:	learn: 0.1620739	total: 68.2ms	remaining: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.091138 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.284876
catboost
Learning rate set to 0.052224
0:	learn: 0.1647540	total: 7.74ms	remaining: 7.73s
1:	learn: 0.1646261	total: 13.2ms	remaining: 6.56s
2:	learn: 0.1644735	total: 18.8ms	remaining: 6.25s
3:	learn: 0.1643509	total: 24.2ms	remaining: 6.02s
4:	learn: 0.1642464	total: 30.5ms	remaining: 6.07s
5:	learn: 0.1641630	total: 35.8ms	remaining: 5.93s
6:	learn: 0.1640953	total: 41.1ms	remaining: 5.83s
7:	learn: 0.1640151	total: 46.4ms	remaining: 5.75s
8:	learn: 0.1639112	total: 51.9ms	remaining: 5.72s
9:	learn: 0.1638367	total: 57.3ms	remaining: 5.67s
10:	learn: 0.1637188	total: 62.4ms	remai

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


16:	learn: 0.1633128	total: 94.8ms	remaining: 5.48s
17:	learn: 0.1632679	total: 100ms	remaining: 5.46s
18:	learn: 0.1631983	total: 105ms	remaining: 5.45s
19:	learn: 0.1631545	total: 111ms	remaining: 5.42s
20:	learn: 0.1631107	total: 116ms	remaining: 5.4s
21:	learn: 0.1630628	total: 121ms	remaining: 5.38s
22:	learn: 0.1630224	total: 126ms	remaining: 5.37s
23:	learn: 0.1629780	total: 132ms	remaining: 5.35s
24:	learn: 0.1629392	total: 137ms	remaining: 5.34s
25:	learn: 0.1629007	total: 142ms	remaining: 5.33s
26:	learn: 0.1628662	total: 148ms	remaining: 5.32s
27:	learn: 0.1628242	total: 153ms	remaining: 5.31s
28:	learn: 0.1627734	total: 158ms	remaining: 5.29s
29:	learn: 0.1627332	total: 163ms	remaining: 5.28s
30:	learn: 0.1626977	total: 168ms	remaining: 5.26s
31:	learn: 0.1626598	total: 174ms	remaining: 5.26s
32:	learn: 0.1626250	total: 179ms	remaining: 5.25s
33:	learn: 0.1625870	total: 184ms	remaining: 5.23s
34:	learn: 0.1625434	total: 189ms	remaining: 5.22s
35:	learn: 0.1625257	total: 195

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.903018 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.284865
catboost
Learning rate set to 0.052226
0:	learn: 0.1622992	total: 7.58ms	remaining: 7.57s
1:	learn: 0.1621842	total: 13ms	remaining: 6.49s
2:	learn: 0.1621032	total: 18.4ms	remaining: 6.1s
3:	learn: 0.1620250	total: 23.8ms	remaining: 5.92s
4:	learn: 0.1619246	total: 29.2ms	remaining: 5.82s
5:	learn: 0.1618797	total: 34.6ms	remaining: 5.72s
6:	learn: 0.1617997	total: 40ms	remaining: 5.67s
7:	learn: 0.1617291	total: 45.3ms	remaining: 5.62s
8:	learn: 0.1616757	total: 50.7ms	remaining: 5.58s
9:	learn: 0.1615741	total: 56.1ms	remaining: 5.55s
10:	learn: 0.1615021	total: 61.6ms	rem

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025515 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.145822
catboost


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


Learning rate set to 0.052224
0:	learn: 0.1443853	total: 7.54ms	remaining: 7.54s
1:	learn: 0.1442011	total: 13ms	remaining: 6.48s
2:	learn: 0.1440323	total: 18.5ms	remaining: 6.16s
3:	learn: 0.1438655	total: 24ms	remaining: 5.97s
4:	learn: 0.1437240	total: 29.3ms	remaining: 5.83s
5:	learn: 0.1435589	total: 34.7ms	remaining: 5.74s
6:	learn: 0.1434668	total: 40.1ms	remaining: 5.68s
7:	learn: 0.1433495	total: 45.5ms	remaining: 5.64s
8:	learn: 0.1432327	total: 50.9ms	remaining: 5.6s
9:	learn: 0.1431524	total: 56.2ms	remaining: 5.56s
10:	learn: 0.1430331	total: 61.7ms	remaining: 5.55s
11:	learn: 0.1429470	total: 67.1ms	remaining: 5.52s
12:	learn: 0.1428630	total: 72.5ms	remaining: 5.5s
13:	learn: 0.1427604	total: 77.9ms	remaining: 5.48s
14:	learn: 0.1426977	total: 83.1ms	remaining: 5.46s
15:	learn: 0.1425775	total: 88.4ms	remaining: 5.44s
16:	learn: 0.1424739	total: 93.7ms	remaining: 5.42s
17:	learn: 0.1423884	total: 99.1ms	remaining: 5.4s
18:	learn: 0.1423326	total: 104ms	remaining: 5.39s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.538480 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.147037
catboost
Learning rate set to 0.052226
0:	learn: 0.1431683	total: 7.4ms	remaining: 7.39s
1:	learn: 0.1430417	total: 13ms	remaining: 6.5s
2:	learn: 0.1429299	total: 18.4ms	remaining: 6.12s
3:	learn: 0.1428325	total: 23.8ms	remaining: 5.93s
4:	learn: 0.1427233	total: 29.3ms	remaining: 5.83s
5:	learn: 0.1426356	total: 35.5ms	remaining: 5.88s
6:	learn: 0.1425205	total: 41.2ms	remaining: 5.84s
7:	learn: 0.1424228	total: 46.8ms	remaining: 5.8s
8:	learn: 0.1423383	total: 52.2ms	remaining: 5.75s
9:	learn: 0.1422357	total: 57.5ms	remaining: 5.69s
10:	learn: 0.1421613	total: 62.9ms	remaining: 5.65s
11:	learn: 0.1421391	total: 68.2ms	remaining: 5.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.105931 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.172752
catboost
Learning rate set to 0.052224
0:	learn: 0.1511842	total: 7.51ms	remaining: 7.5s
1:	learn: 0.1509194	total: 13.2ms	remaining: 6.58s
2:	learn: 0.1506913	total: 18.6ms	remaining: 6.17s
3:	learn: 0.1505024	total: 24ms	remaining: 5.97s
4:	learn: 0.1503357	total: 29.4ms	remaining: 5.85s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


5:	learn: 0.1501624	total: 34.9ms	remaining: 5.78s
6:	learn: 0.1499989	total: 40.3ms	remaining: 5.72s
7:	learn: 0.1498453	total: 45.8ms	remaining: 5.68s
8:	learn: 0.1496717	total: 51.2ms	remaining: 5.63s
9:	learn: 0.1495748	total: 56.7ms	remaining: 5.61s
10:	learn: 0.1494304	total: 62.2ms	remaining: 5.59s
11:	learn: 0.1492902	total: 67.6ms	remaining: 5.57s
12:	learn: 0.1491746	total: 73.1ms	remaining: 5.55s
13:	learn: 0.1490176	total: 78.5ms	remaining: 5.53s
14:	learn: 0.1489104	total: 84ms	remaining: 5.51s
15:	learn: 0.1487883	total: 89.3ms	remaining: 5.49s
16:	learn: 0.1486866	total: 94.7ms	remaining: 5.47s
17:	learn: 0.1486700	total: 100ms	remaining: 5.46s
18:	learn: 0.1485729	total: 106ms	remaining: 5.45s
19:	learn: 0.1484725	total: 111ms	remaining: 5.43s
20:	learn: 0.1483691	total: 116ms	remaining: 5.42s
21:	learn: 0.1483150	total: 121ms	remaining: 5.4s
22:	learn: 0.1482365	total: 127ms	remaining: 5.38s
23:	learn: 0.1481597	total: 132ms	remaining: 5.37s
24:	learn: 0.1480659	total:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.608032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.173461
catboost
Learning rate set to 0.052226
0:	learn: 0.1504195	total: 7.18ms	remaining: 7.18s
1:	learn: 0.1502089	total: 12.7ms	remaining: 6.33s
2:	learn: 0.1500686	total: 18.2ms	remaining: 6.04s
3:	learn: 0.1499005	total: 23.6ms	remaining: 5.88s
4:	learn: 0.1497579	total: 29.1ms	remaining: 5.79s
5:	learn: 0.1496393	total: 34.4ms	remaining: 5.7s
6:	learn: 0.1494883	total: 39.8ms	remaining: 5.65s
7:	learn: 0.1493316	total: 46.3ms	remaining: 5.75s
8:	learn: 0.1492129	total: 51.7ms	remaining: 5.7s
9:	learn: 0.1490708	total: 57.2ms	remaining: 5.66s
10:	learn: 0.1489549	total: 62.7ms	remaining: 5.64s
11:	learn: 0.1487909	total: 68.1ms	remaining:

Exception ignored on calling ctypes callback function: <function _log_callback at 0x7f4d22c39990>
Traceback (most recent call last):
  File "/home/RDC/zinovyee.hub/.conda/envs/ensemble/lib/python3.10/site-packages/lightgbm/basic.py", line 224, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


No further splits with positive gain, best gain: -inf
catboost
Learning rate set to 0.052226
0:	learn: 0.1511796	total: 7.84ms	remaining: 7.83s
1:	learn: 0.1509729	total: 13.4ms	remaining: 6.68s
2:	learn: 0.1508082	total: 18.9ms	remaining: 6.29s
3:	learn: 0.1506285	total: 24.5ms	remaining: 6.11s
4:	learn: 0.1504702	total: 30.3ms	remaining: 6.04s
5:	learn: 0.1502596	total: 36ms	remaining: 5.97s
6:	learn: 0.1501161	total: 41.6ms	remaining: 5.9s
7:	learn: 0.1499289	total: 47ms	remaining: 5.83s
8:	learn: 0.1497772	total: 52.7ms	remaining: 5.8s
9:	learn: 0.1496126	total: 58.3ms	remaining: 5.77s
10:	learn: 0.1494831	total: 63.8ms	remaining: 5.74s
11:	learn: 0.1493637	total: 69.3ms	remaining: 5.7s
12:	learn: 0.1492648	total: 74.7ms	remaining: 5.67s
13:	learn: 0.1491490	total: 80.2ms	remaining: 5.65s
14:	learn: 0.1490580	total: 85.7ms	remaining: 5.63s
15:	learn: 0.1489673	total: 91.2ms	remaining: 5.61s
16:	learn: 0.1488929	total: 96.7ms	remaining: 5.59s
17:	learn: 0.1488035	total: 102ms	remain

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm


In [None]:
print("MAE")
display(results_cv_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe())

MAE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.118702,0.013585,0.094472,0.110511,0.124659,0.128642,0.131838
1,catboost,1,9.0,0.124568,0.018065,0.091059,0.114208,0.129422,0.135969,0.1436
2,catboost,2,9.0,0.123986,0.010899,0.105216,0.112683,0.130097,0.131667,0.132326
3,catboost,3,9.0,0.128527,0.007383,0.114903,0.123806,0.131789,0.133732,0.136306
4,catboost,4,9.0,0.129966,0.012629,0.101976,0.128909,0.132864,0.13607,0.144533
5,catboost,full,9.0,0.125645,0.009843,0.107373,0.117284,0.131591,0.132126,0.132697
6,lgbm,0,9.0,0.117401,0.013617,0.09257,0.110821,0.124408,0.127109,0.130118
7,lgbm,1,9.0,0.121329,0.015229,0.092765,0.115476,0.125191,0.130848,0.140001
8,lgbm,2,9.0,0.123957,0.010929,0.104743,0.113392,0.130213,0.131641,0.132044
9,lgbm,3,9.0,0.128401,0.006966,0.115004,0.123805,0.130787,0.132393,0.13623


In [None]:
print("RMSE")
display(results_cv_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe())

RMSE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.146854,0.017553,0.112115,0.142296,0.152608,0.158619,0.164319
1,catboost,1,9.0,0.15634,0.02336,0.112536,0.138759,0.168495,0.169469,0.18086
2,catboost,2,9.0,0.153934,0.013872,0.13064,0.139461,0.161766,0.164074,0.164443
3,catboost,3,9.0,0.160873,0.008764,0.144274,0.156638,0.163208,0.167278,0.17085
4,catboost,4,9.0,0.16073,0.015719,0.125793,0.158634,0.163668,0.167928,0.178489
5,catboost,full,9.0,0.156539,0.012165,0.133906,0.146998,0.164123,0.164427,0.165436
6,lgbm,0,9.0,0.14581,0.016009,0.112056,0.144008,0.151588,0.157188,0.159157
7,lgbm,1,9.0,0.153347,0.021138,0.111786,0.141441,0.162259,0.165784,0.177934
8,lgbm,2,9.0,0.153793,0.013755,0.130136,0.140757,0.16163,0.163766,0.16414
9,lgbm,3,9.0,0.16026,0.008563,0.143615,0.155861,0.162463,0.165851,0.170334


In [None]:
temp_df

In [None]:
results_test_df = pd.DataFrame()

t_models = ["lr", "svm", "lgbm", "catboost"]

for model_base in MODELS_LIST:

    cv_temp_df =  cv_predictions.loc[cv_predictions.model_set==model_base]
    temp_df =  test_predictions.loc[test_predictions.model_set==model_base]
    full_step_2(experiment_config=experiment_config,
                            cv_df=cv_temp_df)
    temp_df = meta_predict(experiment_config=experiment_config,
                            test_df=temp_df,
                            base_models_names=[model_base],
                            t_models=t_models)

    for model_meta in t_models:
        for cluster in sorted(temp_df.cluster.unique()):

            print(cluster)
            cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]


            mae = mean_absolute_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_preds"])
            
            rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_preds"]))
            
            t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": cluster, "rmse": rmse, "mae": mae}, index=[0])

            results_test_df = pd.concat([results_test_df, t_res], axis=0)
        
    for model_meta in t_models:


        mae = mean_absolute_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_preds"])
        
        rmse = math.sqrt(mean_squared_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_preds"]))
        
        t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": "full", "rmse": rmse, "mae": mae,}, index=[0])

        results_test_df = pd.concat([results_test_df, t_res], axis=0)


results_test_df = results_test_df.sort_values(["model_meta", "cluster"])


In [None]:
print("MAE")
display(results_test_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe())

In [None]:
print("RMSE")
display(results_test_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe())