In [1]:
import sys
sys.path.append("../")
import numpy as np
import time
import pandas as pd
import pickle
import math
from typing import Tuple


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.sparse import hstack

ANALYSIS_POSTFIX = "mined_sudden_2024-08-26"

experiment_config = {
    "RS" : 42,
    "ANALYSIS_POSTFIX": ANALYSIS_POSTFIX
}

In [2]:
def step_two(experiment_config, 
             X_train,
             y_train,
             model,
             X_val=None,
             y_val=None,
             save=False): 
    
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    
    training_start_time = time.time()
    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model =="svm": 
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        reg = RandomForestRegressor.fit(X_train, y_train)
    elif model=="lgbm":
        reg = LGBMRegressor(max_depth=10, silent=True)
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)
    training_end_time = time.time()
    time_training = training_end_time - training_start_time

    

    if save:
        with open(f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl','wb') as f:
            pickle.dump(reg, f)
        return f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
    
    else:
        inference_start_time = time.time()
        y_pred = reg.predict(X_val)
        inference_end_time = time.time()
        time_inference = inference_end_time - inference_start_time

        y_pred[y_pred<0] = 0
        mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
        rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
        return {"pred": y_pred, "mae": mae, "rmse": rmse, "time_training" : time_training, "time_inference" : time_inference}
    

def cv_step_2(experiment_config:dict, cv_df:pd.DataFrame) -> Tuple:

    t_models = ["lr", "svm", "lgbm", "catboost"]

    results = {}


    for test_fold in range(cv_df.fold.max()+1):
        print(test_fold)

        # Prepare the input data
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.fold!=test_fold, "input_sequence"])
        X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold!=test_fold, "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_train = hstack([X_train_column_sparse, X_train_tfidf])
        y_train = cv_df.loc[cv_df.fold!=test_fold, "rouge"]
        
        X_val_tfidf = vectorizer.transform(cv_df.loc[cv_df.fold==test_fold, "input_sequence"])
        X_val_column_sparse = pd.get_dummies(cv_df.loc[cv_df.fold==test_fold, "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_val = hstack([X_val_column_sparse, X_val_tfidf])
        y_val = cv_df.loc[cv_df.fold==test_fold, "rouge"]

        results[test_fold] = {}
        for model in t_models:
            print(model)
            preds_df = step_two(experiment_config=experiment_config,
                                X_train=X_train,
                                y_train=y_train,
                                X_val=X_val,
                                y_val=y_val,
                                model=model)
            cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
            results[test_fold][model] = preds_df

    cv_df = cv_df.reset_index(drop=True)

    return cv_df

def full_step_2(cv_df:pd.DataFrame,
                experiment_config:dict) -> None:
    
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    # TRAIN ON ALL PREDICTIONS AT ONCE

    t_models = ["lr", "svm", "lgbm", "catboost"]

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[cv_df.model_set!="ensemble", "input_sequence"])
    X_train_column_sparse = pd.get_dummies(cv_df.loc[cv_df.model_set!="ensemble", "model_set"], sparse=True).sparse.to_coo().tocsr()
    X_train = hstack([X_train_column_sparse, X_train_tfidf])
    y_train = cv_df.loc[cv_df.model_set!="ensemble", "rouge"]
        
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
        pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL) 
        
    for model in t_models:
        print(model)
        preds_df = step_two(experiment_config=experiment_config,
                            X_train=X_train,
                            y_train=y_train,
                            model=model,
                            save=True)

In [3]:
with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/cv_results.pickle", "rb") as handle:
    cv_predictions = pickle.load(handle)

with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/test_results.pickle", "rb") as handle:
    test_predictions = pickle.load(handle)


In [4]:
COLUMNS_TEST = ['question_id', 'parent_answer_post_id', 'prob', 'input_sequence',
       'output_sequence', 'id', 'snippet_len', 'intent_len', 'snippet_token_n',
       'intent_token_n', 'cluster', 'input_ids', 'attention_mask', 'labels',
       'prediction', 'rouge', 'model_set']

COLUMNS_CV = COLUMNS_TEST.copy()
COLUMNS_CV.append("fold")

#### Preprocessing

In [5]:
cv_predictions = cv_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_CV]
test_predictions = test_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_TEST]

# Code Only

We have 9 base lerner settings models that we compare learning of 1, splitting to two meta models,  all together. 

In [6]:
MODELS_LIST = [0, 1, 2, 5, 10, 'cluster_[1]', 'cluster_[4]', 'cluster_[3]', 'cluster_[0, 1, 4]']
MODE = ["ONE-BY-ONE", "TWO-MODELS", "ALL"]

In [7]:
results_df = pd.DataFrame()

t_models = ["lr", "svm", "lgbm", "catboost"]

for model_base in MODELS_LIST:

    temp_df =  cv_predictions.loc[cv_predictions.model_set==model_base]
    temp_df = cv_step_2(experiment_config=experiment_config,
              cv_df=temp_df)

    for model_meta in t_models:
        for cluster in sorted(temp_df.cluster.unique()):

            print(cluster)
            cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]


            mae = mean_absolute_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_perf_hat"])
            
            rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df.loc[:, "rouge"],
                                        y_pred=cluster_temp_df.loc[:, f"{model_meta}_perf_hat"]))
            
            t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": cluster, "rmse": rmse, "mae": mae}, index=[0])


            results_df = pd.concat([results_df, t_res], axis=0)
        


    for model_meta in t_models:


        mae = mean_absolute_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_perf_hat"])
        
        rmse = math.sqrt(mean_squared_error(y_true=temp_df.loc[:, "rouge"],
                                        y_pred=temp_df.loc[:, f"{model_meta}_perf_hat"]))
        
        t_res = pd.DataFrame(data={"model_base": model_base, "model_meta": model_meta, "cluster": "full", "rmse": rmse, "mae": mae,}, index=[0])


        results_df = pd.concat([results_df, t_res], axis=0)

    break

results_df = results_df.sort_values(["model_meta", "cluster"])


0
lr
svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003610 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.125930
catboost
Learning rate set to 0.052224
0:	learn: 0.1358222	total: 65.3ms	remaining: 1m 5s
1:	learn: 0.1356581	total: 71.1ms	remaining: 35.5s
2:	learn: 0.1355014	total: 76.9ms	remaining: 25.5s
3:	learn: 0.1353616	total: 83.6ms	remaining: 20.8s
4:	learn: 0.1352198	total: 89.7ms	remaining: 17.9s
5:	learn: 0.1350561	total: 96ms	remaining: 15.9s
6:	learn: 0.1349760	total: 102ms	remaining: 14.5s
7:	learn: 0.1348568	total: 108ms	remaining: 13.3s
8:	learn: 0.1347464	total: 113ms	remaining: 12.5s
9:	learn: 0.1346470	total: 119ms	remaining: 11.8s
10:	learn: 0.1345521	total: 125ms	remaining: 11.2s
11:	learn: 0.1344898	total: 131ms	remaining: 10.8s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


12:	learn: 0.1344055	total: 137ms	remaining: 10.4s
13:	learn: 0.1343192	total: 143ms	remaining: 10.1s
14:	learn: 0.1342655	total: 150ms	remaining: 9.85s
15:	learn: 0.1341750	total: 156ms	remaining: 9.59s
16:	learn: 0.1340632	total: 162ms	remaining: 9.37s
17:	learn: 0.1340416	total: 168ms	remaining: 9.17s
18:	learn: 0.1339812	total: 174ms	remaining: 9s
19:	learn: 0.1339590	total: 180ms	remaining: 8.83s
20:	learn: 0.1338644	total: 187ms	remaining: 8.72s
21:	learn: 0.1337826	total: 193ms	remaining: 8.58s
22:	learn: 0.1337359	total: 199ms	remaining: 8.44s
23:	learn: 0.1337133	total: 204ms	remaining: 8.31s
24:	learn: 0.1336781	total: 210ms	remaining: 8.2s
25:	learn: 0.1336040	total: 217ms	remaining: 8.12s
26:	learn: 0.1335776	total: 223ms	remaining: 8.03s
27:	learn: 0.1335173	total: 229ms	remaining: 7.95s
28:	learn: 0.1334442	total: 235ms	remaining: 7.86s
29:	learn: 0.1334211	total: 241ms	remaining: 7.78s
30:	learn: 0.1333734	total: 246ms	remaining: 7.7s
31:	learn: 0.1333214	total: 252ms	re

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cv_df.loc[cv_df.fold==test_fold, f"{model}_perf_hat"] = preds_df["pred"]


1
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003491 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2597
[LightGBM] [Info] Number of data points in the train set: 4667, number of used features: 136
[LightGBM] [Info] Start training from score 0.128259
catboost
Learning rate set to 0.052226
0:	learn: 0.1354522	total: 7.69ms	remaining: 7.69s
1:	learn: 0.1353413	total: 13.6ms	remaining: 6.78s
2:	learn: 0.1352304	total: 19.1ms	remaining: 6.34s
3:	learn: 0.1351174	total: 24.8ms	remaining: 6.18s
4:	learn: 0.1350268	total: 30.5ms	remaining: 6.08s
5:	learn: 0.1349477	total: 36ms	remaining: 5.97s
6:	learn: 0.1348705	total: 41.9ms	remaining: 5.95s
7:	learn: 0.1348488	total: 47.8ms	remaining: 5.93s
8:	learn: 0.1347808	total: 53.6ms	remaining: 5.91s
9:	learn: 0.1346646	total: 59.3ms	remaining: 5.87s
10:	learn: 0.1346370	total: 64.9ms	remaining: 5.84s
11:	learn: 0.1345402	total: 70.6ms	remaining:

In [8]:
print("MAE")
display(results_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe())

MAE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,1.0,0.094472,,0.094472,0.094472,0.094472,0.094472,0.094472
1,catboost,1,1.0,0.091059,,0.091059,0.091059,0.091059,0.091059,0.091059
2,catboost,2,1.0,0.105216,,0.105216,0.105216,0.105216,0.105216,0.105216
3,catboost,3,1.0,0.114903,,0.114903,0.114903,0.114903,0.114903,0.114903
4,catboost,4,1.0,0.101976,,0.101976,0.101976,0.101976,0.101976,0.101976
5,catboost,full,1.0,0.107373,,0.107373,0.107373,0.107373,0.107373,0.107373
6,lgbm,0,1.0,0.09257,,0.09257,0.09257,0.09257,0.09257,0.09257
7,lgbm,1,1.0,0.092765,,0.092765,0.092765,0.092765,0.092765,0.092765
8,lgbm,2,1.0,0.104743,,0.104743,0.104743,0.104743,0.104743,0.104743
9,lgbm,3,1.0,0.115004,,0.115004,0.115004,0.115004,0.115004,0.115004


In [9]:
print("RMSE")
display(results_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe())

RMSE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,1.0,0.112115,,0.112115,0.112115,0.112115,0.112115,0.112115
1,catboost,1,1.0,0.112536,,0.112536,0.112536,0.112536,0.112536,0.112536
2,catboost,2,1.0,0.13064,,0.13064,0.13064,0.13064,0.13064,0.13064
3,catboost,3,1.0,0.144274,,0.144274,0.144274,0.144274,0.144274,0.144274
4,catboost,4,1.0,0.125793,,0.125793,0.125793,0.125793,0.125793,0.125793
5,catboost,full,1.0,0.133906,,0.133906,0.133906,0.133906,0.133906,0.133906
6,lgbm,0,1.0,0.112056,,0.112056,0.112056,0.112056,0.112056,0.112056
7,lgbm,1,1.0,0.111786,,0.111786,0.111786,0.111786,0.111786,0.111786
8,lgbm,2,1.0,0.130136,,0.130136,0.130136,0.130136,0.130136,0.130136
9,lgbm,3,1.0,0.143615,,0.143615,0.143615,0.143615,0.143615,0.143615
