In [9]:
import sys
sys.path.append("../")
import numpy as np
import time
import pandas as pd
import pickle
import math
from typing import Tuple


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.sparse import hstack

# Tag of the experiment run; it is embedded in the result and model file paths.
ANALYSIS_POSTFIX = "mined_no_drift_2024-09-09"

# Settings handed to every step-two helper below.
experiment_config = dict(
    RS=42,
    ANALYSIS_POSTFIX=ANALYSIS_POSTFIX,
    # "CODE_MODEL": TF-IDF of the input plus one-hot model_set; "CODE": TF-IDF only.
    FEATURE_MODE="CODE_MODEL",
)

# Names of the meta-regressors to fit (each is a branch of step_two).
t_models = [
    "lr",
    "svm",
    "lgbm",
    "catboost",
]

In [10]:
def step_two(experiment_config, 
             X_train,
             y_train,
             model,
             X_val=None,
             y_val=None,
             save=False): 
    """Fit one meta-regressor, then either pickle it or score it on a validation split.

    Parameters
    ----------
    experiment_config : dict
        Must contain "ANALYSIS_POSTFIX"; the value is used in the saved model's
        file name.
    X_train, y_train
        Training features and targets, passed unchanged to the estimator's ``fit``.
    model : str
        One of "lr", "svm", "rf", "lgbm", "catboost".
    X_val, y_val
        Validation features and targets. Required when ``save`` is False.
    save : bool
        When True the fitted estimator is pickled under ``./models/`` and no
        validation is performed.

    Returns
    -------
    str or dict
        With ``save=True``: the path of the pickle file that was written.
        Otherwise a dict with keys "pred" (predictions with negatives set to 0),
        "mae", "rmse", "time_training" and "time_inference" (both in seconds).

    Raises
    ------
    ValueError
        If ``model`` is not a supported name, or if ``save`` is False and
        ``X_val`` / ``y_val`` is missing.
    """
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]

    # Fail fast on bad arguments instead of hitting an unbound `reg` after the
    # (possibly long) training step.
    supported_models = ("lr", "svm", "rf", "lgbm", "catboost")
    if model not in supported_models:
        raise ValueError(f"Unknown model {model!r}; expected one of {supported_models}.")
    if not save and (X_val is None or y_val is None):
        raise ValueError("X_val and y_val are required when save=False.")

    training_start_time = time.time()
    if model=="lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model=="svm":
        reg = SVR().fit(X_train, y_train)
    elif model=="rf":
        # The estimator must be instantiated; `fit` cannot be called on the class.
        reg = RandomForestRegressor().fit(X_train, y_train)
    elif model=="lgbm":
        # NOTE(review): recent LightGBM releases replaced `silent` with
        # `verbose`/`verbosity` - confirm against the installed version.
        reg = LGBMRegressor(max_depth=10, silent=True)
        reg.fit(X=X_train, y=y_train)
    elif model=="catboost":
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)
    training_end_time = time.time()
    time_training = training_end_time - training_start_time

    if save:
        model_path = f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(reg, f)
        return model_path

    inference_start_time = time.time()
    y_pred = reg.predict(X_val)
    inference_end_time = time.time()
    time_inference = inference_end_time - inference_start_time

    # Negative predictions are floored at 0 before scoring.
    y_pred[y_pred<0] = 0
    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse, "time_training" : time_training, "time_inference" : time_inference}
    

def cv_step_2(experiment_config:dict, cv_df:pd.DataFrame, t_models:list = ["lr", "svm", "lgbm", "catboost"]) -> pd.DataFrame:
    """Produce out-of-fold meta-model predictions for the rows of ``cv_df``.

    For every fold id in ``range(cv_df.fold.max() + 1)`` a TF-IDF vectorizer is
    fitted on the "input_sequence" of the other folds, each meta-model in
    ``t_models`` is trained on those rows against "rouge", and its predictions
    for the held-out fold are written to the column ``f"{model}_perf_hat"``.

    Parameters
    ----------
    experiment_config : dict
        Must contain "FEATURE_MODE": "CODE" uses TF-IDF features only,
        "CODE_MODEL" prepends a one-hot encoding of "model_set". The dict is
        also forwarded to ``step_two``.
    cv_df : pd.DataFrame
        Needs the columns "fold", "input_sequence", "rouge" and, for
        "CODE_MODEL", "model_set". Prediction columns are added to this frame
        in place. Fold ids are presumably consecutive integers starting at 0 -
        verify against the caller.
    t_models : list
        Meta-model names understood by ``step_two``.

    Returns
    -------
    pd.DataFrame
        ``cv_df`` with the prediction columns and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If "FEATURE_MODE" is neither "CODE_MODEL" nor "CODE".
    """
    FEATURE_MODE = experiment_config["FEATURE_MODE"]
    if FEATURE_MODE not in ("CODE_MODEL", "CODE"):
        raise ValueError(f"Unknown FEATURE_MODE {FEATURE_MODE!r}; expected 'CODE_MODEL' or 'CODE'.")

    use_model_set = FEATURE_MODE=="CODE_MODEL"
    if use_model_set:
        # Fix the one-hot layout once, from the whole frame. Encoding each split
        # on its own could yield a different number or order of columns for
        # train and validation, silently misaligning the features.
        model_set_dtype = pd.Categorical(cv_df["model_set"]).dtype

    def _one_hot_model_set(rows):
        """Sparse one-hot encoding of "model_set" with a fixed column layout."""
        encoded = pd.get_dummies(rows["model_set"].astype(model_set_dtype), sparse=True)
        return encoded.sparse.to_coo().tocsr()

    for test_fold in range(cv_df.fold.max()+1):
        print(test_fold)

        is_val = cv_df.fold==test_fold
        train_rows = cv_df.loc[~is_val]
        val_rows = cv_df.loc[is_val]

        # The vectorizer is fitted on the training folds only.
        vectorizer = TfidfVectorizer()
        X_train = vectorizer.fit_transform(train_rows["input_sequence"])
        X_val = vectorizer.transform(val_rows["input_sequence"])

        if use_model_set:
            X_train = hstack([_one_hot_model_set(train_rows), X_train])
            X_val = hstack([_one_hot_model_set(val_rows), X_val])

        y_train = train_rows["rouge"]
        y_val = val_rows["rouge"]

        for model in t_models:
            print(model)
            preds_df = step_two(experiment_config=experiment_config,
                                X_train=X_train,
                                y_train=y_train,
                                X_val=X_val,
                                y_val=y_val,
                                model=model)
            cv_df.loc[is_val, f"{model}_perf_hat"] = preds_df["pred"]

    cv_df = cv_df.reset_index(drop=True)

    return cv_df

def full_step_2(cv_df:pd.DataFrame,
                experiment_config:dict,
               t_models:list = ["lr", "svm", "lgbm", "catboost"]) -> None:
    """Fit the vectorizer and every meta-model on all non-ensemble rows and pickle them.

    The TF-IDF vectorizer is written to ``./models/vectorizer_<postfix>.pkl``;
    each meta-model is trained and saved through ``step_two(..., save=True)``.
    Rows whose "model_set" equals "ensemble" are excluded from training.
    """
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]

    # Train on every prediction at once, leaving out the ensemble rows.
    train_rows = cv_df.loc[cv_df.model_set!="ensemble"]

    vectorizer = TfidfVectorizer()
    text_features = vectorizer.fit_transform(train_rows["input_sequence"])

    if FEATURE_MODE=="CODE":
        X_train = text_features
    elif FEATURE_MODE=="CODE_MODEL":
        # One-hot model_set columns go in front of the TF-IDF columns.
        model_set_features = pd.get_dummies(train_rows["model_set"], sparse=True).sparse.to_coo().tocsr()
        X_train = hstack([model_set_features, text_features])

    print(X_train.shape)
    y_train = train_rows["rouge"]

    # Persist the fitted vectorizer so the same vocabulary can be reused later.
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
        pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL)

    for model in t_models:
        print(model)
        step_two(experiment_config=experiment_config,
                 X_train=X_train,
                 y_train=y_train,
                 model=model,
                 save=True)
        
def pred_perf(experiment_config,
              X,
              model): 
    """Load the pickled regressor for ``model`` and predict on ``X``.

    The regressor is read from ``./models/reg_<model>_<postfix>.pkl``, where the
    postfix is ``experiment_config["ANALYSIS_POSTFIX"]``. Negative predictions
    are replaced by 0 before the array is returned.
    """
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    model_path = f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'

    with open(model_path, 'rb') as model_file:
        regressor = pickle.load(model_file)

    predictions = regressor.predict(X)
    is_negative = predictions < 0
    predictions[is_negative] = 0
    return predictions

def meta_predict(experiment_config:dict, 
                 test_df: pd.DataFrame,
                 base_models_names: list,
                 t_models:list = ["lr", "svm", "lgbm", "catboost"]) -> pd.DataFrame:
    """Predict, with every saved meta-model, the score of each base model on each test row.

    ``test_df`` is replicated once per entry of ``base_models_names`` with
    "model_set" overwritten by that entry. The stacked frame is featurised with
    the pickled TF-IDF vectorizer (plus a one-hot "model_set" in "CODE_MODEL"
    mode) and scored by each meta-model through ``pred_perf``.

    Parameters
    ----------
    experiment_config : dict
        Must contain "ANALYSIS_POSTFIX" (selects the pickles under ``./models/``)
        and "FEATURE_MODE" ("CODE_MODEL" or "CODE").
    test_df : pd.DataFrame
        Needs an "input_sequence" column; it is not modified.
    base_models_names : list
        Base-model settings to generate rows for; must not be empty.
    t_models : list
        Meta-model names whose pickles are loaded.

    Returns
    -------
    pd.DataFrame
        The stacked frame with one ``f"{model}_preds"`` column per meta-model
        and a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``base_models_names`` is empty or "FEATURE_MODE" is not recognised.
    """
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]

    if FEATURE_MODE not in ("CODE_MODEL", "CODE"):
        raise ValueError(f"Unknown FEATURE_MODE {FEATURE_MODE!r}; expected 'CODE_MODEL' or 'CODE'.")
    if len(base_models_names) == 0:
        raise ValueError("base_models_names must contain at least one base model.")

    # Load the vectorizer once rather than on every loop iteration.
    # pickle.load can execute arbitrary code: only load files this project wrote.
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "rb") as file:
        vectorizer = pickle.load(file)

    # One copy of the test rows per base-model setting, stacked in a single concat.
    per_model_frames = []
    for model_set in base_models_names:
        set_df = test_df.copy()
        set_df["model_set"] = model_set
        per_model_frames.append(set_df)
    meta_preds_df = pd.concat(per_model_frames)

    X_test_tfidf = vectorizer.transform(meta_preds_df.loc[:, "input_sequence"])
    if FEATURE_MODE=="CODE_MODEL":
        # NOTE(review): the one-hot column order produced here has to match the
        # order seen when the models were trained in full_step_2 - confirm that
        # base_models_names lists the settings accordingly.
        X_test_column_sparse = pd.get_dummies(meta_preds_df.loc[:, "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_test = hstack([X_test_column_sparse, X_test_tfidf])
    else:
        X_test = X_test_tfidf

    print(X_test.shape)

    for model in t_models:
        print(model)
        meta_preds_df[f"{model}_preds"] = pred_perf(experiment_config=experiment_config, 
                                                    X=X_test,
                                                    model=model)

    meta_preds_df = meta_preds_df.reset_index(drop=True)
    return meta_preds_df

In [11]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Cross-validation and test results saved under this run's results directory.
cv_predictions = _load_pickle(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/cv_results.pickle")
test_predictions = _load_pickle(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/test_results.pickle")


In [12]:
# Columns kept from the test-set predictions.
COLUMNS_TEST = [
    'question_id', 'parent_answer_post_id', 'prob',
    'input_sequence', 'output_sequence', 'id',
    'snippet_len', 'intent_len', 'snippet_token_n', 'intent_token_n',
    'cluster', 'input_ids', 'attention_mask', 'labels',
    'prediction', 'rouge', 'model_set',
]

# The CV predictions keep the same columns plus the fold id.
COLUMNS_CV = [*COLUMNS_TEST, "fold"]

#### Preprocessing

In [13]:
# Drop the "ensemble" rows and keep only the columns used below.
# Each frame is filtered with a mask built from its own model_set column: a mask
# taken from cv_predictions does not describe the rows of test_predictions.
cv_predictions = cv_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_CV]
test_predictions = test_predictions.loc[test_predictions.model_set!="ensemble", COLUMNS_TEST]

In [14]:
# List the distinct model_set values present in the CV predictions.
cv_predictions.model_set.unique()

array([0, 1, 2, 5, 10, 'cluster_[0]', 'cluster_[3]', 'cluster_[0, 3]'],
      dtype=object)

# Code Only

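We have 9 base-learner settings. We compare three ways of training the meta-models: one meta-model per setting, the settings split between two meta-models, and a single meta-model trained on all settings together.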

In [15]:
# Base-model settings passed to meta_predict as `base_models_names`.
MODELS_LIST = [
    0, 1, 2, 5, 10,
    'cluster_[0]', 'cluster_[3]', 'cluster_[0, 3]',
]

# Names of the meta-model training modes described in the section text.
MODE = [
    "ONE-BY-ONE",
    "TWO-MODELS",
    "ALL",
]

In [16]:
# Accumulator for the CV summary rows appended by the following cells.
results_cv_df = pd.DataFrame()

# cv_step_2 adds prediction columns to the frame it receives, so pass a copy
# and keep cv_predictions untouched.
temp_df = cv_step_2(cv_df=cv_predictions.copy(),
                    experiment_config=experiment_config,
                    t_models=t_models)

0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029655 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13018
[LightGBM] [Info] Number of data points in the train set: 37328, number of used features: 1024
[LightGBM] [Info] Start training from score 0.245488
catboost
Learning rate set to 0.072537
0:	learn: 0.1639212	total: 66.1ms	remaining: 1m 5s
1:	learn: 0.1629041	total: 73.9ms	remaining: 36.9s
2:	learn: 0.1620000	total: 80.5ms	remaining: 26.8s
3:	learn: 0.1611945	total: 87.2ms	remaining: 21.7s
4:	learn: 0.1605028	total: 95ms	remaining: 18.9s
5:	learn: 0.1598611	total: 102ms	remaining: 16.9s
6:	learn: 0.1593115	total: 109ms	remaining: 15.4s
7:	learn: 0.1588391	total: 115ms	remaining: 14.3s
8:	learn: 0.1584001	total: 122ms	remaining: 13.5s
9:	learn: 0.1580357	total: 130ms	remaining: 12.9s
10:	learn: 0.1577346	total: 137ms	remaining: 12.3s
11:	learn: 0.1574196	total: 144ms	remaining: 11.

In [17]:
# Per-cluster CV error of every meta-model: MAE/RMSE pooled over all rows of the
# cluster, plus the standard deviation of the same metrics computed fold by fold.
for model_meta in t_models:
    pred_col = f"{model_meta}_perf_hat"

    for cluster in sorted(temp_df.cluster.unique()):

        print(cluster)
        cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]

        # Pooled metrics over the whole cluster.
        mae = mean_absolute_error(y_true=cluster_temp_df["rouge"],
                                  y_pred=cluster_temp_df[pred_col])
        rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df["rouge"],
                                            y_pred=cluster_temp_df[pred_col]))

        # Same metrics per fold; only their spread is reported.
        fold_frames = [cluster_temp_df.loc[cluster_temp_df.fold==fold, :]
                       for fold in temp_df.fold.unique()]
        folds_mae = [mean_absolute_error(y_true=fold_frame["rouge"],
                                         y_pred=fold_frame[pred_col])
                     for fold_frame in fold_frames]
        folds_rmse = [math.sqrt(mean_squared_error(y_true=fold_frame["rouge"],
                                                   y_pred=fold_frame[pred_col]))
                      for fold_frame in fold_frames]

        t_res = pd.DataFrame(
            data={
                "model_base": "all",
                "model_meta": model_meta,
                "cluster": cluster,
                "rmse_mean": rmse,
                "rmse_std": np.std(folds_rmse),
                "mae_mean": mae,
                "mae_std": np.std(folds_mae),
            },
            index=[0],
        )

        results_cv_df = pd.concat([results_cv_df, t_res], axis=0)

0
1
2
3
0
1
2
3
0
1
2
3
0
1
2
3


In [18]:
# Overall CV error of every meta-model (all clusters together): pooled MAE/RMSE
# plus the standard deviation of the per-fold values, stored under cluster "full".
for model_meta in t_models:
    pred_col = f"{model_meta}_perf_hat"

    # Metrics computed fold by fold; only their spread is reported.
    fold_frames = [temp_df.loc[temp_df.fold==fold, :] for fold in temp_df.fold.unique()]
    folds_mae = [mean_absolute_error(y_true=fold_frame["rouge"],
                                     y_pred=fold_frame[pred_col])
                 for fold_frame in fold_frames]
    folds_rmse = [math.sqrt(mean_squared_error(y_true=fold_frame["rouge"],
                                               y_pred=fold_frame[pred_col]))
                  for fold_frame in fold_frames]

    # Metrics pooled over every row.
    mae = mean_absolute_error(y_true=temp_df["rouge"], y_pred=temp_df[pred_col])
    rmse = math.sqrt(mean_squared_error(y_true=temp_df["rouge"], y_pred=temp_df[pred_col]))

    t_res = pd.DataFrame(
        data={
            "model_base": "all",
            "model_meta": model_meta,
            "cluster": "full",
            "rmse_mean": rmse,
            "rmse_std": np.std(folds_rmse),
            "mae_mean": mae,
            "mae_std": np.std(folds_mae),
        },
        index=[0],
    )

    results_cv_df = pd.concat([results_cv_df, t_res], axis=0)

results_cv_df = results_cv_df.sort_values(["model_meta", "cluster"])

In [19]:
# Display the CV summary table (sorted by meta-model and cluster).
results_cv_df

Unnamed: 0,model_base,model_meta,cluster,rmse_mean,rmse_std,mae_mean,mae_std
0,all,catboost,0,0.169868,0.004299,0.137541,0.002681
0,all,catboost,1,0.160995,0.00169,0.129049,0.001406
0,all,catboost,2,0.153527,0.004155,0.122136,0.003238
0,all,catboost,3,0.139629,0.004045,0.109859,0.002707
0,all,catboost,full,0.158382,0.002115,0.126522,0.001571
0,all,lgbm,0,0.168741,0.004867,0.136713,0.002981
0,all,lgbm,1,0.160543,0.001079,0.128744,0.000783
0,all,lgbm,2,0.153673,0.003339,0.122226,0.00304
0,all,lgbm,3,0.139103,0.004282,0.109334,0.002676
0,all,lgbm,full,0.157891,0.001554,0.126157,0.001231


In [20]:
# Accumulator for the test-set summary rows.
results_test_df = pd.DataFrame()

# Fit the vectorizer and meta-models on the CV predictions (written to ./models),
# then load them back to predict on the test rows for every base-model setting.
cv_temp_df = cv_predictions.copy()
full_step_2(cv_df=cv_temp_df,
            experiment_config=experiment_config,
            t_models=t_models)
temp_df = meta_predict(test_df=test_predictions.copy(),
                       experiment_config=experiment_config,
                       base_models_names=MODELS_LIST,
                       t_models=t_models)

# Test error of each meta-model within each cluster.
for model_meta in t_models:
    pred_col = f"{model_meta}_preds"

    for cluster in sorted(temp_df.cluster.unique()):

        print(cluster)
        cluster_temp_df = temp_df.loc[temp_df.cluster==cluster, :]

        mae = mean_absolute_error(y_true=cluster_temp_df["rouge"],
                                  y_pred=cluster_temp_df[pred_col])
        rmse = math.sqrt(mean_squared_error(y_true=cluster_temp_df["rouge"],
                                            y_pred=cluster_temp_df[pred_col]))

        t_res = pd.DataFrame(
            data={
                "model_base": "all",
                "model_meta": model_meta,
                "cluster": cluster,
                "rmse": rmse,
                "mae": mae,
            },
            index=[0],
        )
        results_test_df = pd.concat([results_test_df, t_res], axis=0)

# Test error of each meta-model over all rows, stored under cluster "full".
for model_meta in t_models:
    pred_col = f"{model_meta}_preds"

    mae = mean_absolute_error(y_true=temp_df["rouge"], y_pred=temp_df[pred_col])
    rmse = math.sqrt(mean_squared_error(y_true=temp_df["rouge"], y_pred=temp_df[pred_col]))

    t_res = pd.DataFrame(
        data={
            "model_base": "all",
            "model_meta": model_meta,
            "cluster": "full",
            "rmse": rmse,
            "mae": mae,
        },
        index=[0],
    )
    results_test_df = pd.concat([results_test_df, t_res], axis=0)


results_test_df = results_test_df.sort_values(["model_meta", "cluster"])


(56000, 7193)
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19308
[LightGBM] [Info] Number of data points in the train set: 56000, number of used features: 1384
[LightGBM] [Info] Start training from score 0.245621
catboost
Learning rate set to 0.077338
0:	learn: 0.1648190	total: 11.8ms	remaining: 11.8s
1:	learn: 0.1637201	total: 20.9ms	remaining: 10.4s
2:	learn: 0.1627315	total: 28.8ms	remaining: 9.57s
3:	learn: 0.1619450	total: 36.9ms	remaining: 9.19s
4:	learn: 0.1612320	total: 45.1ms	remaining: 8.97s
5:	learn: 0.1605757	total: 53.5ms	remaining: 8.87s
6:	learn: 0.1600621	total: 62ms	remaining: 8.8s
7:	learn: 0.1596135	total: 70.1ms	remaining: 8.69s
8:	learn: 0.1592122	total: 79ms	remaining: 8.7s
9:	learn: 0.1588794	total: 87.3ms	remaining: 8.64s
10:	learn: 0.1585710	t

In [21]:
# Display the test-set summary table (sorted by meta-model and cluster).
results_test_df

Unnamed: 0,model_base,model_meta,cluster,rmse,mae
0,all,catboost,0,0.171658,0.140225
0,all,catboost,1,0.173904,0.140336
0,all,catboost,2,0.17474,0.139906
0,all,catboost,3,0.154668,0.124224
0,all,catboost,full,0.170642,0.13757
0,all,lgbm,0,0.171375,0.140014
0,all,lgbm,1,0.173886,0.140667
0,all,lgbm,2,0.173727,0.138906
0,all,lgbm,3,0.154303,0.124064
0,all,lgbm,full,0.170326,0.137464
