In [1]:
import sys
sys.path.append("../")
import numpy as np
import time
import pandas as pd
import pickle
import math
from typing import Tuple


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.sparse import hstack

# Suffix of the analysis run: it selects the results folder that is read below
# and is embedded in the file names of the pickled vectorizer / regressors.
ANALYSIS_POSTFIX = "mined_sudden_2024-08-26"

# Settings dictionary handed to every helper in this notebook.
# "RS" is presumably a random seed -- TODO confirm where it is meant to be used.
experiment_config = dict(
    RS=42,
    ANALYSIS_POSTFIX=ANALYSIS_POSTFIX,
    FEATURE_MODE="CODE",  # the other mode handled by the helpers is "CODE_MODEL"
)

In [2]:
def step_two(experiment_config, 
             X_train,
             y_train,
             model,
             X_val=None,
             y_val=None,
             save=False): 
    """Fit one meta-regressor and either pickle it or score it on validation data.

    Parameters
    ----------
    experiment_config : dict
        Must contain ``"ANALYSIS_POSTFIX"`` (part of the pickle file name).
        ``"RS"``, when present, is used as the random-forest seed.
    X_train, y_train
        Training features and targets, passed straight to the regressor's ``fit``.
    model : str
        One of ``"lr"``, ``"svm"``, ``"rf"``, ``"lgbm"``, ``"catboost"``.
    X_val, y_val
        Validation features and targets; required when ``save`` is False.
    save : bool
        True: pickle the fitted regressor under ``./models`` and return the path.
        False: predict on ``X_val`` and return predictions with error metrics.

    Returns
    -------
    str
        Path of the pickled regressor (``save=True``).
    dict
        Keys ``"pred"`` (predictions with negatives clipped to 0), ``"mae"``,
        ``"rmse"``, ``"time_training"`` and ``"time_inference"`` in seconds
        (``save=False``).

    Raises
    ------
    ValueError
        If ``model`` is not supported, or validation data is missing while
        ``save`` is False.
    """
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]

    # Fail early and explicitly instead of hitting an unbound ``reg`` after the if-chain.
    supported_models = ("lr", "svm", "rf", "lgbm", "catboost")
    if model not in supported_models:
        raise ValueError(f"Unknown model {model!r}; expected one of {supported_models}")
    # Check before training so a long fit is not wasted on a call that cannot be scored.
    if not save and (X_val is None or y_val is None):
        raise ValueError("X_val and y_val are required when save=False")

    training_start_time = time.time()
    if model == "lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model == "svm":
        reg = SVR().fit(X_train, y_train)
    elif model == "rf":
        # The estimator must be instantiated before fitting; calling ``fit`` on the
        # class itself passes X_train as ``self`` and raises.
        reg = RandomForestRegressor(random_state=experiment_config.get("RS")).fit(X_train, y_train)
    elif model == "lgbm":
        # ``silent`` is no longer an LGBMRegressor argument in LightGBM 4.x;
        # ``verbose=-1`` suppresses the training log in old and new versions.
        reg = LGBMRegressor(max_depth=10, verbose=-1)
        reg.fit(X=X_train, y=y_train)
    else:  # "catboost" -- the only remaining supported value
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)
    time_training = time.time() - training_start_time

    if save:
        model_path = f'./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(reg, f)
        return model_path

    inference_start_time = time.time()
    y_pred = reg.predict(X_val)
    time_inference = time.time() - inference_start_time

    # Negative predictions are clipped to zero before scoring.
    y_pred[y_pred < 0] = 0
    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred, "mae": mae, "rmse": rmse, "time_training" : time_training, "time_inference" : time_inference}
    

def cv_step_2(experiment_config:dict, cv_df:pd.DataFrame) -> pd.DataFrame:
    """Produce out-of-fold meta-model predictions for every row of ``cv_df``.

    For each fold id in ``0 .. cv_df.fold.max()`` a TF-IDF vectorizer is fitted on
    the ``input_sequence`` of all other folds, every meta-model ("lr", "svm",
    "lgbm", "catboost") is trained through ``step_two`` on the ``rouge`` target,
    and its predictions for the held-out fold are stored in ``<model>_perf_hat``.

    Parameters
    ----------
    experiment_config : dict
        Needs ``"FEATURE_MODE"`` ("CODE": TF-IDF only; "CODE_MODEL": one-hot
        ``model_set`` columns stacked in front of the TF-IDF features) plus
        whatever ``step_two`` reads.
    cv_df : pd.DataFrame
        Needs the columns ``fold``, ``input_sequence`` and ``rouge`` (and
        ``model_set`` in "CODE_MODEL" mode). It is not modified.

    Returns
    -------
    pd.DataFrame
        A copy of ``cv_df`` with a fresh 0..n-1 index and one
        ``<model>_perf_hat`` column per meta-model.

    Raises
    ------
    ValueError
        If ``FEATURE_MODE`` is neither "CODE" nor "CODE_MODEL".
    """
    FEATURE_MODE = experiment_config["FEATURE_MODE"]
    if FEATURE_MODE not in ("CODE", "CODE_MODEL"):
        raise ValueError(f"Unknown FEATURE_MODE {FEATURE_MODE!r}; expected 'CODE' or 'CODE_MODEL'")

    t_models = ["lr", "svm", "lgbm", "catboost"]

    # Work on a private copy: prediction columns are assigned below, and the caller
    # may pass a slice of a larger frame (which would trigger SettingWithCopyWarning
    # and silently mutate the caller's data).
    cv_df = cv_df.copy()

    for test_fold in range(cv_df.fold.max()+1):
        print(test_fold)

        train_mask = cv_df.fold != test_fold
        val_mask = cv_df.fold == test_fold

        # The vectorizer is fitted on the training folds only, then applied to the held-out fold.
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(cv_df.loc[train_mask, "input_sequence"])
        X_val_tfidf = vectorizer.transform(cv_df.loc[val_mask, "input_sequence"])

        if FEATURE_MODE == "CODE_MODEL":
            # NOTE(review): the dummies are built separately for train and validation rows;
            # if a model_set value is missing from one side the columns will not line up --
            # confirm every fold contains every model_set.
            X_train_column_sparse = pd.get_dummies(cv_df.loc[train_mask, "model_set"], sparse=True).sparse.to_coo().tocsr()
            X_train = hstack([X_train_column_sparse, X_train_tfidf])
            X_val_column_sparse = pd.get_dummies(cv_df.loc[val_mask, "model_set"], sparse=True).sparse.to_coo().tocsr()
            X_val = hstack([X_val_column_sparse, X_val_tfidf])
        else:  # "CODE"
            X_train = X_train_tfidf
            X_val = X_val_tfidf

        y_train = cv_df.loc[train_mask, "rouge"]
        y_val = cv_df.loc[val_mask, "rouge"]

        for model in t_models:
            print(model)
            preds_df = step_two(experiment_config=experiment_config,
                                X_train=X_train,
                                y_train=y_train,
                                X_val=X_val,
                                y_val=y_val,
                                model=model)
            # Predictions come back in the row order of the held-out fold.
            cv_df.loc[val_mask, f"{model}_perf_hat"] = preds_df["pred"]

    cv_df = cv_df.reset_index(drop=True)

    return cv_df

def full_step_2(cv_df:pd.DataFrame,
                experiment_config:dict) -> None:
    """Train every meta-model on all non-ensemble rows and pickle the artefacts.

    A TF-IDF vectorizer is fitted on ``input_sequence`` of the rows whose
    ``model_set`` is not "ensemble" and written to
    ``./models/vectorizer_<ANALYSIS_POSTFIX>.pkl``. Each meta-model ("lr", "svm",
    "lgbm", "catboost") is then trained on the ``rouge`` target and pickled by
    ``step_two(..., save=True)``.

    Parameters
    ----------
    cv_df : pd.DataFrame
        Needs the columns ``model_set``, ``input_sequence`` and ``rouge``.
    experiment_config : dict
        Needs ``"ANALYSIS_POSTFIX"`` and ``"FEATURE_MODE"`` ("CODE" or "CODE_MODEL").

    Raises
    ------
    ValueError
        If ``FEATURE_MODE`` is neither "CODE" nor "CODE_MODEL". Raised before
        anything is fitted or written.
    """
    import os  # local import: only needed to make sure the output folder exists

    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]
    if FEATURE_MODE not in ("CODE", "CODE_MODEL"):
        raise ValueError(f"Unknown FEATURE_MODE {FEATURE_MODE!r}; expected 'CODE' or 'CODE_MODEL'")

    # TRAIN ON ALL PREDICTIONS AT ONCE
    t_models = ["lr", "svm", "lgbm", "catboost"]

    # Rows produced by the "ensemble" setting are excluded from training.
    train_mask = cv_df.model_set != "ensemble"

    # Prepare the input data
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(cv_df.loc[train_mask, "input_sequence"])
    if FEATURE_MODE == "CODE_MODEL":
        X_train_column_sparse = pd.get_dummies(cv_df.loc[train_mask, "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_train = hstack([X_train_column_sparse, X_train_tfidf])
    else:  # "CODE"
        X_train = X_train_tfidf

    y_train = cv_df.loc[train_mask, "rouge"]

    # The vectorizer and the regressors are all written below ./models.
    os.makedirs("./models", exist_ok=True)
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
        pickle.dump(vectorizer, file, protocol=pickle.HIGHEST_PROTOCOL)

    for model in t_models:
        print(model)
        step_two(experiment_config=experiment_config,
                 X_train=X_train,
                 y_train=y_train,
                 model=model,
                 save=True)
        
def pred_perf(experiment_config,
              X,
              model): 
    """Load a pickled meta-regressor and return its predictions for ``X``.

    The regressor is read from ``./models/reg_<model>_<ANALYSIS_POSTFIX>.pkl``.
    Negative predictions are replaced by 0 before the result is returned.
    """
    postfix = experiment_config["ANALYSIS_POSTFIX"]

    # NOTE: unpickling runs code stored in the file -- only load artefacts written by this project.
    with open(f'./models/reg_{model}_{postfix}.pkl','rb') as model_file:
        regressor = pickle.load(model_file)

    predictions = regressor.predict(X)
    below_zero = predictions < 0
    predictions[below_zero] = 0
    return predictions

def meta_predict(experiment_config:dict, 
                 test_df: pd.DataFrame,
                 base_models_names: list,
                 t_models:list = ["svm", "catboost"]) -> pd.DataFrame:
    """Predict the performance of every base-model setting with the saved meta-models.

    ``test_df`` is replicated once per entry of ``base_models_names`` (its
    ``model_set`` column is overwritten with that entry), the replicas are stacked,
    features are built with the pickled vectorizer, and each meta-model in
    ``t_models`` adds a ``<model>_preds`` column via ``pred_perf``.

    Parameters
    ----------
    experiment_config : dict
        Needs ``"ANALYSIS_POSTFIX"`` and ``"FEATURE_MODE"`` ("CODE" or "CODE_MODEL").
    test_df : pd.DataFrame
        Needs the column ``input_sequence``. It is not modified.
    base_models_names : list
        ``model_set`` values to predict for; must not be empty.
    t_models : list
        Names of the pickled meta-models to apply. The default list is only
        iterated, never mutated.

    Returns
    -------
    pd.DataFrame
        The stacked replicas with a fresh 0..n-1 index and the prediction columns.

    Raises
    ------
    ValueError
        If ``FEATURE_MODE`` is unknown or ``base_models_names`` is empty.
    """
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]
    FEATURE_MODE = experiment_config["FEATURE_MODE"]
    if FEATURE_MODE not in ("CODE", "CODE_MODEL"):
        raise ValueError(f"Unknown FEATURE_MODE {FEATURE_MODE!r}; expected 'CODE' or 'CODE_MODEL'")
    if len(base_models_names) == 0:
        raise ValueError("base_models_names must contain at least one model set")

    # The vectorizer does not depend on the model set, so it is loaded once.
    # NOTE: unpickling runs code stored in the file -- only load artefacts written by this project.
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "rb") as file:
        vectorizer = pickle.load(file)

    # One replica of the test frame per base-model setting, stacked in a single concat.
    replicas = []
    for model_set in base_models_names:
        set_df = test_df.copy()
        set_df["model_set"] = model_set
        replicas.append(set_df)
    meta_preds_df = pd.concat(replicas)

    # Prepare the input data
    X_test_tfidf = vectorizer.transform(meta_preds_df.loc[:, "input_sequence"])
    if FEATURE_MODE == "CODE_MODEL":
        # NOTE(review): the dummy columns follow the model sets present here; confirm
        # they match the columns the regressors were trained on.
        X_test_column_sparse = pd.get_dummies(meta_preds_df.loc[:, "model_set"], sparse=True).sparse.to_coo().tocsr()
        X_test = hstack([X_test_column_sparse, X_test_tfidf])
    else:  # "CODE"
        X_test = X_test_tfidf

    for model in t_models:
        print(model)
        meta_preds_df[f"{model}_preds"] = pred_perf(experiment_config=experiment_config, 
                                                    X=X_test,
                                                    model=model)

    meta_preds_df = meta_preds_df.reset_index(drop=True)
    return meta_preds_df

In [3]:
# Read the pickled cross-validation and test prediction frames of the selected analysis run.
# NOTE: unpickling runs code stored in the file -- only open results produced by this project.
with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/cv_results.pickle", "rb") as cv_handle:
    cv_predictions = pickle.load(cv_handle)

with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/test_results.pickle", "rb") as test_handle:
    test_predictions = pickle.load(test_handle)


In [4]:
# Columns kept from the test predictions frame.
COLUMNS_TEST = [
    'question_id', 'parent_answer_post_id', 'prob', 'input_sequence',
    'output_sequence', 'id', 'snippet_len', 'intent_len', 'snippet_token_n',
    'intent_token_n', 'cluster', 'input_ids', 'attention_mask', 'labels',
    'prediction', 'rouge', 'model_set',
]

# The cross-validation frame keeps the same columns plus the fold id.
COLUMNS_CV = [*COLUMNS_TEST, "fold"]

#### Preprocessing

In [5]:
# Drop the rows of the "ensemble" model set and keep only the columns used below.
# Each frame is filtered with a mask built from its OWN model_set column: a mask
# taken from cv_predictions carries the wrong index for test_predictions and does
# not describe which test rows belong to the ensemble.
cv_predictions = cv_predictions.loc[cv_predictions.model_set!="ensemble", COLUMNS_CV]
test_predictions = test_predictions.loc[test_predictions.model_set!="ensemble", COLUMNS_TEST]

# Code Only

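We have 9 base-learner settings. We compare three ways of training the meta-model: one meta-model per base-learner setting (ONE-BY-ONE), the settings split between two meta-models (TWO-MODELS), and a single meta-model for all settings together (ALL).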

In [6]:
# model_set values of the base-learner settings that are evaluated one at a time below.
MODELS_LIST = [
    0, 1, 2, 5, 10,
    'cluster_[1]', 'cluster_[4]', 'cluster_[3]', 'cluster_[0, 1, 4]',
]
# Names of the meta-model training modes.
MODE = [
    "ONE-BY-ONE",
    "TWO-MODELS",
    "ALL",
]

In [None]:
# Cross-validated evaluation: for every base-model setting, obtain out-of-fold
# predictions of each meta-model and score them per cluster and on all rows ("full").
t_models = ["lr", "svm", "lgbm", "catboost"]

# Metric rows are collected in a list and converted to a frame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it on every row.
cv_metric_rows = []

for model_base in MODELS_LIST:

    # .copy(): cv_step_2 adds prediction columns, which must not be written into
    # a slice of cv_predictions.
    temp_df = cv_predictions.loc[cv_predictions.model_set==model_base].copy()
    temp_df = cv_step_2(experiment_config=experiment_config,
                        cv_df=temp_df)

    # Every cluster on its own, followed by all rows together (labelled "full").
    evaluation_subsets = [(cluster, temp_df.loc[temp_df.cluster==cluster, :])
                          for cluster in sorted(temp_df.cluster.unique())]
    evaluation_subsets.append(("full", temp_df))

    for model_meta in t_models:
        for cluster, subset_df in evaluation_subsets:
            y_true = subset_df.loc[:, "rouge"]
            y_hat = subset_df.loc[:, f"{model_meta}_perf_hat"]
            cv_metric_rows.append({
                "model_base": model_base,
                "model_meta": model_meta,
                "cluster": cluster,
                "rmse": math.sqrt(mean_squared_error(y_true=y_true, y_pred=y_hat)),
                "mae": mean_absolute_error(y_true=y_true, y_pred=y_hat),
            })

results_cv_df = pd.DataFrame(cv_metric_rows).sort_values(["model_meta", "cluster"], ignore_index=True)


In [8]:
# Distribution of the cross-validated MAE over the base-model settings,
# for each (meta-model, cluster) pair.
print("MAE")
cv_mae_summary = results_cv_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe()
display(cv_mae_summary)

MAE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.118702,0.013585,0.094472,0.110511,0.124659,0.128642,0.131838
1,catboost,1,9.0,0.124568,0.018065,0.091059,0.114208,0.129422,0.135969,0.1436
2,catboost,2,9.0,0.123986,0.010899,0.105216,0.112683,0.130097,0.131667,0.132326
3,catboost,3,9.0,0.128527,0.007383,0.114903,0.123806,0.131789,0.133732,0.136306
4,catboost,4,9.0,0.129966,0.012629,0.101976,0.128909,0.132864,0.13607,0.144533
5,catboost,full,9.0,0.125645,0.009843,0.107373,0.117284,0.131591,0.132126,0.132697
6,lgbm,0,9.0,0.117401,0.013617,0.09257,0.110821,0.124408,0.127109,0.130118
7,lgbm,1,9.0,0.121329,0.015229,0.092765,0.115476,0.125191,0.130848,0.140001
8,lgbm,2,9.0,0.123957,0.010929,0.104743,0.113392,0.130213,0.131641,0.132044
9,lgbm,3,9.0,0.128401,0.006966,0.115004,0.123805,0.130787,0.132393,0.13623


In [9]:
# Distribution of the cross-validated RMSE over the base-model settings,
# for each (meta-model, cluster) pair.
print("RMSE")
cv_rmse_summary = results_cv_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe()
display(cv_rmse_summary)

RMSE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.146854,0.017553,0.112115,0.142296,0.152608,0.158619,0.164319
1,catboost,1,9.0,0.15634,0.02336,0.112536,0.138759,0.168495,0.169469,0.18086
2,catboost,2,9.0,0.153934,0.013872,0.13064,0.139461,0.161766,0.164074,0.164443
3,catboost,3,9.0,0.160873,0.008764,0.144274,0.156638,0.163208,0.167278,0.17085
4,catboost,4,9.0,0.16073,0.015719,0.125793,0.158634,0.163668,0.167928,0.178489
5,catboost,full,9.0,0.156539,0.012165,0.133906,0.146998,0.164123,0.164427,0.165436
6,lgbm,0,9.0,0.14581,0.016009,0.112056,0.144008,0.151588,0.157188,0.159157
7,lgbm,1,9.0,0.153347,0.021138,0.111786,0.141441,0.162259,0.165784,0.177934
8,lgbm,2,9.0,0.153793,0.013755,0.130136,0.140757,0.16163,0.163766,0.16414
9,lgbm,3,9.0,0.16026,0.008563,0.143615,0.155861,0.162463,0.165851,0.170334


In [None]:
# Test-set evaluation: for every base-model setting, train the meta-models on the
# cross-validation predictions, apply them to the test predictions and score the
# result per cluster and on all rows ("full").
t_models = ["lr", "svm", "lgbm", "catboost"]

# Metric rows are collected in a list and converted to a frame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it on every row.
test_metric_rows = []

for model_base in MODELS_LIST:

    cv_temp_df = cv_predictions.loc[cv_predictions.model_set==model_base]
    temp_df = test_predictions.loc[test_predictions.model_set==model_base]

    # Fits and pickles the vectorizer and every meta-model for this setting;
    # the files are overwritten on the next iteration.
    full_step_2(experiment_config=experiment_config,
                cv_df=cv_temp_df)
    temp_df = meta_predict(experiment_config=experiment_config,
                           test_df=temp_df,
                           base_models_names=[model_base],
                           t_models=t_models)

    # Every cluster on its own, followed by all rows together (labelled "full").
    evaluation_subsets = [(cluster, temp_df.loc[temp_df.cluster==cluster, :])
                          for cluster in sorted(temp_df.cluster.unique())]
    evaluation_subsets.append(("full", temp_df))

    for model_meta in t_models:
        for cluster, subset_df in evaluation_subsets:
            y_true = subset_df.loc[:, "rouge"]
            y_hat = subset_df.loc[:, f"{model_meta}_preds"]
            test_metric_rows.append({
                "model_base": model_base,
                "model_meta": model_meta,
                "cluster": cluster,
                "rmse": math.sqrt(mean_squared_error(y_true=y_true, y_pred=y_hat)),
                "mae": mean_absolute_error(y_true=y_true, y_pred=y_hat),
            })

results_test_df = pd.DataFrame(test_metric_rows).sort_values(["model_meta", "cluster"], ignore_index=True)


In [12]:
# Distribution of the test-set MAE over the base-model settings,
# for each (meta-model, cluster) pair.
print("MAE")
test_mae_summary = results_test_df.groupby(["model_meta", "cluster"], as_index=False)["mae"].describe()
display(test_mae_summary)

MAE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.099943,0.012294,0.082306,0.093802,0.099587,0.110344,0.114731
1,catboost,1,9.0,0.125461,0.022718,0.091342,0.11076,0.126465,0.137644,0.164268
2,catboost,2,9.0,0.121976,0.01002,0.101762,0.116272,0.121297,0.129501,0.132367
3,catboost,3,9.0,0.128567,0.008831,0.109071,0.125337,0.129826,0.135653,0.137317
4,catboost,4,9.0,0.12578,0.010436,0.101919,0.126765,0.128117,0.130919,0.138723
5,catboost,full,9.0,0.125402,0.009876,0.102227,0.127252,0.128291,0.129254,0.136591
6,lgbm,0,9.0,0.101503,0.011873,0.085727,0.092795,0.098006,0.113473,0.115499
7,lgbm,1,9.0,0.12813,0.017924,0.102463,0.126106,0.126733,0.136146,0.159502
8,lgbm,2,9.0,0.122381,0.009781,0.101824,0.118742,0.123296,0.129533,0.133236
9,lgbm,3,9.0,0.128404,0.008755,0.109629,0.127575,0.129813,0.133205,0.137177


In [13]:
# Distribution of the test-set RMSE over the base-model settings,
# for each (meta-model, cluster) pair.
print("RMSE")
test_rmse_summary = results_test_df.groupby(["model_meta", "cluster"], as_index=False)["rmse"].describe()
display(test_rmse_summary)

RMSE


Unnamed: 0,model_meta,cluster,count,mean,std,min,25%,50%,75%,max
0,catboost,0,9.0,0.120759,0.014657,0.097468,0.105388,0.123114,0.132817,0.135928
1,catboost,1,9.0,0.150873,0.021973,0.119166,0.139747,0.147317,0.15646,0.196427
2,catboost,2,9.0,0.14908,0.014553,0.116408,0.14721,0.149288,0.159773,0.16277
3,catboost,3,9.0,0.160826,0.010634,0.136926,0.158415,0.161251,0.168358,0.17199
4,catboost,4,9.0,0.155109,0.0132,0.124062,0.156147,0.157498,0.15959,0.173144
5,catboost,full,9.0,0.154675,0.012724,0.123981,0.15666,0.158203,0.158757,0.170306
6,lgbm,0,9.0,0.123899,0.011081,0.103186,0.120836,0.123776,0.133226,0.139913
7,lgbm,1,9.0,0.155761,0.020467,0.129412,0.144882,0.150646,0.171278,0.19459
8,lgbm,2,9.0,0.149613,0.014395,0.117437,0.147992,0.150723,0.159092,0.165071
9,lgbm,3,9.0,0.159787,0.010295,0.138624,0.158627,0.159997,0.167016,0.170951
