In [28]:
import sys
sys.path.append("../")
import numpy as np
import time
import pandas as pd
import pickle
import math
from typing import Tuple


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

from scipy.sparse import hstack

# Suffix identifying the analysis run; it is interpolated into the result and
# model file paths used further down in this notebook.
ANALYSIS_POSTFIX = "mined_sudden_2024-08-26"

# Settings dictionary handed to the step-two helper functions.
experiment_config = {
    "RS" : 42,  # presumably a random seed — TODO confirm
    "ANALYSIS_POSTFIX": ANALYSIS_POSTFIX
}

In [29]:
def step_two(experiment_config,
             X_train,
             y_train,
             model,
             X_val=None,
             y_val=None,
             save=False):
    """Train one regressor, then either pickle it or evaluate it on validation data.

    Parameters
    ----------
    experiment_config : dict
        Must contain "ANALYSIS_POSTFIX"; it is used in the pickle file name
        when ``save`` is True.
    X_train, y_train
        Training features and targets, passed unchanged to the estimator's ``fit``.
    model : str
        Which estimator to train: "lr", "svm", "rf", "lgbm" or "catboost".
    X_val, y_val
        Validation features and targets. Required when ``save`` is False.
    save : bool
        If True, pickle the fitted estimator to
        ``./models/reg_{model}_{ANALYSIS_POSTFIX}.pkl`` and return that path.
        If False, predict on ``X_val`` and return the evaluation results.

    Returns
    -------
    str or dict
        The pickle path when ``save`` is True; otherwise a dict with keys
        "pred", "mae", "rmse", "time_training" and "time_inference".

    Raises
    ------
    ValueError
        If ``model`` is not a supported name, or if ``save`` is False and
        ``X_val`` / ``y_val`` is missing.
    """
    # Validate up front so a typo fails immediately instead of surfacing as an
    # unbound `reg` after the if-chain falls through.
    supported_models = ("lr", "svm", "rf", "lgbm", "catboost")
    if model not in supported_models:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {supported_models}"
        )
    # Evaluation mode needs validation data; check before the costly fit.
    if not save and (X_val is None or y_val is None):
        raise ValueError("X_val and y_val are required when save=False")

    analysis_postfix = experiment_config["ANALYSIS_POSTFIX"]

    training_start_time = time.time()
    if model == "lr":
        reg = LinearRegression().fit(X_train, y_train)
    elif model == "svm":
        reg = SVR().fit(X_train, y_train)
    elif model == "rf":
        # The estimator must be instantiated; calling `fit` on the class itself
        # binds X_train as `self` and fails.
        reg = RandomForestRegressor().fit(X_train, y_train)
    elif model == "lgbm":
        reg = LGBMRegressor(max_depth=10)
        reg.fit(X=X_train, y=y_train)
    else:  # "catboost" -- the only remaining supported name
        reg = CatBoostRegressor()
        reg.fit(X=X_train, y=y_train)
    time_training = time.time() - training_start_time

    if save:
        model_path = f'./models/reg_{model}_{analysis_postfix}.pkl'
        with open(model_path, 'wb') as f:
            pickle.dump(reg, f)
        return model_path

    inference_start_time = time.time()
    y_pred = reg.predict(X_val)
    time_inference = time.time() - inference_start_time

    # Negative predictions are clamped to zero before scoring.
    y_pred[y_pred < 0] = 0
    mae = mean_absolute_error(y_true=y_val, y_pred=y_pred)
    rmse = math.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
    return {"pred": y_pred,
            "mae": mae,
            "rmse": rmse,
            "time_training": time_training,
            "time_inference": time_inference}
    

def cv_step_2(experiment_config:dict, cv_df:pd.DataFrame) -> Tuple:
    """Cross-validate the step-two regressors over the folds stored in ``cv_df``.

    For each distinct value of the ``fold`` column, the rows of that fold are
    used for validation and all other rows for training. Features are the
    one-hot encoded ``model_set`` column stacked with TF-IDF features of
    ``input_sequence``; the target is ``rouge``. Each model in
    ``["lr", "svm", "lgbm", "catboost"]`` is trained through ``step_two`` and
    its out-of-fold predictions are written to a ``{model}_perf_hat`` column.

    Parameters
    ----------
    experiment_config : dict
        Forwarded unchanged to ``step_two``.
    cv_df : pd.DataFrame
        Must contain the columns ``fold``, ``input_sequence``, ``model_set``
        and ``rouge``. The frame passed in is not modified.

    Returns
    -------
    Tuple
        ``(cv_df, model_results)``: a copy of the input with a reset index and
        the added prediction columns, and a dict keyed by model name holding
        the per-fold "rmse" / "mae" lists plus their "_avg" and "_std".
    """
    t_models = ["lr", "svm", "lgbm", "catboost"]

    # Work on a private copy: prediction columns are written below and the
    # caller may hand in a filtered slice of a larger frame.
    cv_df = cv_df.copy()

    # Iterate over the folds actually present, and reuse the same list for the
    # aggregation so the two loops can never disagree on the fold count.
    fold_ids = sorted(cv_df["fold"].dropna().unique().tolist())

    # Fix the one-hot layout once for the whole frame, so training and
    # validation matrices always have the same columns in the same order even
    # if a fold is missing some model_set value.
    model_set_categories = pd.Categorical(cv_df["model_set"]).categories

    def _one_hot_model_set(row_mask):
        """Sparse CSR one-hot encoding of model_set for the selected rows."""
        aligned = pd.Categorical(cv_df.loc[row_mask, "model_set"],
                                 categories=model_set_categories)
        return pd.get_dummies(aligned, sparse=True).sparse.to_coo().tocsr()

    results = {}

    for test_fold in fold_ids:
        print(test_fold)

        is_train = cv_df["fold"] != test_fold
        is_val = cv_df["fold"] == test_fold

        # Prepare the input data; the vectorizer is fitted on training rows only.
        vectorizer = TfidfVectorizer()
        X_train_tfidf = vectorizer.fit_transform(cv_df.loc[is_train, "input_sequence"])
        X_train = hstack([_one_hot_model_set(is_train), X_train_tfidf])
        y_train = cv_df.loc[is_train, "rouge"]

        X_val_tfidf = vectorizer.transform(cv_df.loc[is_val, "input_sequence"])
        X_val = hstack([_one_hot_model_set(is_val), X_val_tfidf])
        y_val = cv_df.loc[is_val, "rouge"]

        results[test_fold] = {}
        for model in t_models:
            print(model)
            preds_df = step_two(experiment_config=experiment_config,
                                X_train=X_train,
                                y_train=y_train,
                                X_val=X_val,
                                y_val=y_val,
                                model=model)
            cv_df.loc[is_val, f"{model}_perf_hat"] = preds_df["pred"]
            results[test_fold][model] = preds_df

    cv_df = cv_df.reset_index(drop=True)

    # ENSEMBLE ESTIMATE (JUST HIGHEST PREDICTIONS) -- currently disabled
    # models_index = cv_df.groupby("id")["catboost_perf_hat"].idxmax()
    # optimal_ensemble = cv_df.iloc[models_index][["id", "model_set"]]
    # optimal_ensemble_map = dict(zip(optimal_ensemble.id, optimal_ensemble.model_set))
    # cv_df["opt_es_id"] = cv_df.id.map(optimal_ensemble_map)
    # ensemble_preds = cv_df.loc[cv_df["model_set"]==cv_df["opt_es_id"], :]
    # ensemble_preds["rouge"].mean()
    # ensemble_preds["model_set"] = "ensemble"
    # cv_df = pd.concat([cv_df, ensemble_preds], axis=0)

    # Rearrange results: per-model lists over folds plus mean and std.
    model_results = {}

    for model in t_models:
        rmse_per_fold = [results[fold][model]["rmse"] for fold in fold_ids]
        mae_per_fold = [results[fold][model]["mae"] for fold in fold_ids]

        model_results[model] = {
            "rmse": rmse_per_fold,
            "mae": mae_per_fold,
            "rmse_avg": np.array(rmse_per_fold).mean(),
            "mae_avg": np.array(mae_per_fold).mean(),
            "rmse_std": np.array(rmse_per_fold).std(),
            "mae_std": np.array(mae_per_fold).std(),
        }

    for model in t_models:
        print(model)
        print("RMSE ", model_results[model]["rmse_avg"])
        print("MAE ",model_results[model]["mae_avg"])
        print("\n")

        print("RMSE STD ", model_results[model]["rmse_std"])
        print("MAE STD",model_results[model]["mae_std"])
        print("\n")

    return cv_df, model_results

def full_step_2(cv_df:pd.DataFrame,
                experiment_config:dict) -> None:
    """Fit the vectorizer and every step-two regressor on all non-ensemble rows.

    The fitted TF-IDF vectorizer is pickled to
    ``./models/vectorizer_{ANALYSIS_POSTFIX}.pkl``; each regressor is trained
    and pickled by ``step_two`` (called with ``save=True``). Nothing is returned.

    Parameters
    ----------
    cv_df : pd.DataFrame
        Must contain the columns ``model_set``, ``input_sequence`` and ``rouge``.
    experiment_config : dict
        Must contain "ANALYSIS_POSTFIX"; also forwarded to ``step_two``.
    """
    ANALYSIS_POSTFIX = experiment_config["ANALYSIS_POSTFIX"]

    # Train on everything at once, leaving out rows tagged "ensemble".
    keep = cv_df.model_set!="ensemble"

    # Features: one-hot model_set columns followed by TF-IDF of the input text.
    tfidf = TfidfVectorizer()
    text_features = tfidf.fit_transform(cv_df.loc[keep, "input_sequence"])
    model_set_dummies = pd.get_dummies(cv_df.loc[keep, "model_set"], sparse=True)
    model_set_features = model_set_dummies.sparse.to_coo().tocsr()
    features = hstack([model_set_features, text_features])
    targets = cv_df.loc[keep, "rouge"]

    # Persist the fitted vectorizer next to the models.
    with open(f"./models/vectorizer_{ANALYSIS_POSTFIX}.pkl", "wb") as file:
        pickle.dump(tfidf, file, protocol=pickle.HIGHEST_PROTOCOL)

    for model in ("lr", "svm", "lgbm", "catboost"):
        print(model)
        # step_two returns the saved path in this mode; it is not needed here.
        step_two(experiment_config=experiment_config,
                 X_train=features,
                 y_train=targets,
                 model=model,
                 save=True)

In [30]:
# Load the cross-validation results of the analysis run named by ANALYSIS_POSTFIX.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# presumably a pandas DataFrame (it is indexed with .loc further down) — TODO confirm
with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/cv_results.pickle", "rb") as handle:
    cv_predictions = pickle.load(handle)

# Load the matching test-set results from the same results directory.
with open(f"../ensemble_learning/reports/results/{ANALYSIS_POSTFIX}/test_results.pickle", "rb") as handle:
    test_predictions = pickle.load(handle)


#### Preprocessing

In [31]:
# Keep only the rows whose model_set is not "ensemble".
cv_predictions = cv_predictions.loc[cv_predictions.model_set!="ensemble", :]
# Each frame must be filtered with a mask built from itself: a mask derived
# from cv_predictions is aligned on index labels and would select the wrong
# rows of test_predictions (or raise when the indexes do not match).
test_predictions = test_predictions.loc[test_predictions.model_set!="ensemble", :]

# Code Only

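We have 9 base-learner settings (see `MODELS_LIST`). We compare three ways of training the meta-model: one meta-model per setting (ONE-BY-ONE), the settings split across two meta-models (TWO-MODELS), and a single meta-model trained on all settings together (ALL).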

In [32]:
# Base-learner settings; each value is compared against the `model_set` column
# of the prediction frames in the loop below.
MODELS_LIST = [0, 1, 2, 5, 10, 'cluster_[1]', 'cluster_[4]', 'cluster_[3]', 'cluster_[0, 1, 4]']
# Names of the three comparison modes; not referenced in the visible part of this notebook.
MODE = ["ONE-BY-ONE", "TWO-MODELS", "ALL"]

In [33]:
# Run the cross-validated step-two evaluation on the rows of one base-learner setting at a time.
for model_i, model in enumerate(MODELS_LIST):

    # Keep only the rows belonging to the current setting.
    temp_df =  cv_predictions.loc[cv_predictions.model_set==model]

    temp_df, model_results = cv_step_2(experiment_config=experiment_config,
              cv_df=temp_df)
    
    # NOTE(review): this unconditional break stops after the first entry of
    # MODELS_LIST, so only that setting is evaluated — confirm this is intended.
    break

0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003719 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2698
[LightGBM] [Info] Number of data points in the train set: 4666, number of used features: 144
[LightGBM] [Info] Start training from score 0.125930
catboost
Learning rate set to 0.052224
0:	learn: 0.1358222	total: 54.6ms	remaining: 54.5s
1:	learn: 0.1356581	total: 60.1ms	remaining: 30s
2:	learn: 0.1355014	total: 66.1ms	remaining: 22s
3:	learn: 0.1353616	total: 72.1ms	remaining: 18s
4:	learn: 0.1352198	total: 78.1ms	remaining: 15.5s
5:	learn: 0.1350561	total: 83.5ms	remaining: 13.8s
6:	learn: 0.1349760	total: 89.3ms	remaining: 12.7s
7:	learn: 0.1348568	total: 95.2ms	remaining: 11.8s
8:	learn: 0.1347464	total: 101ms	remaining: 11.1s
9:	learn: 0.1346470	total: 107ms	remaining: 10.6s
10:	learn: 0.1345521	total: 113ms	remai