In [None]:
!pip install koolbox scikit-learn==1.5.2

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from lightgbm import LGBMRegressor
from scipy.stats import pearsonr as pr
from xgboost import XGBRegressor
from sklearn.base import clone
from koolbox import Trainer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import optuna
import joblib
import glob
import gc

warnings.filterwarnings("ignore")

In [None]:
# Load four feature-ranking tables exported by a separate analysis dataset.
# Each table is read for its 'feature' column by the selection cell that follows.
# NOTE(review): presumably rf = random-forest importance, rfe = recursive feature
# elimination, tc = target correlation, fs = univariate scores, and each file is
# already sorted best-first (only the first 100 rows are used later) — confirm
# against the notebook that produced them.
feature_importance = pd.read_csv("/kaggle/input/marcus-crypto-analysis/rf_features.csv")
selected_features_rfe = pd.read_csv("/kaggle/input/marcus-crypto-analysis/rfe_features.csv")
target_correlations = pd.read_csv("/kaggle/input/marcus-crypto-analysis/tc_features.csv")
feature_scores = pd.read_csv("/kaggle/input/marcus-crypto-analysis/fs_features.csv")

In [None]:
# First 100 feature names from each ranking table, keyed by selection method.
important_features = {
    method: ranking.head(100)['feature'].tolist()
    for method, ranking in (
        ('top_univariate', feature_scores),
        ('top_rf_importance', feature_importance),
        ('rfe_selected', selected_features_rfe),
        ('high_target_corr', target_correlations),
    )
}

# Keep only the names that appear in every method's top-100 list.
common_features = None
for _method_names in important_features.values():
    _name_set = set(_method_names)
    common_features = _name_set if common_features is None else common_features & _name_set
print(f"  Common Features selected by all 4 methods: {(common_features)}")

In [None]:
# Sort the intersection before turning it into a list: iterating a set of
# strings depends on per-process hash randomisation, so list(set) would give a
# different column order on every kernel restart. Column order matters further
# down because the tree models subsample columns with a fixed seed.
common_features = sorted(common_features)

# The four raw quantity columns always come first; any of them that also appear
# in the intersection are skipped so no column is requested twice.
X_FEATURES = ['bid_qty', 'ask_qty', 'buy_qty', 'sell_qty']
X_FEATURES += [name for name in common_features if name not in X_FEATURES]

# Detect Outliers

Reference https://www.kaggle.com/code/isinsuu/drw-autoencoder-mlp-outlier

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
import warnings
warnings.filterwarnings('ignore')

def simple_prophet_outlier_detection(df, feature_col, timestamp_col='__index_level_0__'):
    """
    Flag outliers in a single feature by comparing hourly means with a Prophet fit.

    The feature is averaged into hourly bins, a Prophet model is fitted to those
    bins, and a bin is reported as an outlier only when it is BOTH outside the
    model's 95% uncertainty interval AND more than three residual standard
    deviations away from the fitted value. Two diagnostic plots are shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `timestamp_col` and `feature_col`.
    feature_col : str
        Name of the column to analyse.
    timestamp_col : str, default '__index_level_0__'
        Name of the datetime-like column used as the time axis (it becomes the
        index that `resample` operates on).

    Returns
    -------
    outlier_info : dict
        Keys 'n_outliers', 'outlier_pct' (percentage of hourly bins),
        'outlier_timestamps', 'outlier_values' and 'residual_std'.
        Timestamps and values refer to the hourly bins, not to raw rows.
    model : Prophet
        The fitted model.
    forecast : pandas.DataFrame
        Prophet's in-sample prediction for the hourly bins.
    """
    print(f"Detecting outliers in {feature_col} using Prophet...")

    # 1. Prepare data for Prophet (it expects the columns 'ds' and 'y')
    prophet_df = pd.DataFrame({
        'ds': df[timestamp_col],
        'y': df[feature_col]
    })

    # Remove obvious bad values (NaN / +-inf)
    prophet_df = prophet_df[np.isfinite(prophet_df['y'])]

    # Resample to hourly for efficiency (adjust based on your needs)
    prophet_hourly = prophet_df.set_index('ds').resample('1H').mean().reset_index()
    # Empty hours produce NaN means. Dropping them leaves gaps in the index, so
    # renumber the rows: every comparison below pairs this frame with the
    # forecast row by row and therefore needs matching 0..n-1 labels.
    prophet_hourly = prophet_hourly.dropna().reset_index(drop=True)

    # 2. Fit Prophet model
    model = Prophet(
        changepoint_prior_scale=0.05,  # Low value = less sensitive to outliers
        interval_width=0.95,  # 95% uncertainty interval
        yearly_seasonality=False,
        weekly_seasonality=True,
        daily_seasonality=True
    )

    model.fit(prophet_hourly)

    # 3. Generate in-sample predictions, labelled 0..n-1 like prophet_hourly
    forecast = model.predict(prophet_hourly).reset_index(drop=True)

    # 4. Identify outliers
    # Method 1: Points outside the uncertainty interval
    outliers_ci = (
        (prophet_hourly['y'] < forecast['yhat_lower']) |
        (prophet_hourly['y'] > forecast['yhat_upper'])
    )

    # Method 2: Points with large standardized residuals
    residuals = prophet_hourly['y'] - forecast['yhat']
    residual_std = residuals.std()
    outliers_residual = np.abs(residuals) > 3 * residual_std

    # Combine both methods: a bin must fail both checks to be flagged
    outliers = outliers_ci & outliers_residual

    # 5. Visualize
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))

    # Plot 1: Time series with outliers
    ax1.plot(prophet_hourly['ds'], prophet_hourly['y'], 'b.', alpha=0.5, label='Actual')
    ax1.plot(forecast['ds'], forecast['yhat'], 'g-', linewidth=2, label='Prophet Fit')
    ax1.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'],
                     alpha=0.2, color='green', label='95% CI')

    # Highlight outliers
    outlier_points = prophet_hourly[outliers]
    ax1.scatter(outlier_points['ds'], outlier_points['y'],
               color='red', s=100, edgecolor='darkred', linewidth=2,
               label=f'Outliers ({outliers.sum()})', zorder=10)

    ax1.set_title(f'Prophet Outlier Detection: {feature_col}')
    ax1.set_xlabel('Date')
    ax1.set_ylabel('Value')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: Residual distribution
    ax2.hist(residuals, bins=50, alpha=0.7, color='blue', edgecolor='black')
    ax2.axvline(x=-3*residual_std, color='red', linestyle='--', label='±3σ threshold')
    ax2.axvline(x=3*residual_std, color='red', linestyle='--')
    ax2.set_title('Residual Distribution')
    ax2.set_xlabel('Residual (Actual - Predicted)')
    ax2.set_ylabel('Frequency')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # 6. Return outlier information
    outlier_info = {
        'n_outliers': outliers.sum(),
        'outlier_pct': outliers.sum() / len(prophet_hourly) * 100,
        'outlier_timestamps': outlier_points['ds'].tolist(),
        'outlier_values': outlier_points['y'].tolist(),
        'residual_std': residual_std
    }

    print(f"Found {outlier_info['n_outliers']} outliers ({outlier_info['outlier_pct']:.2f}%)")

    return outlier_info, model, forecast


# Quick example for multiple features
def detect_outliers_multiple_features(data_path, features_to_check=None):
    """
    Run the Prophet outlier detection on several features and plot a summary.

    Parameters
    ----------
    data_path : str
        Path of a parquet file that holds the requested feature columns.
    features_to_check : sequence of str, optional
        Columns to analyse. Defaults to
        ['volume', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty'].

    Returns
    -------
    dict
        Maps each analysed feature name to the `outlier_info` dictionary
        returned by `simple_prophet_outlier_detection`.
    """
    if features_to_check is None:
        # Default to some common features
        features_to_check = ['volume', 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty']
    # Accept any sequence (tuple, pandas Index, ...); the concatenation below needs a list.
    features_to_check = list(features_to_check)

    # Load only the columns that are needed
    df = pd.read_parquet(data_path, columns=['__index_level_0__'] + features_to_check)

    # Ensure a timestamp column exists. If the loader did not return it as a
    # column, fall back to a synthetic one-minute grid.
    # 'min' is the non-deprecated spelling of the former 'T' alias.
    if '__index_level_0__' not in df.columns:
        df['__index_level_0__'] = pd.date_range('2023-03-01', periods=len(df), freq='min')

    # Analyze each feature
    outlier_summary = {}

    for feature in features_to_check:
        if feature in df.columns:
            print(f"\n{'='*60}")
            outlier_info, model, forecast = simple_prophet_outlier_detection(df, feature)
            outlier_summary[feature] = outlier_info

    # Summary plot: one bar per analysed feature
    fig, ax = plt.subplots(figsize=(10, 6))

    features = list(outlier_summary.keys())
    outlier_pcts = [outlier_summary[f]['outlier_pct'] for f in features]

    bars = ax.bar(features, outlier_pcts, color='coral', edgecolor='darkred', linewidth=2)

    # Highlight high outlier features
    for i, pct in enumerate(outlier_pcts):
        if pct > 1.0:  # More than 1% outliers
            bars[i].set_color('red')

    ax.set_title('Outlier Percentage by Feature', fontsize=14, fontweight='bold')
    ax.set_ylabel('Outlier %')
    ax.set_xlabel('Feature')
    ax.grid(True, alpha=0.3, axis='y')

    # Add value labels on top of each bar
    for bar, pct in zip(bars, outlier_pcts):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{pct:.2f}%', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    return outlier_summary


# Practical example: Using outlier detection for data cleaning
def clean_feature_outliers(df, feature_col, outlier_info, method='cap'):
    """
    Return a cleaned copy of `df` for one feature; the input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to clean. The 'remove' and 'interpolate' methods read the
        '__index_level_0__' column to locate outlier rows.
    feature_col : str
        Column to clean.
    outlier_info : dict
        Must contain 'outlier_timestamps' (a list of timestamps).
    method : {'cap', 'remove', 'interpolate'}, default 'cap'
        'remove'      drops rows whose timestamp is listed as an outlier.
        'cap'         clips the column to its own 1st-99th percentile range;
                      this ignores the listed timestamps.
        'interpolate' blanks the listed rows and fills them by linear
                      interpolation.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names. Previously an unknown
        name silently returned an unmodified copy.
    """
    supported_methods = ('remove', 'cap', 'interpolate')
    if method not in supported_methods:
        raise ValueError(
            f"Unknown method {method!r}; expected one of {supported_methods}"
        )

    # Get outlier timestamps
    outlier_timestamps = outlier_info['outlier_timestamps']

    # Work on a copy so the caller's frame stays untouched
    df_clean = df.copy()

    if method == 'remove':
        # Remove rows with outliers
        outlier_mask = df_clean['__index_level_0__'].isin(outlier_timestamps)
        # Report rows actually dropped, which can differ from the number of timestamps.
        n_removed = int(outlier_mask.sum())
        df_clean = df_clean[~outlier_mask]
        print(f"Removed {n_removed} rows with outliers")

    elif method == 'cap':
        # Clip to the [1st, 99th] percentile range of the column
        lower_cap = df_clean[feature_col].quantile(0.01)
        upper_cap = df_clean[feature_col].quantile(0.99)

        df_clean[feature_col] = df_clean[feature_col].clip(lower=lower_cap, upper=upper_cap)
        print(f"Capped {feature_col} to range [{lower_cap:.2f}, {upper_cap:.2f}]")

    else:  # method == 'interpolate'
        # Replace outliers with interpolated values
        outlier_mask = df_clean['__index_level_0__'].isin(outlier_timestamps)
        n_flagged = int(outlier_mask.sum())
        df_clean.loc[outlier_mask, feature_col] = np.nan
        df_clean[feature_col] = df_clean[feature_col].interpolate(method='linear')
        print(f"Interpolated {n_flagged} outlier values")

    return df_clean


# Example usage
if __name__ == "__main__":
    # Driver for the outlier workflow. It leaves `df`, `outlier_summary` and
    # `df_clean` in the notebook namespace; `df_clean` is the training frame
    # used by the modelling cells.

    # Example 1: Single feature analysis
    data_path = '/kaggle/input/drw-crypto-market-prediction/train.parquet'
    df = pd.read_parquet(data_path, columns=['__index_level_0__', 'volume', 'label'] + X_FEATURES)

    # If the loader did not return the timestamp as a column, fall back to a
    # synthetic one-minute grid. 'min' replaces the deprecated 'T' alias and
    # yields the same timestamps.
    if '__index_level_0__' not in df.columns:
        df['__index_level_0__'] = pd.date_range('2023-03-01', periods=len(df), freq='min')

    # Detect outliers in volume
    outlier_info, model, forecast = simple_prophet_outlier_detection(df, 'volume')

    # Example 2: Multiple features (re-reads the parquet file inside the helper)
    outlier_summary = detect_outliers_multiple_features(
        data_path,
        X_FEATURES
    )

    # Example 3: Clean the data
    # With method='cap' every analysed feature is clipped to its own
    # 1st-99th percentile range; the detected timestamps are not used.
    df_clean = df.copy()
    for feat in X_FEATURES:
        if feat in outlier_summary:
            df_clean = clean_feature_outliers(df_clean, feat, outlier_summary[feat], method='cap')
    print("\nOutlier detection complete!")

In [None]:
def reduce_mem_usage(dataframe, dataset):
    """
    Downcast numeric columns of `dataframe` to the smallest dtype that holds their range.

    Signed-integer columns are tried as int8, int16, int32, int64 and float
    columns as float16, float32; the first type whose range contains the
    column's min and max is used. Columns of any other dtype (bool, unsigned,
    object, datetime, pandas extension dtypes, ...) are left untouched, as are
    columns whose min/max cannot be compared (e.g. all-NaN).

    Note: the range check says nothing about precision. A float16 column keeps
    only about three significant decimal digits.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to shrink. It is modified in place.
    dataset : str
        Label used in the printed report.

    Returns
    -------
    pandas.DataFrame
        The same `dataframe` object, after downcasting.
    """
    print('Reducing memory usage for:', dataset)
    initial_mem_usage = dataframe.memory_usage().sum() / 1024**2

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in dataframe.columns:
        col_type = dataframe[col].dtype

        # Only plain NumPy signed-int / float columns are downcast. Anything
        # else used to fall into the float branch and either crashed (strings)
        # or was silently turned into float16 (bool, unsigned ints).
        if not isinstance(col_type, np.dtype):
            continue
        if np.issubdtype(col_type, np.signedinteger):
            candidates, limits_of = int_candidates, np.iinfo
        elif np.issubdtype(col_type, np.floating):
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        c_min = dataframe[col].min()
        c_max = dataframe[col].max()

        for candidate in candidates:
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            # Comparisons with NaN are False, so such columns keep their dtype.
            if limits.min <= c_min and c_max <= limits.max:
                dataframe[col] = dataframe[col].astype(candidate)
                break

    final_mem_usage = dataframe.memory_usage().sum() / 1024**2
    if initial_mem_usage:
        decrease_pct = 100 * (initial_mem_usage - final_mem_usage) / initial_mem_usage
    else:
        decrease_pct = 0.0
    print('--- Memory usage before: {:.2f} MB'.format(initial_mem_usage))
    print('--- Memory usage after: {:.2f} MB'.format(final_mem_usage))
    print('--- Decreased memory usage by {:.1f}%\n'.format(decrease_pct))

    return dataframe

In [None]:
# ===== Feature Engineering =====
def feature_engineering(data):
    """
    Add order-book / trade-flow features derived from five raw columns.

    Reads 'bid_qty', 'ask_qty', 'buy_qty', 'sell_qty' and 'volume' and appends
    29 derived columns (differences, sums, ratios, products and one
    exponentially weighted mean). Ratios add 1e-8 to the denominator to avoid
    division by zero, and any remaining +-inf is replaced by NaN.

    The input frame is NOT modified: the function works on a copy. Previously
    the new columns were written into the caller's frame as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the five raw columns listed above.

    Returns
    -------
    pandas.DataFrame
        Copy of `data` with the derived columns appended, in a fixed order.
    """
    data = data.copy()

    bid, ask = data['bid_qty'], data['ask_qty']
    buy, sell = data['buy_qty'], data['sell_qty']
    volume = data['volume']

    # Simple differences and sums
    data['bid_ask_spread_proxy'] = ask - bid
    data['total_liquidity'] = bid + ask
    data['trade_imbalance'] = buy - sell
    data['total_trades'] = buy + sell

    # Volume relative to traded quantities
    data['volume_per_trade'] = volume / (buy + sell + 1e-8)
    data['buy_volume_ratio'] = buy / (volume + 1e-8)
    data['sell_volume_ratio'] = sell / (volume + 1e-8)

    # Share of buys / sells in traded quantity
    data['buying_pressure'] = buy / (buy + sell + 1e-8)
    data['selling_pressure'] = sell / (buy + sell + 1e-8)

    # Order-book imbalance and depth
    data['order_imbalance'] = (bid - ask) / (bid + ask + 1e-8)
    data['order_imbalance_abs'] = np.abs(data['order_imbalance'])
    data['bid_liquidity_ratio'] = bid / (volume + 1e-8)
    data['ask_liquidity_ratio'] = ask / (volume + 1e-8)
    data['market_depth'] = bid + ask
    data['depth_imbalance'] = data['market_depth'] - volume

    # Plain ratios
    data['buy_sell_ratio'] = buy / (sell + 1e-8)
    data['bid_ask_ratio'] = bid / (ask + 1e-8)
    data['volume_liquidity_ratio'] = volume / (bid + ask + 1e-8)

    # Interaction products
    data['buy_volume_product'] = buy * volume
    data['sell_volume_product'] = sell * volume
    data['bid_ask_product'] = bid * ask

    # product / sum of the two sides
    data['market_competition'] = (buy * sell) / ((buy + sell) + 1e-8)
    data['liquidity_competition'] = (bid * ask) / ((bid + ask) + 1e-8)

    total_activity = buy + sell + bid + ask
    data['market_activity'] = total_activity
    data['activity_concentration'] = volume / (total_activity + 1e-8)

    data['info_arrival_rate'] = (buy + sell) / (volume + 1e-8)
    data['market_making_intensity'] = (bid + ask) / (buy + sell + 1e-8)
    data['effective_spread_proxy'] = np.abs(buy - sell) / (volume + 1e-8)

    # Exponentially weighted mean of (buy - sell) over the rows in their stored
    # order, so the result depends on how the frame is sorted.
    lambda_decay = 0.95
    ofi = buy - sell
    data['order_flow_imbalance_ewm'] = ofi.ewm(alpha=1-lambda_decay).mean()

    data = data.replace([np.inf, -np.inf], np.nan)

    return data

In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (never instantiated here)."""

    # --- experiment setup ---
    seed = 42            # random seed passed to the Ridge model and the Optuna sampler
    n_folds = 5          # number of cross-validation folds
    target = "label"     # name of the target column

    # --- Optuna search for the Ridge ensemble ---
    n_optuna_trials = 500
    run_optuna = True    # False -> use the stored Ridge parameters instead of searching

    # --- competition files ---
    sample_sub_path = "/kaggle/input/drw-crypto-market-prediction/sample_submission.csv"
    test_path = "/kaggle/input/drw-crypto-market-prediction/test.parquet"
    train_path = "/kaggle/input/drw-crypto-market-prediction/train.parquet"

In [None]:
# Build the modelling frames from the cleaned training data and the raw test file.
selected_columns = X_FEATURES+["volume"]

# Explicit copies: feature engineering adds columns, and doing that on a plain
# column slice of another frame triggers pandas' chained-assignment warnings
# (hidden here because warnings are globally silenced).
# NOTE(review): `df_clean` was percentile-capped in the outlier cell, while the
# test frame is used as loaded — confirm this asymmetry is intended.
train = df_clean[selected_columns + [CFG.target]].copy()
test = pd.read_parquet(CFG.test_path).reset_index(drop=True)[selected_columns].copy()

# Apply feature engineering
train = feature_engineering(train)
test = feature_engineering(test)

# The raw quantities are only inputs to the engineered features; drop them.
to_remove = ["bid_qty", "ask_qty", "buy_qty", "sell_qty", "volume"]

train = train.drop(columns=to_remove)
test = test.drop(columns=to_remove)

train = reduce_mem_usage(train, "train")
test = reduce_mem_usage(test, "test")

X = train.drop(CFG.target, axis=1)
y = train[CFG.target]
X_test = test


# Training Base Models

In [None]:
def pearsonr(y_true, y_pred):
    """Return only the Pearson correlation coefficient of the two inputs.

    Thin wrapper around `scipy.stats.pearsonr` (imported as `pr`) that drops
    the p-value, so it can be used as a single-number metric.
    """
    result = pr(y_true, y_pred)
    correlation = result[0]
    return correlation

# Optuna training Next 

In [None]:
# Fixed hyperparameters for the three base models; each dict is unpacked into
# the corresponding regressor's constructor further down.
# NOTE(review): the values look like the output of an earlier hyperparameter
# search — record where they came from.

# LightGBM with standard gradient boosting ("gbdt").
# NOTE(review): in LightGBM row subsampling is presumably only active when
# `subsample_freq` is also set; it is not set here, so "subsample" may have no
# effect — confirm against the LightGBM parameter docs.
lgbm_params = {
    "boosting_type": "gbdt",
    "colsample_bytree": 0.5625888953382505,
    "learning_rate": 0.029312951475451557,
    "min_child_samples": 63,
    "min_child_weight": 0.11456572852335424,
    "n_estimators": 126,
    "n_jobs": -1,
    "num_leaves": 37,
    "random_state": 42,
    "reg_alpha": 85.2476527854083,
    "reg_lambda": 99.38305361388907,
    "subsample": 0.450669817684892,
    "verbose": -1
}

# LightGBM with the "goss" boosting type.
lgbm_goss_params = {
    "boosting_type": "goss",
    "colsample_bytree": 0.34695458228489784,
    "learning_rate": 0.031023014900595287,
    "min_child_samples": 30,
    "min_child_weight": 0.4727729225033618,
    "n_estimators": 220,
    "n_jobs": -1,
    "num_leaves": 58,
    "random_state": 42,
    "reg_alpha": 38.665994901468224,
    "reg_lambda": 92.76991677464294,
    "subsample": 0.4810891284493255,
    "verbose": -1
}

# XGBoost: many trees (1667) with strong regularisation and a very small row
# subsample value.
xgb_params = {
    "colsample_bylevel": 0.4778015829774066,
    "colsample_bynode": 0.362764358742407,
    "colsample_bytree": 0.7107423488010493,
    "gamma": 1.7094857725240398,
    "learning_rate": 0.02213323588455387,
    "max_depth": 20,
    "max_leaves": 12,
    "min_child_weight": 16,
    "n_estimators": 1667,
    "n_jobs": -1,
    "random_state": 42,
    "reg_alpha": 39.352415706891264,
    "reg_lambda": 75.44843704068275,
    "subsample": 0.06566669853471274,
    "verbosity": 0
}

In [None]:
# Per-model bookkeeping, keyed by model display name:
#   fold_scores / overall_scores -> cross-validation metrics
#   oof_preds / test_preds       -> predictions used later for stacking
fold_scores, overall_scores = {}, {}
oof_preds, test_preds = {}, {}

# LightGBM (gbdt)

In [None]:
# Cross-validated training of the gbdt LightGBM model.
# The fold count comes from CFG.n_folds rather than a repeated literal.
# shuffle=False keeps the folds as contiguous blocks of rows in stored order.
lgbm_trainer = Trainer(
    LGBMRegressor(**lgbm_params),
    cv=KFold(n_splits=CFG.n_folds, shuffle=False),
    metric=pearsonr,
    task="regression",
    metric_precision=6
)

lgbm_trainer.fit(X, y)

# Record the metrics and predictions under the model's display name.
# NOTE(review): `oof_preds` is presumably the out-of-fold prediction vector
# aligned with `y` — confirm against the koolbox Trainer documentation.
fold_scores["LightGBM (gbdt)"] = lgbm_trainer.fold_scores
overall_scores["LightGBM (gbdt)"] = [pearsonr(lgbm_trainer.oof_preds, y)]
oof_preds["LightGBM (gbdt)"] = lgbm_trainer.oof_preds
test_preds["LightGBM (gbdt)"] = lgbm_trainer.predict(X_test)

# LightGBM (goss)

In [None]:
# Cross-validated training of the goss LightGBM model.
# The fold count comes from CFG.n_folds rather than a repeated literal.
# shuffle=False keeps the folds as contiguous blocks of rows in stored order.
lgbm_goss_trainer = Trainer(
    LGBMRegressor(**lgbm_goss_params),
    cv=KFold(n_splits=CFG.n_folds, shuffle=False),
    metric=pearsonr,
    task="regression",
    metric_precision=6
)

lgbm_goss_trainer.fit(X, y)

# Record the metrics and predictions under the model's display name.
fold_scores["LightGBM (goss)"] = lgbm_goss_trainer.fold_scores
overall_scores["LightGBM (goss)"] = [pearsonr(lgbm_goss_trainer.oof_preds, y)]
oof_preds["LightGBM (goss)"] = lgbm_goss_trainer.oof_preds
test_preds["LightGBM (goss)"] = lgbm_goss_trainer.predict(X_test)

# XGBoost

In [None]:
# Cross-validated training of the XGBoost model.
# The fold count comes from CFG.n_folds rather than a repeated literal.
# shuffle=False keeps the folds as contiguous blocks of rows in stored order.
xgb_trainer = Trainer(
    XGBRegressor(**xgb_params),
    cv=KFold(n_splits=CFG.n_folds, shuffle=False),
    metric=pearsonr,
    task="regression",
    metric_precision=6
)

xgb_trainer.fit(X, y)

# Record the metrics and predictions under the model's display name.
fold_scores["XGBoost"] = xgb_trainer.fold_scores
overall_scores["XGBoost"] = [pearsonr(xgb_trainer.oof_preds, y)]
oof_preds["XGBoost"] = xgb_trainer.oof_preds
test_preds["XGBoost"] = xgb_trainer.predict(X_test)

# Ensembling with Ridge

In [None]:
def plot_weights(weights, title, model_names=None):
    """
    Draw a horizontal bar chart of ensemble weights, largest first.

    Parameters
    ----------
    weights : array-like of shape (1, n_models)
        Only the first row is plotted.
    title : str
        Figure title.
    model_names : sequence of str, optional
        One label per weight, in the same order as the columns of `weights`.
        Defaults to the keys of the notebook-level `oof_preds` dictionary,
        which is what the function always used before this parameter existed.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of weights.
    """
    weights = np.asarray(weights)
    if model_names is None:
        model_names = list(oof_preds.keys())
    model_names = np.array(list(model_names))

    if len(model_names) != weights.shape[1]:
        raise ValueError(
            f"Got {len(model_names)} model names for {weights.shape[1]} weights"
        )

    # Sort models by weight, descending
    sorted_indices = np.argsort(weights[0])[::-1]
    sorted_coeffs = weights[0][sorted_indices]
    sorted_model_names = model_names[sorted_indices]

    plt.figure(figsize=(10, weights.shape[1] * 0.5))
    ax = sns.barplot(x=sorted_coeffs, y=sorted_model_names, palette="RdYlGn_r")

    # Value labels: to the right of positive bars, to the left of negative ones
    for i, value in enumerate(sorted_coeffs):
        if value >= 0:
            ax.text(value, i, f"{value:.3f}", va="center", ha="left", color="black")
        else:
            ax.text(value, i, f"{value:.3f}", va="center", ha="right", color="black")

    # Widen the x-axis by 10% on each side so the labels are not clipped
    xlim = ax.get_xlim()
    ax.set_xlim(xlim[0] - 0.1 * abs(xlim[0]), xlim[1] + 0.1 * abs(xlim[1]))

    plt.title(title)
    plt.xlabel("")
    plt.ylabel("")
    plt.tight_layout()
    plt.show()

In [None]:
# Stacking inputs: one column per base model.
# NOTE: this rebinds `X` and `X_test`, which until now held the engineered
# features. Re-running a base-model cell after this one would train on the
# prediction columns instead — restart the kernel and run top to bottom.
X = pd.DataFrame(oof_preds)
X_test = pd.DataFrame(test_preds)

In [None]:
# Persist the stacking inputs to the working directory for later reuse.
# joblib files are pickles: only load them back from a trusted source.
joblib.dump(X, "oof_preds.pkl")
joblib.dump(X_test, "test_preds.pkl")

In [None]:
def objective(trial):
    """
    Optuna objective: cross-validated Pearson correlation of a Ridge blender.

    Samples `alpha`, `tol`, `fit_intercept` and `positive`, fits a Ridge model
    on the notebook-level `X` (base-model predictions) and `y` with
    CFG.n_folds contiguous folds, and returns the Pearson correlation between
    the trainer's `oof_preds` and `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Correlation score to maximise.
    """
    params = {
        "random_state": CFG.seed,
        "alpha": trial.suggest_float("alpha", 0, 1),
        "tol": trial.suggest_float("tol", 1e-6, 1e-2),
        "fit_intercept": trial.suggest_categorical("fit_intercept", [True, False]),
        "positive": trial.suggest_categorical("positive", [True, False])
    }

    trainer = Trainer(
        Ridge(**params),
        # Fold count from the shared config, matching the other CFG lookups here.
        cv=KFold(n_splits=CFG.n_folds, shuffle=False),
        metric=pearsonr,
        task="regression",
        verbose=False
    )
    trainer.fit(X, y)

    return pearsonr(trainer.oof_preds, y)

# Choose the Ridge blender's hyperparameters: search with Optuna, or fall back
# to a stored configuration when CFG.run_optuna is False.
if CFG.run_optuna:
    sampler = optuna.samplers.TPESampler(seed=CFG.seed, multivariate=True, n_startup_trials=CFG.n_optuna_trials // 10)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    # Run trials sequentially. With n_jobs=-1 the trials finish in a
    # scheduling-dependent order, so the seeded sampler would no longer give
    # the same suggestions (and best parameters) from run to run.
    # catch=(ValueError,) lets the study continue past trials that raise ValueError.
    study.optimize(objective, n_trials=CFG.n_optuna_trials, n_jobs=1, catch=(ValueError,))
    best_params = study.best_params

    ridge_params = {
        "random_state": CFG.seed,
        "alpha": best_params["alpha"],
        "tol": best_params["tol"],
        "fit_intercept": best_params["fit_intercept"],
        "positive": best_params["positive"]
    }
else:
    # Stored configuration used when the search is skipped.
    ridge_params = {
        "random_state": CFG.seed,
        'alpha': 0.9999400497591444, 'tol': 0.00943871628637141, 'fit_intercept': False, 'positive': True}

In [None]:
# Final blender: Ridge fitted on the base models' predictions (`X`), with the
# fold count taken from CFG.n_folds rather than a repeated literal.
# NOTE(review): the Ridge settings were selected on these same out-of-fold
# predictions, so the score recorded below is likely optimistic.
ridge_trainer = Trainer(
    Ridge(**ridge_params),
    cv=KFold(n_splits=CFG.n_folds, shuffle=False),
    metric=pearsonr,
    task="regression",
    metric_precision=6
)

ridge_trainer.fit(X, y)

fold_scores["Ridge (ensemble)"] = ridge_trainer.fold_scores
overall_scores["Ridge (ensemble)"] = [pearsonr(ridge_trainer.oof_preds, y)]
ridge_test_preds = ridge_trainer.predict(X_test)

In [None]:
# Average the Ridge coefficients over the fold models and plot them as a
# single row (shape (1, n_models)), the layout plot_weights expects.
fold_models = ridge_trainer.estimators
coef_total = np.zeros((1, X.shape[1]))
coef_total = coef_total + sum(model.coef_ for model in fold_models)
ridge_coeffs = coef_total / len(fold_models)

plot_weights(ridge_coeffs, "Ridge Coefficients")

In [None]:
# Stand-alone submission built from the XGBoost test predictions.
sub = pd.read_csv(CFG.sample_sub_path).assign(prediction=test_preds["XGBoost"])
sub.to_csv("xgb.csv", index=False)
sub.head()

In [None]:
# Ensemble submission. The column must hold the Ridge blend
# (`ridge_test_preds`); the original assigned the whole `test_preds` dict of
# per-model arrays, which does not produce the blended predictions.
# The sample file is re-read so this cell does not depend on the previous one.
sub = pd.read_csv(CFG.sample_sub_path)
sub["prediction"] = ridge_test_preds
sub.to_csv(f"sub_ridge_{overall_scores['Ridge (ensemble)'][0]:.6f}.csv", index=False)
sub.head()

In [None]:
# Display the raw per-fold scores collected for every model so far.
fold_scores

# Results

In [None]:
# Tabulate the scores: one column per model for the fold scores, one row per
# model (best first) for the overall scores.
fold_scores = pd.DataFrame(fold_scores)
# `overall_scores` is a plain dict of one-element lists, so it has to be
# converted before `.index` / `.values` can be used (the conversion had been
# commented out). The result gets its own name so re-running this cell does
# not transpose an already-converted table.
overall_scores_df = pd.DataFrame(overall_scores).transpose().sort_values(by=0, ascending=False)
order = overall_scores_df.index.tolist()

# x-axis limits for the bar chart: pad the score range by half its width
min_score = overall_scores_df.values.flatten().min()
max_score = overall_scores_df.values.flatten().max()
padding = (max_score - min_score) * 0.5
lower_limit = min_score - padding
upper_limit = max_score + padding

fig, axs = plt.subplots(1, 2, figsize=(15, fold_scores.shape[1] * 0.5))

# Left: distribution of fold scores per model, in the same order as the bars
boxplot = sns.boxplot(data=fold_scores, order=order, ax=axs[0], orient="h", color="grey")
axs[0].set_title(f"Fold Score")
axs[0].set_xlabel("")
axs[0].set_ylabel("")

# Right: overall out-of-fold score per model
barplot = sns.barplot(x=overall_scores_df.values.flatten(), y=overall_scores_df.index, ax=axs[1], color="grey")
axs[1].set_title(f"Overall Score")
axs[1].set_xlabel("")
axs[1].set_xlim(left=lower_limit, right=upper_limit)
axs[1].set_ylabel("")

# Highlight the ensemble in both panels and print each score next to its bar
for i, (score, model) in enumerate(zip(overall_scores_df.values.flatten(), overall_scores_df.index)):
    color = "cyan" if "ensemble" in model.lower() else "grey"
    barplot.patches[i].set_facecolor(color)
    boxplot.patches[i].set_facecolor(color)
    barplot.text(score, i, round(score, 6), va="center")

plt.tight_layout()
plt.show()