# Required imports

In [1]:
import os 

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

from helper_functions import *
from hk_news_features import *

%matplotlib widget

# Plot settings

In [2]:
# Global matplotlib text sizing shared by every figure in this notebook.
MARKERSIZE = 10
FONT_SIZE = 18

# Tick labels on both axes use the large font size.
for tick_axis in ('xtick', 'ytick'):
    plt.rc(tick_axis, labelsize=FONT_SIZE)

# Base font stays smaller; axis labels match the tick labels.
plt.rc('font', size=14)
plt.rc('axes', labelsize=FONT_SIZE)

%matplotlib widget

# Load data

In [3]:
# Parent of the current working directory, kept with a trailing slash so
# sub-paths can be appended by plain string concatenation.
BASE_DIR = os.getcwd() + "/../"
# Directory holding the model-evaluation CSV files read below.
MODEL_EVAL_DIR = f"{BASE_DIR}model_eval/"

In [4]:
# Read the fine-tuning dataset CSV and let the project helper fix the dtypes
# of its time columns.
fid_file = "fid-finetune_data-dataset_hk-news-timeInterval_1-timeIntervalType_time-finetuneType_base.csv"
fid = set_type_of_time_cols(pd.read_csv(MODEL_EVAL_DIR + fid_file))

# Derive the "delta" columns: target value minus current value.
for metric in METRICS:
    curr_col = f'curr_{metric}'

    # One delta per tactic, measured against the metric's current value.
    for tactic in TACTICS:
        target_col = f'target_{metric}_{tactic}'
        fid[f'delta-{target_col}'] = fid[target_col] - fid[curr_col]

    # One delta per test set, measured against that test set's current value.
    for test_set in HK_NEWS_TEST_SETS:
        key = f"{test_set}_{metric}"
        fid["delta-target_" + key] = fid["target_" + key] - fid["curr_" + key]

In [6]:
# Read the "general" fine-tuning dataset CSV and normalise its time columns
# with the same project helper used for `fid`.
general_fid_file = "general_fid-finetune_data-dataset_hk-news-timeInterval_1-timeIntervalType_time-finetuneType_base.csv"
general_fid = set_type_of_time_cols(pd.read_csv(MODEL_EVAL_DIR + general_fid_file))

In [None]:
# Show the identifier columns together with the basic and generic
# system-performance feature columns of the general dataset.
preview_cols = ['test-set', 'curr_finetune', 'prev_finetune'] + BASIC_FEATURES + GENERIC_SYS_PERF_FEATURES
general_fid[preview_cols]

# Experimental settings

In [8]:
# Default feature columns; used as the default `feature_set` argument of
# `test_uniform_random_sampling` defined below.
FEATURE_SET = ['curr_finetune', 'prev_finetune'] + BASIC_FEATURES + GENERIC_SYS_PERF_FEATURES
# NOTE(review): not referenced by any cell visible in this notebook; the
# sampling experiment below always builds a RandomForestRegressor.
FIP_MODEL_TYPE = 'rf'

# Default number of seeded repetitions per sample size (default `num_seeds`).
NUM_SEEDS = 10

# Sampling strategies

In [9]:
# Accumulators shared by the sampling experiments below:
#   results          -> one aggregated record per experiment configuration
#   boxplot_res_dict -> one record per individual seeded run
results, boxplot_res_dict = {}, {}
counter = 0

## Uniform random sampling

In [10]:
def create_random_sample(X, y, sample_size, sub_sample_type, seed=0):
    """Draw a reproducible uniform random sub-sample of a training set.

    Parameters
    ----------
    X : pd.DataFrame
        Training rows. Must have a ``curr_finetune`` column and, when sampling
        by fine-tunings, a ``prev_finetune`` column.
    y : pd.Series
        Targets, aligned with ``X`` by index label.
    sample_size : int or float
        Percentage of the data to keep. ``100`` returns ``X`` and ``y``
        unchanged without drawing anything.
    sub_sample_type : str
        If it contains ``'finetunings'``, ``sample_size`` percent of the
        distinct ``curr_finetune`` values are drawn and only rows whose
        ``curr_finetune`` AND ``prev_finetune`` both belong to the drawn set
        are kept. Otherwise ``sample_size`` percent of the individual rows
        are drawn.
    seed : int
        Seed of the random draw.

    Returns
    -------
    tuple
        ``(X_sample, y_sample, num_finetunings)``. ``num_finetunings`` is the
        number of fine-tunings drawn (fine-tuning sampling) or the number of
        distinct ``curr_finetune`` values present in the sample (row sampling
        and the ``sample_size == 100`` shortcut). Every branch returns three
        values so callers can always unpack the result the same way.
    """
    if sample_size == 100:
        return X, y, len(X.curr_finetune.unique())

    # A private generator keeps the draw reproducible without re-seeding the
    # global NumPy RNG. RandomState(seed).choice yields the same draws as
    # np.random.seed(seed) followed by np.random.choice.
    rng = np.random.RandomState(seed)
    fraction = sample_size / 100

    if 'finetunings' in sub_sample_type:
        # Select sample_size % of the fine-tunings to KEEP in the training set.
        all_finetunings = list(X.curr_finetune.unique())
        finetunings = rng.choice(
            all_finetunings,
            size=int(fraction * len(all_finetunings)),
            replace=False,
        )

        # A row survives only if both of its fine-tunings were drawn.
        keep = X.curr_finetune.isin(finetunings) & X.prev_finetune.isin(finetunings)
        x = X.loc[keep].copy()

        return x, y.loc[x.index], len(finetunings)

    # Row-level sampling: draw sample_size % of the rows by index label.
    indices = rng.choice(
        X.index,
        size=int(fraction * len(X)),
        replace=False,
    )
    x = X.loc[indices]

    return x, y.loc[indices], len(x.curr_finetune.unique())

In [11]:
def test_uniform_random_sampling(
    fid: pd.DataFrame,
    target: str,
    boxplot_res_dict: dict,
    results: dict,
    target_metric: str,
    sample_sizes: list,
    sub_sample_types: list = ['finetunings', 'features'],
    feature_set: list = FEATURE_SET,
    num_seeds: int = NUM_SEEDS,
):
    """Measure how a random-forest predictor degrades with less training data.

    For every sub-sampling type and every sample size, the training split is
    sub-sampled ``num_seeds`` times with ``create_random_sample`` (seeds
    ``0 .. num_seeds - 1``). A ``RandomForestRegressor`` is fitted on each
    sample and scored on the fixed test split with ``compute_metrics``.

    Parameters
    ----------
    fid : pd.DataFrame
        Dataset handed to ``get_hkNews_splits`` to obtain the splits.
    target : str
        Name of the column to predict. A name containing ``'test_set'`` is
        recorded as the 'generic' predictor; otherwise the run is recorded as
        'specific' and the test-set name is the second ``_``-separated token
        of ``target``.
    boxplot_res_dict : dict
        Mutated in place: receives one record per individual seeded run.
    results : dict
        Mutated in place: receives one aggregated record per
        (sub-sample type, sample size) pair, keyed from ``len(results)``.
    target_metric : str
        Metric name stored in the records (not used for computation).
    sample_sizes : list
        Percentages of the training split to sample.
    sub_sample_types : list
        Sampling modes forwarded to ``create_random_sample``. The default
        list is only read, never mutated.
    feature_set : list
        Columns used as model inputs.
    num_seeds : int
        Number of seeded repetitions per configuration.

    Returns
    -------
    None
        All output goes into ``boxplot_res_dict`` and ``results``.
    """
    if 'test_set' in target:
        fip_type = 'generic'
        test_set = 'all-test-sets'
    else:
        fip_type = 'specific'
        test_set = target.split("_")[1]

    # Only the train and test splits are used; the validation split is ignored.
    train_split, _, test_split = get_hkNews_splits(
        fid,
        fixed_test_set=True,
    )

    x_train = train_split
    y_train = train_split[target]
    x_test = test_split
    y_test = test_split[target]

    print(f"total num finetunings={len(x_train.curr_finetune.unique())}")

    counter = len(results)
    for sub_sample_type in sub_sample_types:
        print(f"[D] Sampling by {sub_sample_type}")
        for sample_size in sample_sizes:
            print(f"[D]\tsample-size={sample_size}")
            mapes = []
            maes = []
            r2s = []
            pccs = []
            sccs = []
            train_sizes = []
            total_finetunings = []
            for seed in range(num_seeds):  # Repeat multiple times for reliability
                print(f"[D]\t\tseed: {seed}", end="\r", flush=True)
                X_sample, y_sample, num_finetunings = create_random_sample(
                    x_train, y_train, sample_size, sub_sample_type, seed
                )

                # Tiny sample sizes can leave no rows at all; skip those runs.
                if X_sample.empty:
                    continue

                # Train the model on the sampled data
                clf = RandomForestRegressor(random_state=1, n_estimators=100, max_depth=10)
                clf.fit(X_sample[feature_set], y_sample)

                # Evaluate the model on the test set
                y_pred = clf.predict(x_test[feature_set])
                mape, mae, r2, pcc, scc = compute_metrics(
                    ground_truth=y_test,
                    preds=y_pred,
                    verbose=False,
                )

                mapes.append(mape)
                maes.append(mae)
                r2s.append(r2)
                pccs.append(round(pcc*100, 3))
                sccs.append(round(scc*100, 3))
                train_sizes.append(len(X_sample))
                total_finetunings.append(num_finetunings)

                # Append under the next free integer key. The previous key,
                # len(results)*num_seeds + seed, could overwrite earlier
                # records when calls used different num_seeds values.
                boxplot_res_dict[len(boxplot_res_dict)] = {
                    'seed': seed,
                    'sampling': 'uniform',
                    'fip_type': fip_type,
                    'test-set': test_set,
                    'sub_sample_type': sub_sample_type,
                    'sample_size': sample_size,
                    'num_finetunings': num_finetunings,
                    'target': target,
                    'target_metric': target_metric,
                    'train_size': len(X_sample),
                    'test_size': len(x_test),
                    'mape': mape,
                    'mae': mae,
                    'pcc': round(pcc*100, 5),
                    'scc': round(scc*100, 5)
                }

            # Every seed produced an empty sample: nothing to aggregate.
            if len(pccs) == 0:
                continue
            print(f"train_sizes={train_sizes}")
            results[counter] = {
                'sampling': 'uniform',
                'fip_type': fip_type,
                'test-set': test_set,
                'sub_sample_type': sub_sample_type,
                'sample_size': sample_size,
                'total_finetunings': np.mean(total_finetunings),
                'target': target,
                'target_metric': target_metric,
                'avg-train_size': np.mean(train_sizes),
                'std-train_size': np.std(train_sizes),
                'test_size': len(x_test),
            }

            # Summary statistics of each score across the seeded runs.
            for metric_name, metric in zip(['mape', 'mae', 'r2', 'pcc', 'scc'], [mapes, maes, r2s, pccs, sccs]):
                results[counter][f'avg-{metric_name}'] = np.mean(metric)
                results[counter][f'stdev-{metric_name}'] = np.std(metric)
                results[counter][f'90th-{metric_name}'] = np.percentile(metric, 90)
                results[counter][f'99th-{metric_name}'] = np.percentile(metric, 99)

            counter += 1

In [None]:
# Sample sizes to evaluate, expressed as percentages of the training set.
sample_sizes = [0.1, 1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# Only 'comet22' is evaluated; the author also listed 'chrf', 'comet22-qe'
# and 'sacrebleu' as candidates.
# The dataset loop variable is named `fip_data` rather than `fid` so this cell
# does not overwrite the notebook-level `fid` frame loaded earlier.
for metric in ['comet22']:
    for fip_type, fip_data in zip(['generic'], [general_fid]):
        print("#"*25 + f" {metric.upper()} " + "#"*25)

        if 'specific' in fip_type:
            # One predictor per test set, fed with the content-aware features.
            FIP_FEATURE_SET = (
                SENT_OVERLAP_FEATURES["new_data"] + EMBEDDING_FEATURES["new_data"]
                + SENT_OVERLAP_FEATURES["finetune_data"] + EMBEDDING_FEATURES["finetune_data"]
            )
            targets = [
                f"delta-target_{test_set}_{metric}" for test_set in HK_NEWS_TEST_SETS
            ]
        else:
            # A single generic predictor; GENERIC_CONTENT_AWARE_FEATURES are
            # deliberately left out of the feature set.
            FIP_FEATURE_SET = BASIC_FEATURES + GENERIC_SYS_PERF_FEATURES
            targets = [f"delta-target_test_set_{metric}"]

        for target in targets:
            test_uniform_random_sampling(
                fid=fip_data,
                target=target,
                boxplot_res_dict=boxplot_res_dict,
                results=results,
                target_metric=metric,
                sample_sizes=sample_sizes,
                # Only fine-tuning-level sampling is evaluated here.
                sub_sample_types=['finetunings'],
                feature_set=FIP_FEATURE_SET,
            )

## Results

In [19]:
# `results` maps a record id to a dict of fields; transpose so that each
# record becomes one row and each field one column.
res_df = pd.DataFrame(results).T

In [None]:
# List every field available in the aggregated results.
res_df.columns.tolist()

In [None]:
# Columns shown in the summary table: run identification, sample sizes, then
# the mean / standard deviation / 90th percentile of MAE and PCC.
# (scc, r2 and mape statistics and the 99th percentiles are also available.)
summary_cols = [
    'target', 'fip_type', 'test-set', 'target_metric',
    'sampling', 'sub_sample_type', 'sample_size',
    'avg-train_size', 'std-train_size', 'test_size',
    'avg-mae', 'stdev-mae', 'avg-pcc', 'stdev-pcc',
    '90th-mae', '90th-pcc',
]

# Restrict the table to fine-tuning-level sampling at a 10% sample size.
is_finetunings_at_10 = (
    res_df.sub_sample_type.eq('finetunings')
    & res_df.sample_size.eq(10)
)

res_df[summary_cols].loc[is_finetunings_at_10]

### Bar plot

In [None]:
# Statistics to plot, one figure per entry.
COLS = [
    'avg-mae',
    'avg-pcc',
    '90th-mae',
    '90th-pcc',
]

# Sample sizes (percent of the training set) shown on the x axis.
SAMPLE_SIZES = [
    5, 10, 30, 50, 70, 90, 100
]

# Column that colours the bars; the legend and the error bars follow it too.
bar_hue_col = 'test-set'

for stat_col in COLS:
    fig, ax = plt.subplots(1, 1, figsize=(8, 5))

    plot_data = res_df.loc[
        res_df.sample_size.isin(SAMPLE_SIZES)
    ].copy()

    # Pin the category and hue order explicitly so every bar can be mapped
    # back to its (sample size, hue level) pair when adding error bars.
    size_order = sorted(plot_data['sample_size'].unique())
    hue_order = list(plot_data[bar_hue_col].unique())
    num_hue_levels = len(hue_order)

    sns.barplot(
        data=plot_data,
        x='sample_size',
        y=stat_col,
        hue=bar_hue_col,
        order=size_order,
        hue_order=hue_order,
        ax=ax,
        palette=sns.color_palette(
            palette='gist_heat',
            n_colors=num_hue_levels
        ),
    )

    if 'avg' in stat_col:
        # Standard deviation of the same score, looked up per bar.
        # Assumes one results row per (sample size, hue level) pair; with
        # duplicates the last row wins.
        stdev_col = f"stdev-{stat_col.split('-')[1]}"
        stdev_by_bar = {
            (size, hue_level): err
            for size, hue_level, err in zip(
                plot_data['sample_size'], plot_data[bar_hue_col], plot_data[stdev_col]
            )
        }

        # Snapshot the bar containers first: ax.errorbar appends its own
        # containers to ax.containers. seaborn draws one container per hue
        # level, in hue order.
        bar_containers = list(ax.containers)[:num_hue_levels]
        for hue_level, container in zip(hue_order, bar_containers):
            for patch in container:
                height = patch.get_height()
                if np.isnan(height):
                    continue  # placeholder bar for a missing combination

                # Categorical bars are dodged around integer positions, so the
                # rounded centre is the index into size_order.
                bar_center = patch.get_x() + patch.get_width() / 2
                category_index = int(round(bar_center))
                if not 0 <= category_index < len(size_order):
                    continue

                error = stdev_by_bar.get((size_order[category_index], hue_level))
                if error is None:
                    continue

                ax.errorbar(
                    bar_center,
                    height,
                    yerr=error,
                    fmt='none',
                    c='black',
                    capsize=5
                )

    ax.set_ylabel(stat_col.capitalize().replace("-", " "))
    ax.set_xlabel("Sample size")

    # The legend describes the hue column, i.e. the test set.
    if 'pcc' in stat_col:
        # PCC figures get a framed legend above the axes.
        ax.legend(
            loc='upper left',
            bbox_to_anchor=(.2, 1.35),
            ncol=max(num_hue_levels, 1),
            title='Test set',
            frameon=True,
        )
    else:
        ax.legend(
            title='Test set',
            frameon=False,
        )

    plt.tight_layout()

### Box plot

In [25]:
# One row per individual seeded run (records are stored column-wise in the
# dict, hence the transpose).
boxplot_res = pd.DataFrame(boxplot_res_dict).T

In [None]:
# Per-seed records of the 'specific' predictors for comet22 at a 20% sample.
is_specific_comet22_at_20 = (
    boxplot_res.sample_size.eq(20)
    & boxplot_res.target_metric.eq('comet22')
    & boxplot_res.fip_type.eq('specific')
)
boxplot_res.loc[is_specific_comet22_at_20]

In [None]:
# For every target metric, plot each score against the number of fine-tunings
# in the training sample, with MAE overlaid on a secondary y axis.
for metric in boxplot_res.target_metric.unique():

    plot_data = boxplot_res.loc[
        (boxplot_res.target_metric == metric)
    ].copy()

    for y_col in ['pcc', 'mape', 'scc']:
        fig, ax = plt.subplots(1, 1, figsize=(8, 5))

        hue_col = 'fip_type'  # alternative grouping: 'test-set'
        n_hues = len(plot_data[hue_col].unique())

        # Primary axis: the score selected by y_col.
        sns.lineplot(
            data=plot_data,
            x="num_finetunings",
            y=y_col,
            hue=hue_col,
            ax=ax,
            palette=sns.color_palette(
                palette='gist_heat',
                n_colors=n_hues
            ),
        )

        # Secondary axis: MAE, drawn with a different palette.
        ax2 = ax.twinx()
        sns.lineplot(
            data=plot_data,
            x="num_finetunings",
            y='mae',
            hue=hue_col,
            ax=ax2,
            label=f'MAE-{plot_data[hue_col].unique()}',
            palette=sns.color_palette(
                palette='Paired',
                n_colors=n_hues
            ),
            legend=False,
        )

        ax.set_ylabel(y_col.upper())
        ax2.set_ylabel("MAE")
        ax.set_xlabel("Number of finetunings")

        # Custom legend combining both axes.
        lines1, labels1 = ax.get_legend_handles_labels()
        lines2, _ = ax2.get_legend_handles_labels()
        for line in lines2:
            line.set_linestyle("--")  # dashed lines mark the secondary (MAE) axis

        # Name the score actually plotted on the primary axis; the label used
        # to read "(PCC)" for the MAPE and SCC figures as well. The hue names
        # from the primary axis are reused for the MAE entries.
        combined_lines = lines1 + lines2
        combined_labels = (
            [f"{label} ({y_col.upper()})" for label in labels1]
            + [f"{label} (MAE)" for label in labels1]
        )

        # Position the legend in the middle right
        ax.legend(combined_lines, combined_labels, loc='center right')

        ax.set_title(f"{metric}")
        plt.tight_layout()