# Required imports

In [1]:
import os 
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedShuffleSplit

from helper_functions import *
from opus_eng_fra_features import *

# Plot settings

In [2]:
MARKERSIZE=10
FONT_SIZE = 18
plt.rc('xtick', labelsize=FONT_SIZE)
plt.rc('ytick', labelsize=FONT_SIZE)
plt.rc('font', size=14)
plt.rc('axes', labelsize=FONT_SIZE)

%matplotlib widget

# Load data

In [3]:
# Project root is one level above the notebook's working directory;
# model evaluation artefacts live in its "model_eval/" sub-folder.
BASE_DIR = os.getcwd() + "/../"
MODEL_EVAL_DIR = f"{BASE_DIR}model_eval/"

In [4]:
# File name of the evaluation table, split by the settings it encodes.
general_fid_file = (
    "general_fid-finetune_data"
    "-dataset_opus_eng_fra"
    "-timeInterval_10000"
    "-timeIntervalType_sentence"
    "-finetuneType_base.csv"
)
# Read the table from the model-evaluation directory.
general_fid = pd.read_csv(f"{MODEL_EVAL_DIR}{general_fid_file}")

In [None]:
# Show every column name of the loaded table as a plain list.
general_fid.columns.tolist()

# Experimental settings

In [6]:
# Features fed to the predictor itself.
FIP_FEATURE_SET = BASIC_FEATURES + GENERIC_SYS_PERF_FEATURES + GENERIC_CONTENT_AWARE_FEATURES
# Same features preceded by the two fine-tuning identifier columns,
# which the sampling code uses to select rows.
FEATURE_SET = ['curr_finetune', 'prev_finetune'] + FIP_FEATURE_SET

TARGET_METRIC = 'comet22'
FIP_MODEL_TYPE = 'rf'

# Number of random seeds (repetitions) per sampling configuration.
NUM_SEEDS = 10

# Get original train/val/test sets

In [None]:
# Display the BASIC_FEATURES columns as they appear in the loaded table.
general_fid.loc[:, BASIC_FEATURES].columns

In [None]:
# Column holding the quantity to predict for the chosen metric.
target = "delta-target_test_set_" + TARGET_METRIC

# Original train / validation / test partitions of the evaluation table.
train_split, val_split, test_split = get_opusEngFra_splits(general_fid)

# Feature matrix and target vector for each partition.
x_train, y_train = train_split[FEATURE_SET], train_split[target]
x_val, y_val = val_split[FEATURE_SET], val_split[target]
x_test, y_test = test_split[FEATURE_SET], test_split[target]

In [35]:
# Write the basic + system-performance features together with the target
# of the train and test partitions to CSV files in the working directory.
export_columns = BASIC_FEATURES + GENERIC_SYS_PERF_FEATURES + [target]

for split_name, split_frame in (("train", train_split), ("test", test_split)):
    split_frame[export_columns].to_csv(
        f"{split_name}-opus_eng_fra-generic_fip-basic_sys_perf_features.csv",
        index=False,
    )

In [None]:
# Inspect which columns ended up in the training feature matrix.
x_train.columns

# Sampling strategies

In [9]:
# Accumulators filled by the sampling experiments below:
#   results          -> one aggregated record per sampling configuration
#   boxplot_res_dict -> individual per-run records
#   counter          -> next free key in `results`
results = dict()
boxplot_res_dict = dict()
counter = 0

In [None]:
# Display the training feature matrix for a quick visual check.
x_train

## Uniform random sampling

In [11]:
def create_random_sample(X, y, sample_size, sub_sample_type, seed=0):
    """Draw a reproducible uniform random sub-sample of a training set.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows. When sampling by fine-tunings it must provide the
        ``curr_finetune`` and ``prev_finetune`` columns.
    y : pandas.Series
        Targets, indexed like ``X``.
    sample_size : int or float
        Percentage of the data to keep. ``100`` returns ``X`` and ``y``
        unchanged (no copy, no randomness).
    sub_sample_type : str
        If it contains ``'finetunings'``, ``sample_size`` percent of the
        distinct ``curr_finetune`` values are drawn and every row whose
        ``curr_finetune`` OR ``prev_finetune`` is among them is kept (so the
        number of rows kept is not a fixed percentage). Otherwise
        ``sample_size`` percent of the rows are drawn directly.
    seed : int, default 0
        Seed of the random draw.

    Returns
    -------
    tuple
        ``(X_sample, y_sample)`` sharing the same index. Both are empty when
        the requested percentage rounds down to zero items.
    """
    if sample_size == 100:
        return X, y

    # A private generator gives the same draws as seeding the global NumPy
    # RNG with `seed`, but leaves the caller's global random state untouched.
    rng = np.random.RandomState(seed)

    if 'finetunings' in sub_sample_type:
        # Select sample_size % of the fine-tunings to KEEP in the sample.
        all_finetunings = list(X.curr_finetune.unique())
        finetunings = rng.choice(
            all_finetunings,
            # multiply before dividing: (29 / 100) * 100 truncates to 28
            size=int(sample_size * len(all_finetunings) / 100),
            replace=False
        )

        x = X.loc[
            X.curr_finetune.isin(finetunings)
            | X.prev_finetune.isin(finetunings)
        ]
        return x, y.loc[x.index]

    # Otherwise keep sample_size % of the individual rows.
    indices = rng.choice(
        X.index,
        size=int(sample_size * len(X) / 100),
        replace=False
    )
    return X.loc[indices], y.loc[indices]


# Sample sizes to test, as percentages of the train-set (100 == full set).
sample_sizes = [0.1, 1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

# For every sampling granularity and sample size, train NUM_SEEDS models on
# differently-seeded sub-samples and evaluate each one on the fixed test set.
for sub_sample_type in ['finetunings', 'features']:
    print(f"[D] Sampling by {sub_sample_type}")
    for sample_size in sample_sizes:
        print(f"[D]\tsample-size={sample_size}")
        mapes = []
        maes = []
        r2s = []
        pccs = []
        sccs = []
        # sizes of the samples that were actually trained on
        train_sizes = []
        for seed in range(NUM_SEEDS):  # Repeat multiple times for reliability
            print(f"[D]\t\tseed: {seed}", end="\r", flush=True)
            X_sample, y_sample = create_random_sample(
                x_train, y_train, sample_size, sub_sample_type, seed
            )

            # Very small percentages can round down to nothing: skip the seed.
            if X_sample.empty:
                continue

            # Train the model on the sampled data
            clf = RandomForestRegressor(random_state=1, n_estimators=100, max_depth=10)
            clf.fit(X_sample[FIP_FEATURE_SET], y_sample)

            # Evaluate the model on the test set
            y_pred = clf.predict(x_test[FIP_FEATURE_SET])
            mape, mae, r2, pcc, scc = compute_metrics(
                ground_truth=y_test,
                preds=y_pred,
                verbose=False,
            )

            mapes.append(mape)
            maes.append(mae)
            r2s.append(r2)
            pccs.append(round(pcc*100, 3))
            sccs.append(round(scc*100, 3))
            train_sizes.append(len(X_sample))

            # Keep the individual run as well, so the spread over seeds can
            # be plotted later. The running length is used as key so that
            # entries never overwrite each other.
            boxplot_res_dict[len(boxplot_res_dict)] = {
                'seed': seed,
                'sampling': 'uniform',
                'sub_sample_type': sub_sample_type,
                'sample_size': sample_size,
                'train_size': len(X_sample),
                'test_size': len(x_test),
                'mape': mape,
                'mae': mae,
                'pcc': round(pcc*100, 5),
                'scc': round(scc*100, 5)
            }

        # No seed produced a usable sample for this configuration.
        if len(pccs) == 0:
            continue

        results[counter] = {
            'sampling': 'uniform',
            'sub_sample_type': sub_sample_type,
            'sample_size': sample_size,
            # Average over the trained seeds: when sampling by fine-tunings
            # the number of rows differs from seed to seed, and the sample of
            # the last seed may even be an empty one that was skipped.
            'train_size': int(np.mean(train_sizes)),
            'test_size': len(x_test),
        }

        # Aggregate every metric over the seeds.
        for metric_name, metric in zip(['mape', 'mae', 'r2', 'pcc', 'scc'], [mapes, maes, r2s, pccs, sccs]):
            results[counter][f'avg-{metric_name}'] = np.mean(metric)
            results[counter][f'stdev-{metric_name}'] = np.std(metric)
            results[counter][f'90th-{metric_name}'] = np.percentile(metric, 90)
            results[counter][f'99th-{metric_name}'] = np.percentile(metric, 99)


        counter += 1

## Stratified random sampling

In [None]:
# Sample sizes to test, as percentages of the train-set.
# 99 is the largest value because the splitter needs a non-empty remainder.
sample_sizes = [0.1, 1, 5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 99]

# Bin the continuous target into equal-width buckets so that it can serve as
# the stratification label. This does not depend on the sample size, so it is
# computed once.
num_bins = 10
y_binned = pd.cut(y_train, bins=num_bins, labels=False)
num_strata = y_binned.nunique()

for sample_size in sample_sizes:
    mapes = []
    maes = []
    r2s = []
    pccs = []
    sccs = []

    # multiply before dividing: (29 / 100) * 100 truncates to 28
    subsample_size = int(sample_size * len(x_train) / 100)

    # StratifiedShuffleSplit refuses a split whose train or held-out part has
    # fewer samples than there are strata; skip such sizes instead of
    # aborting the whole experiment.
    if subsample_size < num_strata or len(x_train) - subsample_size < num_strata:
        print(f"[D] skipping sample-size={sample_size}: too few rows to stratify")
        continue

    # Define the stratified splitter
    # Here we use n_splits = NUM_SEEDS
    # to create NUM_SEEDS different sub-samples
    stratified_splitter = StratifiedShuffleSplit(
        n_splits=NUM_SEEDS,
        train_size=subsample_size,
        random_state=42
    )

    # Create stratified sub-samples; the held-out part of each split
    # (test_index) is not used, evaluation happens on the fixed test set.
    for split_idx, (train_index, test_index) in enumerate(
        stratified_splitter.split(x_train, y_binned)
    ):
        X_train_subsample = x_train.iloc[train_index]
        y_train_subsample = y_train.iloc[train_index]

        # Train on the predictor features only: the fine-tuning identifier
        # columns that are part of FEATURE_SET are not model inputs.
        clf = RandomForestRegressor(random_state=1, n_estimators=100, max_depth=10)
        clf.fit(X_train_subsample[FIP_FEATURE_SET], y_train_subsample)

        # Evaluate the model on the test set
        y_pred = clf.predict(x_test[FIP_FEATURE_SET])
        mape, mae, r2, pcc, scc = compute_metrics(
            ground_truth=y_test,
            preds=y_pred,
            verbose=False,
        )

        mapes.append(mape)
        maes.append(mae)
        r2s.append(r2)
        pccs.append(round(pcc*100, 3))
        sccs.append(round(scc*100, 3))

        # Per-split record, described only by values of this loop. The
        # running length is used as key so that entries never overwrite each
        # other.
        boxplot_res_dict[len(boxplot_res_dict)] = {
            'seed': split_idx,
            'sampling': 'stratified',
            # own label, so that these rows are not merged with the
            # 'finetunings' / 'features' groups when grouped by this column
            'sub_sample_type': 'stratified',
            'sample_size': sample_size,
            'train_size': len(X_train_subsample),
            'test_size': len(x_test),
            'mape': mape,
            'mae': mae,
            'pcc': round(pcc*100, 5),
            'scc': round(scc*100, 5)
        }

    results[counter] = {
        'sampling': 'stratified',
        'sample_size': sample_size,
        'train_size': len(X_train_subsample),
        'test_size': len(x_test),
    }

    # Aggregate every metric over the splits.
    for metric_name, metric in zip(['mape', 'mae', 'r2', 'pcc', 'scc'], [mapes, maes, r2s, pccs, sccs]):
        results[counter][f'avg-{metric_name}'] = np.mean(metric)
        results[counter][f'stdev-{metric_name}'] = np.std(metric)
        results[counter][f'90th-{metric_name}'] = np.percentile(metric, 90)
        results[counter][f'99th-{metric_name}'] = np.percentile(metric, 99)


    counter += 1

## Results

In [12]:
# One row per sampling configuration, one column per recorded field.
res_df = pd.DataFrame(results).T

In [None]:
# Show every column name of the results table as a plain list.
res_df.columns.tolist()

In [None]:
# Results overview: configuration columns, then mean / standard deviation
# and 90th percentile of MAE and PCC. The scc, r2, mape and 99th-percentile
# columns also exist in res_df and can be appended to the list when needed.
res_df.loc[:, [
    # configuration
    'sampling',
    'sub_sample_type',
    'sample_size',
    'train_size',
    'test_size',

    # mean and spread over the repetitions
    'avg-mae',
    'stdev-mae',
    'avg-pcc',
    'stdev-pcc',

    # 90th percentile over the repetitions
    '90th-mae',
    '90th-pcc',
]]

### Bar plot

In [None]:
# Distinct sample sizes present in the results table.
res_df["sample_size"].unique()

In [None]:
# Metrics to plot, one figure each.
COLS = [
    'avg-mae',
    'avg-pcc',
    '90th-mae',
    '90th-pcc',
]

# Subset of sample sizes shown on the x axis.
SAMPLE_SIZES = [
    5, 10, 30, 50, 70, 90, 100
]

for x_col in COLS:
    fig, ax = plt.subplots(1, 1, figsize=(8, 5))

    # Rows that carry no sub_sample_type cannot be assigned to a hue group;
    # leaving them out keeps the hue count (and the palette size) exact.
    plot_data = res_df.loc[
        res_df.sample_size.isin(SAMPLE_SIZES)
        & res_df.sub_sample_type.notna()
    ].copy()

    # Fix the bar order explicitly so bars can be mapped back to the data.
    size_order = sorted(plot_data.sample_size.unique())
    hue_levels = list(plot_data.sub_sample_type.unique())

    sns_plot = sns.barplot(
        data=plot_data,
        x='sample_size',
        y=x_col,
        hue='sub_sample_type',
        order=size_order,
        hue_order=hue_levels,
        ax=ax,
        palette=sns.color_palette(
            palette='gist_heat',
            n_colors=len(hue_levels)
        )
    )

    if 'avg' in x_col:
        # Standard deviation belonging to each (sample size, hue) pair.
        stdev_col = f"stdev-{x_col.split('-')[1]}"
        stdev_by_bar = {
            (size, kind): err
            for size, kind, err in zip(
                plot_data.sample_size,
                plot_data.sub_sample_type,
                plot_data[stdev_col],
            )
        }

        # Look every bar up by what it represents rather than by its position
        # in ax.patches, so the result does not depend on the order or number
        # of patches on the axes, nor on the row order of plot_data.
        # seaborn is expected to create one bar container per hue level, in
        # hue_order; the list is taken before any error bar is added because
        # ax.errorbar registers containers of its own.
        bar_containers = [c for c in ax.containers if hasattr(c, 'patches')]

        for hue_level, container in zip(hue_levels, bar_containers):
            for patch in container.patches:
                bar_height = patch.get_height()
                if not np.isfinite(bar_height):
                    # placeholder bar of a missing combination
                    continue

                # Calculate the center of each bar; the bars of the k-th
                # category are grouped around x == k.
                bar_center = patch.get_x() + patch.get_width() / 2
                category_index = int(round(bar_center))
                if not 0 <= category_index < len(size_order):
                    continue

                # Get the corresponding error value
                error = stdev_by_bar.get((size_order[category_index], hue_level))
                if error is None:
                    continue

                # Add error bars
                ax.errorbar(
                    bar_center,
                    bar_height,
                    yerr=float(error),
                    fmt='none',
                    c='black',
                    capsize=5
                )

    ax.set_ylabel(x_col.capitalize().replace("-", " "))
    ax.set_xlabel("Sample size")

    ax.legend(
        title='Sub-sample type',
        frameon=False,
    )

    plt.tight_layout()

### Box plot

In [None]:
# One row per recorded run, one column per recorded field.
boxplot_res = pd.DataFrame(boxplot_res_dict).T

In [None]:
# Runs recorded for the full (100 %) sample size.
# Add `& (boxplot_res["sub_sample_type"] == 'features')` to the mask to
# narrow the view to one sub-sample type.
boxplot_res[boxplot_res["sample_size"] == 100]

In [None]:
# Subset of sample sizes shown on the x axis.
SAMPLE_SIZES = [5, 10, 30, 50, 70, 90, 100]

# One figure per metric: distribution of the per-run values for every
# sample size, split by sub-sample type.
for y_col in ('pcc', 'mae', 'mape', 'scc'):
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 5))

    plot_data = boxplot_res[boxplot_res["sample_size"].isin(SAMPLE_SIZES)].copy()

    # one colour per distinct sub-sample type
    hue_palette = sns.color_palette(
        palette='gist_heat',
        n_colors=len(plot_data["sub_sample_type"].unique()),
    )

    sns.boxplot(
        data=plot_data,
        x="sample_size",
        y=y_col,
        hue="sub_sample_type",
        ax=ax,
        palette=hue_palette,
    )

    ax.set_xlabel("Sample size")
    ax.set_ylabel(y_col.upper())
    ax.legend(title='Sub-sample type', frameon=False)

    plt.tight_layout()