### 1. Settings

In [1]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
from tqdm import tqdm # type: ignore
from datetime import date

import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer
from utils.sampling import create_splits, prep_cv_validation
from utils.training import cv_cluster_set, cv_training_epochs_sets, test_cluster_set
from utils.training import results_dict_todf, cv_step_2, full_step_2, test_training_epochs_sets
from utils.inference import meta_predict, create_ensemble_map, ensemble_compute

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch

#####################################
############  CONSTANTS #############
#####################################

RS = 42
BATCH_SIZE = 16
DECODER_LENGTH = 15
ENCODER_LENGTH = 15
MODEL_NAME = "Salesforce/codet5-base-multi-sum"

FULL_TRAIN_ARGS = {
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [0, 1, ], #2, 5, 10],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 6e-6,
        "warmup_steps": 500,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
        "output_dir" : 'reports/results',
        "logging_dir" : "reports/logs",
    },
}

experiment_config = {
    "DATA_STR" : "20240721",
    "RS" : RS,
    "DRIFT_TYPE" : "sudden",
    "NFOLD" : 3,
    "FULL_TRAIN_ARGS" : FULL_TRAIN_ARGS,
    "MODEL_NAME" : MODEL_NAME,
    "CLUSTER_EPOCHS" : 2,
}
experiment_config["ANALYSIS_POSTFIX"] = f"mined_{experiment_config['DRIFT_TYPE']}_{str(date.today())}"
experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["output_dir"] += "/" + experiment_config["ANALYSIS_POSTFIX"] 
experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["logging_dir"] += "/" + experiment_config["ANALYSIS_POSTFIX"] 

if not os.path.exists(experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["logging_dir"]):
    os.mkdir(experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["logging_dir"])

if not os.path.exists(experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["output_dir"]):
    os.mkdir(experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["output_dir"])

tokenizer = AutoTokenizer.from_pretrained(experiment_config["MODEL_NAME"], skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(experiment_config["MODEL_NAME"])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. 

In [2]:
sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, train_size=100, test_size=25, cluster_id=4)
train_dataset, test_data, test_df, train_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"], sampling_dict["train_df"]

splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
                            experiment_config=experiment_config)

Train Data:  (100, 11)
Test Data:  (25, 11)
Train Data: Cluster cluster
2    57
3    36
4     4
1     3
Name: count, dtype: int64
Test Data: Cluster cluster
4    21
3     3
2     1
Name: count, dtype: int64


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 21851.02 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 23631.21 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1928.17 examples/s]
Filter: 100%|████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 10748.01 examples/s]
Filter: 100%|█████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 9258.13 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████████████████| 25/25 [00:00<00:00, 1524.23 examples/s]

Fold 0
Fold 1
Fold 2





In [3]:
fold_results = cv_training_epochs_sets(experiment_config=experiment_config,
                            splits=splits,
                            questions_list=questions_list,
                            train_dataset=train_dataset,
                            tokenizer=tokenizer)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_fold_epoch_set.pickle', 'wb') as handle:
    pickle.dump(fold_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 14988.22 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 17079.18 examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0





LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,6.1259,5.710525,0.152,0.0238,0.1397,0.1396,9.7941,0.0,0.6543,0.7022,257,366


Fold 1


Filter: 100%|███████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 8355.02 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 15274.79 examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0





LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.8428,6.034223,0.0961,0.0066,0.0878,0.0883,9.2424,0.0,0.7252,0.7568,221,292


Fold 2


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 10966.36 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 17347.61 examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0





LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.8527,5.92272,0.1151,0.0261,0.1068,0.1068,9.0606,0.0,0.654,0.7019,219,312


In [4]:
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_fold_epoch_set.pickle', 'rb') as handle:
   fold_results = pickle.load(handle)
   
for cluster_idx in [1, 4, 3]:
    fold_results = cv_cluster_set(experiment_config=experiment_config,
                                            splits=splits,
                                            questions_list=questions_list,
                                            train_dataset=train_dataset,
                                            tokenizer=tokenizer,
                                            fold_results=fold_results,
                                            cluster_id=cluster_idx)

cv_df = results_dict_todf(fold_results)

########## SAVE THE FILE

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_step1.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 14112.26 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 15737.89 examples/s]

Cluster 1 training size (3, 14)
TRAINING CLUSTER SET 1 FOR EPOCHS 2



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,6.4064,5.713845,0.1529,0.0238,0.1406,0.141,9.7059,0.0,0.6434,0.694,254,366
2,6.1965,5.71383,0.1529,0.0238,0.1406,0.141,9.7059,0.0,0.6434,0.694,254,366


Fold 1


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 13766.26 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 16712.37 examples/s]

Cluster 1 training size (0, 14)
TRAINING CLUSTER SET 1 FOR EPOCHS 2





LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 14684.40 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 17123.10 examples/s]

Cluster 1 training size (3, 14)
TRAINING CLUSTER SET 1 FOR EPOCHS 2



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda
Fold 0


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 14904.07 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 15298.75 examples/s]

Cluster 4 training size (3, 14)
TRAINING CLUSTER SET 4 FOR EPOCHS 2





LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.5601,5.713845,0.1529,0.0238,0.1406,0.141,9.7059,0.0,0.6434,0.694,254,366
2,5.6766,5.713825,0.1529,0.0238,0.1406,0.141,9.7059,0.0,0.6434,0.694,254,366


Fold 1


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 12217.25 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 15995.97 examples/s]

Cluster 4 training size (2, 14)
TRAINING CLUSTER SET 4 FOR EPOCHS 2



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda
Fold 2


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 15289.26 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 16146.84 examples/s]

Cluster 4 training size (3, 14)
TRAINING CLUSTER SET 4 FOR EPOCHS 2



Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda
Fold 0


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 14760.88 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 17273.31 examples/s]

Cluster 3 training size (23, 14)
TRAINING CLUSTER SET 3 FOR EPOCHS 2





LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.7126,5.713527,0.1529,0.0238,0.1406,0.141,9.7059,0.0,0.6434,0.694,254,366
2,5.6714,5.712401,0.1529,0.0238,0.1406,0.141,9.7059,0.0,0.6434,0.694,254,366


Fold 1


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 14121.76 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 17553.06 examples/s]

Cluster 3 training size (26, 14)
TRAINING CLUSTER SET 3 FOR EPOCHS 2





LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2


Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 14288.21 examples/s]
Filter: 100%|██████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 16012.46 examples/s]

Cluster 3 training size (23, 14)
TRAINING CLUSTER SET 3 FOR EPOCHS 2





LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [5]:
print("Mean")
print(cv_df.groupby(["model_set"])["rouge"].mean())

print("STD")
print(cv_df.groupby("model_set")["rouge"].std())

Mean
model_set
0            0.121583
1            0.121179
cluster_1    0.121583
cluster_3    0.121583
cluster_4    0.121583
Name: rouge, dtype: float64
STD
model_set
0            0.128052
1            0.127795
cluster_1    0.128052
cluster_3    0.128052
cluster_4    0.128052
Name: rouge, dtype: float64


### Step 2. Learn performance

In [6]:
########## LOAD CV RESULTS

import pickle
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_step1.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

########## RUN STEP 2 ON CV

cv_df, model_results = cv_step_2(experiment_config=experiment_config, cv_df=cv_df)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/s2_model_results.pickle', 'wb') as handle:
    pickle.dump(model_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_results.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)


0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.238063 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 35
[LightGBM] [Info] Number of data points in the train set: 330, number of used features: 10
[LightGBM] [Info] Start training from score 0.105168
catboost
Learning rate set to 0.034364
0:	learn: 0.1147907	total: 51.4ms	remaining: 51.4s
1:	learn: 0.1139874	total: 57ms	remaining: 28.5s
2:	learn: 0.1130629	total: 57.8ms	remaining: 19.2s
3:	learn: 0.1121879	total: 59.3ms	remaining: 14.8s
4:	learn: 0.1113043	total: 60ms	remaining: 11.9s
5:	learn: 0.1105453	total: 60.6ms	remaining: 10s
6:	learn: 0.1097579	total: 61.2ms	remaining: 8.69s
7:	learn: 0.1092289	total: 61.9ms	remaining: 7.67s
8:	learn: 0.1086171	total: 64.7ms	remaining: 7.12s
9:	learn: 0.1080112	total: 66.8ms	remaining: 6.61s
10:	learn: 0.1074744	total: 67.4ms	remain

In [7]:
### TO SAVE THE VECTORIZER AND STEP 2 MODELS

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_results.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)


print("Mean")
print(cv_df.groupby(["model_set"])["rouge"].mean())

print("STD")
print(cv_df.groupby("model_set")["rouge"].std())


full_step_2(cv_df=cv_df, 
            experiment_config=experiment_config)

Mean
model_set
0            0.121583
1            0.121179
cluster_1    0.121583
cluster_3    0.121583
cluster_4    0.121583
ensemble     0.121179
Name: rouge, dtype: float64
STD
model_set
0            0.128052
1            0.127795
cluster_1    0.128052
cluster_3    0.128052
cluster_4    0.128052
ensemble     0.127795
Name: rouge, dtype: float64
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.157759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 500, number of used features: 15
[LightGBM] [Info] Start training from score 0.121502
catboost
Learning rate set to 0.036696
0:	learn: 0.1264152	total: 1.15ms	remaining: 1.15s
1:	learn: 0.1255963	total: 2.17ms	remaining: 1.08s
2:	learn: 0.1248150	total: 2.76ms	remaining: 917ms
3:	learn: 0.1241115	total: 3.34ms	remaining: 

# TEST

In [8]:
# sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, test=True, train_size=100, test_size=25, cluster_id=4)
# train_dataset, test_data, test_df, train_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"], sampling_dict["train_df"]

# splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
#                             experiment_config=experiment_config)

In [9]:
with open(f"reports/results/{experiment_config['ANALYSIS_POSTFIX']}/cv_results.pickle", "rb") as handle:
    cv_resutls = pickle.load(handle)

base_models_list = list(cv_resutls.model_set.unique())
base_models_list.pop(-1)

'ensemble'

In [10]:
meta_preds_df = meta_predict(experiment_config=experiment_config, 
                    test_df=test_df,
                    base_models_names=base_models_list,
                    t_models=["lr", "svm", "lgbm", "catboost"])

########## SAVE THE FILE

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_step2.pickle', 'wb') as handle:
    pickle.dump(meta_preds_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

lr
svm
lgbm
catboost


In [5]:
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_step2.pickle', 'rb') as handle:
    meta_preds_df = pickle.load(handle)
    
meta_preds_df.groupby("model_set").svm_preds.mean()

model_set
0            0.117956
1            0.118346
cluster_1    0.118365
cluster_3    0.117898
cluster_4    0.118357
Name: svm_preds, dtype: float64

In [6]:
optimal_ensemble_map, ensemble_val_estim = create_ensemble_map(meta_preds_df=meta_preds_df, 
                                                                t_model_name="svm")

In [13]:
test_result_df = test_training_epochs_sets(experiment_config=experiment_config,
                            test_df=test_df,
                            test_data=test_data,
                            train_data=train_dataset,
                            tokenizer=tokenizer)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_epoch_set.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.9593,5.858005,0.0956,0.0206,0.0819,0.0815,11.4,0.0,0.8209,0.8352,228,273


In [17]:
train_df.cluster.value_counts()

cluster
2    57
3    36
4     4
1     3
Name: count, dtype: int64

In [3]:
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_epoch_set.pickle', 'rb') as handle:
   test_result_df = pickle.load(handle)

test_result_df = test_result_df.rename(columns={"epoch_set": "model_set"})

for cluster_idx in [1, 4, 3]:
    test_result_dict = test_cluster_set(experiment_config=experiment_config,
                                    test_df=test_df,
                                    test_data=test_data,
                                    tokenizer=tokenizer,
                                    results_df=test_result_df,
                                    cluster_id=cluster_idx,
                                    train_df=train_df)

########## SAVE THE FILE

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_results.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Cluster 1 training size (3, 14)
TRAINING CLUSTER SET 1 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,6.4064,5.868046,0.0956,0.0206,0.0819,0.0815,11.4,0.0,0.8209,0.8352,228,273
2,6.1965,5.868027,0.0956,0.0206,0.0819,0.0815,11.4,0.0,0.8209,0.8352,228,273


Cluster 4 training size (4, 14)
TRAINING CLUSTER SET 4 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,6.0211,5.868046,0.0956,0.0206,0.0819,0.0815,11.4,0.0,0.8209,0.8352,228,273
2,5.9651,5.867997,0.0956,0.0206,0.0819,0.0815,11.4,0.0,0.8209,0.8352,228,273


Cluster 3 training size (36, 14)
TRAINING CLUSTER SET 3 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.6971,5.866805,0.0956,0.0206,0.0819,0.0815,11.4,0.0,0.8209,0.8352,228,273
2,5.6811,5.863044,0.0956,0.0206,0.0819,0.0815,11.4,0.0,0.8209,0.8352,228,273


In [7]:
### ENSEMBLE COMPUTE
test_result_df = ensemble_compute(test_result_df=test_result_df,
                                  optimal_ensemble_map=optimal_ensemble_map)

########## ROUGE PER SETTING

print("Mean")
print(test_result_df.groupby("model_set")["rouge"].mean())

print("STD")
print(test_result_df.groupby("model_set")["rouge"].std())

test_result_df.opt_es_id.value_counts()

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_results_full.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Mean
model_set
0           0.097686
1           0.097686
ensemble    0.000000
Name: rouge, dtype: float64
STD
model_set
0           0.148129
1           0.148129
ensemble         NaN
Name: rouge, dtype: float64
