### 1. Settings

In [1]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
import sys
sys.path.append("../")

from tqdm import tqdm # type: ignore
from datetime import date

import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer
from utils.sampling import create_splits, prep_cv_validation
from utils.training import cv_cluster_set, cv_training_epochs_sets, test_cluster_set
from utils.training import results_dict_todf, cv_step_2, full_step_2, test_training_epochs_sets
from utils.inference import meta_predict, create_ensemble_map, ensemble_compute

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch

#####################################
############  CONSTANTS #############
#####################################

RS = 42
BATCH_SIZE = 16
DECODER_LENGTH = 25
ENCODER_LENGTH = 25
MODEL_NAME = "Salesforce/codet5-base-multi-sum"

FULL_TRAIN_ARGS = {
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [0, 1, 2, 5, 10],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 1e-5,
        "warmup_steps": 500,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
        "output_dir" : 'reports/results',
        "logging_dir" : "reports/logs",
    },
}

experiment_config = {
    "DATA_STR" : "20240721",
    "RS" : RS,
    "DRIFT_TYPE" : "drift",
    "NFOLD" : 3,
    "FULL_TRAIN_ARGS" : FULL_TRAIN_ARGS,
    "MODEL_NAME" : MODEL_NAME,
    "CLUSTER_EPOCHS" : 5,
    "CLUSTER_SET_ID" : [1, 4, 3, [0, 1, 4,]],
    "TRAIN_SIZE" : 7000,
    "TEST_SIZE" : 2500,
}
experiment_config["ANALYSIS_POSTFIX"] = f"mined_{experiment_config['DRIFT_TYPE']}_{str(date.today())}"
experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["output_dir"] += "/" + experiment_config["ANALYSIS_POSTFIX"] 
experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["logging_dir"] += "/" + experiment_config["ANALYSIS_POSTFIX"] 

if not os.path.exists(experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["logging_dir"]):
    os.mkdir(experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["logging_dir"])

if not os.path.exists(experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["output_dir"]):
    os.mkdir(experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]["output_dir"])

tokenizer = AutoTokenizer.from_pretrained(experiment_config["MODEL_NAME"], skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(experiment_config["MODEL_NAME"])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. 

In [2]:
sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, train_size=experiment_config["TRAIN_SIZE"], test_size=experiment_config["TEST_SIZE"], cluster_id=4)
train_dataset, test_data, test_df, train_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"], sampling_dict["train_df"]

splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
                            experiment_config=experiment_config)

Train Data:  (7000, 11)
Test Data:  (2500, 11)
Train Data: Cluster cluster
2    3888
3    2114
4     596
1     234
0     168
Name: count, dtype: int64
Test Data: Cluster cluster
4    926
2    605
3    350
1    344
0    275
Name: count, dtype: int64


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Fold 0
Fold 1
Fold 2


In [3]:
fold_results = cv_training_epochs_sets(experiment_config=experiment_config,
                            splits=splits,
                            questions_list=questions_list,
                            train_dataset=train_dataset,
                            tokenizer=tokenizer)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_fold_epoch_set.pickle', 'wb') as handle:
    pickle.dump(fold_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0


In [4]:
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_fold_epoch_set.pickle', 'rb') as handle:
   fold_results = pickle.load(handle)
   
for cluster_idx in experiment_config["CLUSTER_SET_ID"]:
    fold_results = cv_cluster_set(experiment_config=experiment_config,
                                            splits=splits,
                                            questions_list=questions_list,
                                            train_dataset=train_dataset,
                                            tokenizer=tokenizer,
                                            fold_results=fold_results,
                                            cluster_id=cluster_idx)

cv_df = results_dict_todf(fold_results)

########## SAVE THE FILE

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_step1.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [1] training size (5, 14)
TRAINING CLUSTER SET [1] FOR EPOCHS 5
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,6.2142,5.56111,0.1528,0.0238,0.1362,0.1368,9.6418,0.0092,0.7073,0.7427,17040,22942
2,6.0961,5.560882,0.1528,0.0238,0.1362,0.1368,9.6418,0.0092,0.7073,0.7427,17040,22942
3,5.5067,5.560461,0.1528,0.0238,0.1362,0.1368,9.6418,0.0092,0.7073,0.7427,17040,22942
4,5.1211,5.559864,0.1529,0.0238,0.1363,0.1368,9.641,0.0092,0.7071,0.7427,17038,22942
5,5.6897,5.559074,0.153,0.0238,0.1363,0.1369,9.6465,0.0092,0.708,0.7433,17053,22942


Fold 1


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [1] training size (7, 14)
TRAINING CLUSTER SET [1] FOR EPOCHS 5
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [1] training size (8, 14)
TRAINING CLUSTER SET [1] FOR EPOCHS 5
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 0


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [4] training size (578, 14)
TRAINING CLUSTER SET [4] FOR EPOCHS 5
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.6261,5.169208,0.1719,0.0302,0.1536,0.1537,9.7601,0.0123,0.7259,0.7574,17376,22942
2,5.0589,4.690166,0.2113,0.0495,0.192,0.1921,10.0591,0.0189,0.766,0.7896,18114,22942
3,4.6387,4.448548,0.2613,0.0721,0.2367,0.2368,11.3599,0.0285,0.8834,0.8897,20412,22942
4,4.3574,4.338812,0.2891,0.0866,0.2594,0.2595,11.7699,0.0341,0.9148,0.9182,21066,22942
5,4.1948,4.320555,0.2889,0.0845,0.2583,0.2584,11.6662,0.0323,0.906,0.9101,20880,22942


Fold 1


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [4] training size (538, 14)
TRAINING CLUSTER SET [4] FOR EPOCHS 5
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [4] training size (548, 14)
TRAINING CLUSTER SET [4] FOR EPOCHS 5
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 0


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [3] training size (1980, 14)
TRAINING CLUSTER SET [3] FOR EPOCHS 5
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.9444,4.333526,0.2757,0.0816,0.2477,0.2479,12.7009,0.0331,1.0,1.0167,23326,22942
2,4.227,4.138303,0.3263,0.1059,0.291,0.2911,13.347,0.0454,1.0,1.0645,24421,22942
3,4.0036,4.093394,0.3277,0.1074,0.2925,0.2926,13.1542,0.0473,1.0,1.0463,24005,22942
4,3.8973,4.07493,0.3305,0.1088,0.2967,0.2969,13.2768,0.0481,1.0,1.0474,24030,22942
5,3.8415,4.07372,0.3315,0.1099,0.2968,0.2971,13.2408,0.049,1.0,1.0464,24006,22942


Fold 1


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [3] training size (2040, 14)
TRAINING CLUSTER SET [3] FOR EPOCHS 5
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [3] training size (2004, 14)
TRAINING CLUSTER SET [3] FOR EPOCHS 5
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 0


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [0, 1, 4] training size (598, 14)
TRAINING CLUSTER SET [0, 1, 4] FOR EPOCHS 5
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.6143,5.152984,0.1753,0.0316,0.1564,0.1567,9.7935,0.0126,0.7324,0.7625,17493,22942
2,5.0247,4.669456,0.2167,0.0521,0.1971,0.1975,10.2635,0.0203,0.7892,0.8086,18550,22942
3,4.6209,4.43492,0.2676,0.0745,0.2421,0.2422,11.7134,0.0292,0.919,0.9221,21155,22942
4,4.3445,4.329432,0.2879,0.0842,0.2573,0.2573,11.644,0.0317,0.9054,0.9096,20869,22942
5,4.197,4.311857,0.2918,0.0859,0.2603,0.2604,11.8672,0.0322,0.9264,0.929,21312,22942


Fold 1


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [0, 1, 4] training size (558, 14)
TRAINING CLUSTER SET [0, 1, 4] FOR EPOCHS 5
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Cluster [0, 1, 4] training size (570, 14)
TRAINING CLUSTER SET [0, 1, 4] FOR EPOCHS 5
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [5]:
print("Mean")
print(cv_df.groupby(["model_set"])["rouge"].mean())

print("STD")
print(cv_df.groupby("model_set")["rouge"].std())

Mean
model_set
0                    0.161484
1                    0.342008
2                    0.348875
5                    0.357782
10                   0.356509
cluster_[0, 1, 4]    0.297413
cluster_[1]          0.161520
cluster_[3]          0.340375
cluster_[4]          0.294668
Name: rouge, dtype: float64
STD
model_set
0                    0.148428
1                    0.167981
2                    0.168859
5                    0.169450
10                   0.168416
cluster_[0, 1, 4]    0.174008
cluster_[1]          0.148322
cluster_[3]          0.174320
cluster_[4]          0.174021
Name: rouge, dtype: float64


### Step 2. Learn performance

In [6]:
########## LOAD CV RESULTS

import pickle
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_step1.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

########## RUN STEP 2 ON CV

cv_df, model_results = cv_step_2(experiment_config=experiment_config, cv_df=cv_df)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/s2_model_results.pickle', 'wb') as handle:
    pickle.dump(model_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_results.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)


0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.021803 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13711
[LightGBM] [Info] Number of data points in the train set: 41994, number of used features: 987
[LightGBM] [Info] Start training from score 0.297249
catboost
Learning rate set to 0.0739
0:	learn: 0.1806756	total: 56.8ms	remaining: 56.8s
1:	learn: 0.1787251	total: 64ms	remaining: 31.9s
2:	learn: 0.1770640	total: 71.9ms	remaining: 23.9s
3:	learn: 0.1756701	total: 78.9ms	remaining: 19.6s
4:	learn: 0.1743623	total: 85.7ms	remaining: 17.1s
5:	learn: 0.1732295	total: 92ms	remaining: 15.2s
6:	learn: 0.1721948	total: 98.8ms	remaining: 14s
7:	learn: 0.1712479	total: 106ms	remaining: 13.1s
8:	learn: 0.1704864	total: 113ms	remaining: 12.4s
9:	learn: 0.1698144	total: 119ms	remaining: 11.8s
10:	learn: 0.1692118	total: 126ms	remain

In [7]:
### TO SAVE THE VECTORIZER AND STEP 2 MODELS

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_results.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)


print("Mean")
print(cv_df.groupby(["model_set"])["rouge"].mean())

print("STD")
print(cv_df.groupby("model_set")["rouge"].std())


full_step_2(cv_df=cv_df, 
            experiment_config=experiment_config)

Mean
model_set
0                    0.161484
1                    0.342008
2                    0.348875
5                    0.357782
10                   0.356509
cluster_[0, 1, 4]    0.297413
cluster_[1]          0.161520
cluster_[3]          0.340375
cluster_[4]          0.294668
ensemble             0.357430
Name: rouge, dtype: float64
STD
model_set
0                    0.148428
1                    0.167981
2                    0.168859
5                    0.169450
10                   0.168416
cluster_[0, 1, 4]    0.174008
cluster_[1]          0.148322
cluster_[3]          0.174320
cluster_[4]          0.174021
ensemble             0.168296
Name: rouge, dtype: float64
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.035068 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20496
[LightGBM] [Info] Number of data points in

# TEST

In [8]:
# sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, test=True, train_size=100, test_size=25, cluster_id=4)
# train_dataset, test_data, test_df, train_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"], sampling_dict["train_df"]

# splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
#                             experiment_config=experiment_config)

In [9]:
with open(f"reports/results/{experiment_config['ANALYSIS_POSTFIX']}/cv_results.pickle", "rb") as handle:
    cv_resutls = pickle.load(handle)

base_models_list = list(cv_resutls.model_set.unique())
base_models_list.pop(-1)

'ensemble'

In [10]:
meta_preds_df = meta_predict(experiment_config=experiment_config, 
                    test_df=test_df,
                    base_models_names=base_models_list,
                    t_models=["lr", "svm", "lgbm", "catboost"])

########## SAVE THE FILE

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_step2.pickle', 'wb') as handle:
    pickle.dump(meta_preds_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

lr
svm
lgbm
catboost


In [11]:
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_step2.pickle', 'rb') as handle:
    meta_preds_df = pickle.load(handle)
    
meta_preds_df.groupby("model_set").catboost_preds.mean()

model_set
0                    0.164312
1                    0.338320
2                    0.342925
5                    0.352416
10                   0.352421
cluster_[0, 1, 4]    0.300745
cluster_[1]          0.164515
cluster_[3]          0.338320
cluster_[4]          0.297956
Name: catboost_preds, dtype: float64

In [12]:
optimal_ensemble_map, ensemble_val_estim = create_ensemble_map(meta_preds_df=meta_preds_df, 
                                                                t_model_name="svm")

In [13]:
test_result_df = test_training_epochs_sets(experiment_config=experiment_config,
                            test_df=test_df,
                            test_data=test_data,
                            train_data=train_dataset,
                            tokenizer=tokenizer)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_epoch_set.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.5076,4.093495,0.3478,0.1157,0.3085,0.3084,13.6764,0.0487,1.0,1.0945,26645,24344


TRAINING EPOCH SET 2
TRAINING EPOCHS 1
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.0222,4.008162,0.3511,0.1165,0.3101,0.31,13.5932,0.0499,1.0,1.0809,26314,24344


TRAINING EPOCH SET 5
TRAINING EPOCHS 3
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7707,3.951005,0.352,0.114,0.3086,0.3086,13.6192,0.0494,1.0,1.0824,26349,24344
2,3.8976,3.917106,0.3505,0.1128,0.3083,0.3083,13.4304,0.0484,1.0,1.0635,25890,24344
3,3.8024,3.910935,0.3528,0.1156,0.3108,0.3107,13.5816,0.05,1.0,1.0741,26147,24344


TRAINING EPOCH SET 10
TRAINING EPOCHS 5
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.3412,3.944862,0.3501,0.1128,0.3059,0.3057,13.4224,0.0488,1.0,1.0614,25838,24344
2,3.5549,3.928447,0.3465,0.1116,0.3048,0.3048,13.228,0.0497,1.0,1.043,25392,24344
3,3.5751,3.902961,0.3528,0.1162,0.3101,0.3102,13.4892,0.0513,1.0,1.065,25927,24344
4,3.5677,3.901816,0.3547,0.116,0.3104,0.3105,13.4956,0.0504,1.0,1.065,25926,24344
5,3.5264,3.905879,0.3525,0.1143,0.3091,0.3094,13.4844,0.0513,1.0,1.0631,25879,24344


In [14]:
train_df.cluster.value_counts()

cluster
2    3125
3    3012
4     832
0      21
1      10
Name: count, dtype: int64

In [15]:
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_epoch_set.pickle', 'rb') as handle:
   test_result_df = pickle.load(handle)

test_result_df = test_result_df.rename(columns={"epoch_set": "model_set"})

for cluster_idx in experiment_config["CLUSTER_SET_ID"]:
    test_result_df = test_cluster_set(experiment_config=experiment_config,
                                    test_df=test_df,
                                    test_data=test_data,
                                    tokenizer=tokenizer,
                                    results_df=test_result_df,
                                    cluster_id=cluster_idx,
                                    train_df=train_df)

########## SAVE THE FILE

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_results.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Cluster [1] training size (10, 14)
TRAINING CLUSTER SET [1] FOR EPOCHS5
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.6386,5.602902,0.1622,0.0262,0.1452,0.1457,9.68,0.008,0.7239,0.7558,18399,24344
2,5.3237,5.602283,0.1621,0.0262,0.1451,0.1456,9.6788,0.008,0.7237,0.7556,18395,24344
3,5.5115,5.601264,0.1622,0.0263,0.1452,0.1458,9.676,0.008,0.7237,0.7556,18395,24344
4,5.4839,5.59981,0.1623,0.0263,0.1453,0.1458,9.6788,0.008,0.7242,0.756,18404,24344
5,5.5162,5.597931,0.1623,0.0263,0.1453,0.1459,9.6856,0.008,0.7255,0.757,18429,24344


Cluster [4] training size (832, 14)
TRAINING CLUSTER SET [4] FOR EPOCHS5
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.4806,4.978137,0.1959,0.0374,0.1733,0.1736,9.854,0.0147,0.7496,0.7762,18897,24344
2,4.7623,4.510124,0.2738,0.0749,0.2463,0.2464,11.8404,0.0303,0.9403,0.942,22933,24344
3,4.3908,4.347179,0.2903,0.0839,0.2584,0.2586,11.886,0.034,0.9388,0.9406,22897,24344
4,4.1843,4.298379,0.2945,0.0853,0.2603,0.2604,11.9388,0.0349,0.9465,0.9479,23076,24344
5,4.0841,4.291209,0.3026,0.0902,0.2679,0.2681,12.1372,0.0379,0.9574,0.9583,23329,24344


Cluster [3] training size (3012, 14)
TRAINING CLUSTER SET [3] FOR EPOCHS5
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.7175,4.240239,0.3196,0.1002,0.2832,0.2834,13.0004,0.0406,1.0,1.0506,25576,24344
2,4.1021,4.11285,0.3307,0.1047,0.2924,0.2926,12.8236,0.0452,1.0,1.0306,25090,24344
3,3.9253,4.071818,0.3338,0.109,0.298,0.2981,13.2476,0.0476,1.0,1.054,25659,24344
4,3.835,4.05892,0.3382,0.1093,0.2997,0.2998,13.1584,0.0457,1.0,1.0514,25595,24344
5,3.7812,4.058057,0.3397,0.1111,0.3015,0.3016,13.2552,0.0476,1.0,1.0541,25661,24344


Cluster [0, 1, 4] training size (863, 14)
TRAINING CLUSTER SET [0, 1, 4] FOR EPOCHS5
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.4707,4.955633,0.1991,0.0386,0.1761,0.1762,9.868,0.0132,0.7506,0.7771,18918,24344
2,4.7463,4.485016,0.2756,0.0743,0.2465,0.2465,11.9516,0.0305,0.9477,0.9491,23104,24344
3,4.374,4.327327,0.2978,0.0877,0.265,0.2655,12.0712,0.0363,0.9508,0.952,23175,24344
4,4.1784,4.28933,0.3011,0.0896,0.2667,0.2668,12.3184,0.0385,0.9776,0.9778,23804,24344
5,4.0854,4.283941,0.3056,0.093,0.2707,0.2709,12.4308,0.0398,0.9847,0.9848,23975,24344


In [16]:
### ENSEMBLE COMPUTE
test_result_df = ensemble_compute(test_result_df=test_result_df,
                                  optimal_ensemble_map=optimal_ensemble_map)

########## ROUGE PER SETTING

print("Mean")
print(test_result_df.groupby("model_set")["rouge"].mean())

print("STD")
print(test_result_df.groupby("model_set")["rouge"].std())

print(test_result_df.opt_es_id.value_counts())

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_results_full.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Mean
model_set
0                    0.163454
1                    0.347968
2                    0.353897
5                    0.355079
10                   0.354205
cluster_[0, 1, 4]    0.305873
cluster_[1]          0.163528
cluster_[3]          0.342918
cluster_[4]          0.303905
ensemble             0.352185
Name: rouge, dtype: float64
STD
model_set
0                    0.149661
1                    0.170741
2                    0.171784
5                    0.170474
10                   0.173010
cluster_[0, 1, 4]    0.177706
cluster_[1]          0.149618
cluster_[3]          0.174285
cluster_[4]          0.176532
ensemble             0.171800
Name: rouge, dtype: float64
opt_es_id
5                    6520
10                   6210
cluster_[3]          4340
2                    3410
1                    2260
cluster_[0, 1, 4]    1130
cluster_[4]          1070
0                      40
cluster_[1]            20
Name: count, dtype: int64


In [18]:
test_result_df.opt_es_id.value_counts()

opt_es_id
5                    6520
10                   6210
cluster_[3]          4340
2                    3410
1                    2260
cluster_[0, 1, 4]    1130
cluster_[4]          1070
0                      40
cluster_[1]            20
Name: count, dtype: int64