### 1. Settings

In [23]:
#####################################
##########  DEPENDENCIES ###########
#####################################

import os
import pickle
import sys
# Put the parent directory on the import path; the project-local `utils`
# package imported below is presumably located there -- TODO confirm.
sys.path.append("../")

from tqdm import tqdm # type: ignore
from datetime import date

import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from utils.sampling import create_splits, prep_cv_validation
from utils.training import cv_cluster_set, cv_training_epochs_sets, test_cluster_set
from utils.training import results_dict_todf, cv_step_2, full_step_2, test_training_epochs_sets
from utils.inference import meta_predict, create_ensemble_map, ensemble_compute

# Register tqdm's pandas integration.
tqdm.pandas()
import warnings
# Suppresses every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Environment switches are set before `import torch` below so that they are
# already in place when the deep-learning stack initialises.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE(review): the training logs in this notebook report that WANDB_DISABLED
# is deprecated and recommend `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"
# Expose only GPU 0 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch

#####################################
############  CONSTANTS #############
#####################################

RS = 42
BATCH_SIZE = 16
DECODER_LENGTH = 30
ENCODER_LENGTH = 30
MODEL_NAME = "Salesforce/codet5-base-multi-sum"

# Arguments forwarded to the project training helpers.
# NOTE(review): BATCH_SIZE (16) differs from per_device_*_batch_size (4);
# confirm which of the two the helpers actually use.
FULL_TRAIN_ARGS = {
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        # Epoch budgets of the base-model sets. Per the training logs further
        # down, each set resumes from the previous one's checkpoint
        # (e.g. set 5 = ./models/2_epoch_set + 3 more epochs).
        "num_train_epochs": [0, 1, 2, 5, 10],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 1e-5,
        "warmup_steps": 500,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
        # Base directories; the run-specific postfix is appended below.
        "output_dir" : 'reports/results',
        "logging_dir" : "reports/logs",
    },
}

experiment_config = {
    "DATA_STR" : "20240908",
    "RS" : RS,
    "DRIFT_TYPE" : "drift",
    "NFOLD" : 3,
    "FULL_TRAIN_ARGS" : FULL_TRAIN_ARGS,
    "MODEL_NAME" : MODEL_NAME,
    "CLUSTER_EPOCHS" : 3,
    # Cluster subsets to train on: cluster 0, cluster 3, and both together.
    "CLUSTER_SET_ID" : [0, 3, [0, 3,]],
    "TRAIN_SIZE" : 7000,
    "TEST_SIZE" : 2500,
}

# The run date is pinned so that re-executing the notebook keeps reading and
# writing the artifacts of the 2024-09-09 run. Set it to `str(date.today())`
# to start a fresh run in a new directory.
ANALYSIS_DATE = "2024-09-09"
experiment_config["ANALYSIS_POSTFIX"] = f"mined_{experiment_config['DRIFT_TYPE']}_{ANALYSIS_DATE}"

# Point the trainer at run-specific sub-directories and make sure they exist.
# `os.makedirs(..., exist_ok=True)` also creates the missing parents
# (`reports/`, `reports/results`, `reports/logs`), which plain `os.mkdir`
# cannot do on a fresh checkout.
seq_trainer_args = experiment_config["FULL_TRAIN_ARGS"]["SEQ_TRAINER_ARGS"]
for dir_key in ("logging_dir", "output_dir"):
    seq_trainer_args[dir_key] += "/" + experiment_config["ANALYSIS_POSTFIX"]
    os.makedirs(seq_trainer_args[dir_key], exist_ok=True)

# Tokenizer and seq2seq model are both loaded from the configured checkpoint.
# NOTE(review): `skip_special_tokens` is usually a decode-time argument;
# confirm it has any effect when passed to `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(
    experiment_config["MODEL_NAME"],
    skip_special_tokens=False,
)
model = AutoModelForSeq2SeqLM.from_pretrained(experiment_config["MODEL_NAME"])

# Fall back to the CPU only when no CUDA device is available.
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda")
rouge = evaluate.load("rouge")

# Names of the step-2 (meta) learners passed to the step-2 / inference helpers.
t_models = ["catboost"]
# Cells guarded by this flag (base-model training) are skipped while it is False.
RUN_BASE_TRAINING = False

### 2. CoNaLa data. Preprocessing.

In [24]:
# Sample the train/test sets with the sizes configured in `experiment_config`.
sampling_dict = create_splits(
    experiment_config=experiment_config,
    tokenizer=tokenizer,
    train_size=experiment_config["TRAIN_SIZE"],
    test_size=experiment_config["TEST_SIZE"],
    cluster_id=4,
)
train_dataset, test_data, test_df, train_df = (
    sampling_dict[key] for key in ("train_data", "test_data", "test_df", "train_df")
)

# Prepare the cross-validation folds over the training set.
splits, questions_list = prep_cv_validation(
    experiment_config=experiment_config,
    train_dataset=train_dataset,
)

Train Data:  (7000, 11)
Test Data:  (2500, 11)
Train Data: Cluster cluster
1    3280
0    1352
2    1325
3    1043
Name: count, dtype: int64
Test Data: Cluster cluster
0    2500
Name: count, dtype: int64


Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

Fold 0
Fold 1
Fold 2


In [25]:
if RUN_BASE_TRAINING:
    # Train the epoch-set base models on every CV fold and cache the result.
    fold_results = cv_training_epochs_sets(
        experiment_config=experiment_config,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        splits=splits,
        questions_list=questions_list,
    )

    with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_fold_epoch_set.pickle', 'wb') as fout:
        pickle.dump(fold_results, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
if RUN_BASE_TRAINING:
    # Start from the cached epoch-set results of the previous cell.
    with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_fold_epoch_set.pickle', 'rb') as fin:
        fold_results = pickle.load(fin)

    # Add the cluster-specific models one cluster set at a time; each call
    # receives the running results and returns the updated ones.
    for cluster_set in experiment_config["CLUSTER_SET_ID"]:
        fold_results = cv_cluster_set(
            experiment_config=experiment_config,
            train_dataset=train_dataset,
            tokenizer=tokenizer,
            splits=splits,
            questions_list=questions_list,
            fold_results=fold_results,
            cluster_id=cluster_set,
        )

    cv_df = results_dict_todf(fold_results)

    ########## SAVE THE FILE
    with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_step1.pickle', 'wb') as fout:
        pickle.dump(cv_df, fout, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
if RUN_BASE_TRAINING:
    # Per-model mean and spread of ROUGE over the CV results.
    rouge_mean = cv_df.groupby(["model_set"])["rouge"].mean()
    print("Mean")
    print(rouge_mean)

    rouge_std = cv_df.groupby("model_set")["rouge"].std()
    print("STD")
    print(rouge_std)

### Step 2. Learn performance

In [3]:
########## LOAD CV RESULTS

# Redundant with the import in the settings cell; kept so this cell also runs
# when `pickle` has not been imported yet.
import pickle
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_step1.pickle', 'rb') as fin:
    cv_df = pickle.load(fin)

########## RUN STEP 2 ON CV

cv_df, model_results = cv_step_2(
    cv_df=cv_df,
    experiment_config=experiment_config,
    t_models=t_models,
    add_cluster=True,
)

# Persist the fitted step-2 models first, then the augmented CV frame.
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/s2_model_results.pickle', 'wb') as fout:
    pickle.dump(model_results, fout, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_results.pickle', 'wb') as fout:
    pickle.dump(cv_df, fout, protocol=pickle.HIGHEST_PROTOCOL)


0
catboost
Learning rate set to 0.072537
0:	learn: 0.1636484	total: 62.8ms	remaining: 1m 2s
1:	learn: 0.1623045	total: 70.1ms	remaining: 35s
2:	learn: 0.1612065	total: 77.6ms	remaining: 25.8s
3:	learn: 0.1601988	total: 85ms	remaining: 21.2s
4:	learn: 0.1593870	total: 92.1ms	remaining: 18.3s
5:	learn: 0.1585987	total: 98.9ms	remaining: 16.4s
6:	learn: 0.1579467	total: 106ms	remaining: 15s
7:	learn: 0.1573682	total: 112ms	remaining: 13.9s
8:	learn: 0.1568890	total: 119ms	remaining: 13.1s
9:	learn: 0.1564456	total: 125ms	remaining: 12.4s
10:	learn: 0.1560240	total: 132ms	remaining: 11.9s
11:	learn: 0.1556719	total: 140ms	remaining: 11.5s
12:	learn: 0.1553753	total: 147ms	remaining: 11.2s
13:	learn: 0.1550540	total: 154ms	remaining: 10.9s
14:	learn: 0.1548516	total: 161ms	remaining: 10.6s
15:	learn: 0.1545973	total: 169ms	remaining: 10.4s
16:	learn: 0.1543544	total: 176ms	remaining: 10.2s
17:	learn: 0.1541722	total: 183ms	remaining: 9.97s
18:	learn: 0.1540000	total: 190ms	remaining: 9.79s


In [28]:
### TO SAVE THE VECTORIZER AND STEP 2 MODELS

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/cv_results.pickle', 'rb') as fin:
    cv_df = pickle.load(fin)

# Column holding the first meta-learner's predicted performance.
perf_hat_col = f"{t_models[0]}_perf_hat"

print("Mean")
print(cv_df.groupby(["model_set"])[perf_hat_col].mean())

print("STD")
print(cv_df.groupby("model_set")[perf_hat_col].std())

# Run step 2 on the full CV frame.
full_step_2(
    experiment_config=experiment_config,
    cv_df=cv_df,
    t_models=t_models,
    add_cluster=True,
)

Mean
model_set
0                 0.132312
1                 0.273133
2                 0.274084
5                 0.279276
10                0.279877
cluster_[0, 3]    0.273351
cluster_[0]       0.200182
cluster_[3]       0.253990
ensemble          0.280119
Name: catboost_perf_hat, dtype: float64
STD
model_set
0                 0.046651
1                 0.047404
2                 0.047423
5                 0.047317
10                0.047236
cluster_[0, 3]    0.047359
cluster_[0]       0.046675
cluster_[3]       0.046849
ensemble          0.047344
Name: catboost_perf_hat, dtype: float64
catboost
Learning rate set to 0.077338
0:	learn: 0.1644648	total: 10.2ms	remaining: 10.1s
1:	learn: 0.1630638	total: 18.7ms	remaining: 9.31s
2:	learn: 0.1618527	total: 27.3ms	remaining: 9.06s
3:	learn: 0.1608406	total: 35.4ms	remaining: 8.82s
4:	learn: 0.1599305	total: 43.7ms	remaining: 8.7s
5:	learn: 0.1591494	total: 52ms	remaining: 8.62s
6:	learn: 0.1584581	total: 60.5ms	remaining: 8.58s
7:	learn: 0.

# TEST

In [5]:
# NOTE(review): disabled cell -- a reduced-size variant of the split / CV
# preparation (test=True, train_size=100, test_size=25). Nothing here executes.
# sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, test=True, train_size=100, test_size=25, cluster_id=4)
# train_dataset, test_data, test_df, train_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"], sampling_dict["train_df"]

# splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
#                             experiment_config=experiment_config)

In [6]:
# Load the CV frame written by the step-2 cell.
with open(f"reports/results/{experiment_config['ANALYSIS_POSTFIX']}/cv_results.pickle", "rb") as handle:
    cv_resutls = pickle.load(handle)

# Base models = every `model_set` seen in CV except the "ensemble" entry, so
# that only individual models are handed to `meta_predict`.
# `unique()` returns values in order of first appearance, so "ensemble" is
# removed by value rather than by assuming it is the last element. The pop
# still echoes 'ensemble' as the cell output and raises ValueError if the
# entry is missing instead of silently dropping a real model.
base_models_list = list(cv_resutls.model_set.unique())
base_models_list.pop(base_models_list.index("ensemble"))

'ensemble'

In [7]:
# Predict, for the test frame, the performance of each base model with the
# step-2 learners.
meta_preds_df = meta_predict(
    experiment_config=experiment_config,
    test_df=test_df,
    t_models=t_models,
    base_models_names=base_models_list,
    add_cluster=True,
)

########## SAVE THE FILE

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_step2.pickle', 'wb') as fout:
    pickle.dump(meta_preds_df, fout, protocol=pickle.HIGHEST_PROTOCOL)

catboost


In [29]:
# Reload the cached meta predictions and show their mean per base model.
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_step2.pickle', 'rb') as fin:
    meta_preds_df = pickle.load(fin)

meta_preds_df.groupby("model_set")["catboost_preds"].mean()

model_set
0                 0.132555
1                 0.272427
2                 0.274570
5                 0.279419
10                0.279291
cluster_[0, 3]    0.272427
cluster_[0]       0.200776
cluster_[3]       0.253656
Name: catboost_preds, dtype: float64

In [30]:
# Build the ensemble map and its accompanying estimate frame from the
# catboost meta predictions.
optimal_ensemble_map, ensemble_val_estim = create_ensemble_map(
    t_model_name="catboost",
    meta_preds_df=meta_preds_df,
)

In [31]:
# Mean catboost prediction over the ensemble estimate frame.
ensemble_val_estim["catboost_preds"].mean()

0.27955527465272334

In [13]:
if RUN_BASE_TRAINING:
    # Train the epoch-set base models and evaluate them on the test split.
    test_result_df = test_training_epochs_sets(
        experiment_config=experiment_config,
        train_data=train_dataset,
        test_data=test_data,
        test_df=test_df,
        tokenizer=tokenizer,
    )

    with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_epoch_set.pickle', 'wb') as fout:
        pickle.dump(test_result_df, fout, protocol=pickle.HIGHEST_PROTOCOL)

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
Salesforce/codet5-base-multi-sum


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


cuda
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
Salesforce/codet5-base-multi-sum


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.9381,4.671429,0.217,0.0461,0.1939,0.1938,13.7004,0.0105,1.0,1.9302,26833,13902


TRAINING EPOCH SET 2
TRAINING EPOCHS 1
./models/1_epoch_set


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.4666,4.584875,0.2253,0.0503,0.1989,0.1989,13.5868,0.0102,1.0,1.9017,26438,13902


TRAINING EPOCH SET 5
TRAINING EPOCHS 3
./models/2_epoch_set


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2079,4.519742,0.2306,0.0522,0.2025,0.2026,13.5692,0.0102,1.0,1.9071,26513,13902
2,4.3393,4.469268,0.23,0.0536,0.2027,0.2026,13.5948,0.0114,1.0,1.8884,26253,13902
3,4.2454,4.451576,0.2325,0.0541,0.2048,0.2047,13.526,0.0117,1.0,1.88,26136,13902


TRAINING EPOCH SET 10
TRAINING EPOCHS 5
./models/5_epoch_set
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7465,4.47159,0.2365,0.0527,0.2074,0.2075,13.4428,0.0106,1.0,1.8796,26130,13902
2,3.9684,4.430523,0.235,0.0552,0.2088,0.2088,13.4916,0.0123,1.0,1.8676,25964,13902
3,3.9917,4.39955,0.2387,0.0572,0.2116,0.2117,13.5168,0.0125,1.0,1.8715,26018,13902
4,3.9798,4.381341,0.2382,0.0569,0.2107,0.2108,13.376,0.0126,1.0,1.8497,25714,13902
5,3.9315,4.388689,0.2383,0.0569,0.2118,0.2117,13.3916,0.0126,1.0,1.8456,25657,13902


In [15]:
# NOTE: this overrides the RUN_BASE_TRAINING value set in the settings cell,
# so every guarded cell executed after this one will run.
RUN_BASE_TRAINING = True
if RUN_BASE_TRAINING:
    # Jupyter only echoes a bare expression when it is the last top-level
    # statement of the cell; nested inside `if` it is discarded, so print it.
    print(train_df.cluster.value_counts())

In [16]:
if RUN_BASE_TRAINING:
    # Start from the cached epoch-set test results.
    with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_epoch_set.pickle', 'rb') as fin:
        test_result_df = pickle.load(fin)

    # Use the same column name as the cluster-set results.
    test_result_df = test_result_df.rename(columns={"epoch_set": "model_set"})

    # Add one cluster-specific model per configured cluster set; each call
    # receives the running frame and returns the updated one.
    for cluster_set in experiment_config["CLUSTER_SET_ID"]:
        test_result_df = test_cluster_set(
            experiment_config=experiment_config,
            train_df=train_df,
            test_df=test_df,
            test_data=test_data,
            tokenizer=tokenizer,
            results_df=test_result_df,
            cluster_id=cluster_set,
        )

    ########## SAVE THE FILE

    with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_results.pickle', 'wb') as fout:
        pickle.dump(test_result_df, fout, protocol=pickle.HIGHEST_PROTOCOL)

Cluster [0] training size (1352, 14)
TRAINING CLUSTER SET [0] FOR EPOCHS3


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.7559,5.023318,0.1751,0.0211,0.1637,0.164,7.0764,0.0066,0.8776,0.8845,12297,13902
2,4.9565,4.650017,0.2337,0.0416,0.2134,0.2134,7.5856,0.0113,0.962,0.9627,13383,13902
3,4.6605,4.589846,0.2415,0.0433,0.2182,0.2182,7.6796,0.013,0.9735,0.9738,13538,13902


Cluster [3] training size (1043, 14)
TRAINING CLUSTER SET [3] FOR EPOCHS3
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.7093,5.459018,0.135,0.0193,0.1246,0.1244,12.8796,0.0044,1.0,1.8484,25696,13902
2,4.9903,5.223158,0.1746,0.029,0.1582,0.1582,16.1288,0.0067,1.0,2.361,32823,13902
3,4.6829,5.188588,0.1812,0.0329,0.1635,0.1634,16.4324,0.0072,1.0,2.4167,33597,13902


Cluster [0, 3] training size (2395, 14)
TRAINING CLUSTER SET [0, 3] FOR EPOCHS3


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.4378,4.814584,0.1876,0.0325,0.1714,0.1713,13.472,0.0079,1.0,1.9236,26742,13902
2,4.7791,4.6435,0.2076,0.0427,0.1847,0.1846,14.6884,0.0101,1.0,2.1192,29461,13902
3,4.6175,4.605558,0.2122,0.0429,0.1895,0.1894,14.4496,0.0096,1.0,2.0877,29023,13902


In [17]:
# Reload the per-model test results.
with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_results.pickle', 'rb') as fin:
    test_result_df = pickle.load(fin)

### ENSEMBLE COMPUTE
test_result_df = ensemble_compute(
    optimal_ensemble_map=optimal_ensemble_map,
    test_result_df=test_result_df,
)

########## ROUGE PER SETTING

rouge_by_model = test_result_df.groupby("model_set")["rouge"]

print("Mean")
print(rouge_by_model.mean())

print("STD")
print(rouge_by_model.std())

# How often each base model is recorded in `opt_es_id`.
print(test_result_df.opt_es_id.value_counts())

with open(f'reports/results/{experiment_config["ANALYSIS_POSTFIX"]}/test_results_full.pickle', 'wb') as fout:
    pickle.dump(test_result_df, fout, protocol=pickle.HIGHEST_PROTOCOL)

Mean
model_set
0                 0.118588
1                 0.218737
2                 0.228100
5                 0.235283
10                0.238665
cluster_[0, 3]    0.214620
cluster_[0]       0.241818
cluster_[3]       0.182781
ensemble          0.241889
Name: rouge, dtype: float64
STD
model_set
0                 0.148489
1                 0.159593
2                 0.164356
5                 0.165821
10                0.167403
cluster_[0, 3]    0.157261
cluster_[0]       0.186646
cluster_[3]       0.134504
ensemble          0.184731
Name: rouge, dtype: float64
opt_es_id
cluster_[0]    19782
10              2457
5                261
Name: count, dtype: int64


In [32]:
# Keep only the rows whose model is the one recorded in `opt_es_id`, then
# count how often each model appears.
is_selected_model = test_result_df["model_set"] == test_result_df["opt_es_id"]
print(test_result_df.loc[is_selected_model].opt_es_id.value_counts())

opt_es_id
cluster_[0]    2198
10              273
5                29
Name: count, dtype: int64
