### 1. Settings

In [1]:
#####################################
##########  DEPENDENCIES ############
#####################################

import os
import pickle
from tqdm import tqdm # type: ignore
from datetime import date

import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer
from utils.sampling import create_splits, prep_cv_validation
from utils.training import cv_cluster_set, cv_training_epochs_sets, test_cluster_set
from utils.training import results_dict_todf, cv_step_2, full_step_2, test_training_epochs_sets
from utils.inference import meta_predict, create_ensemble_map

# Register tqdm's pandas integration.
tqdm.pandas()
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Process-level switches. They are set here, before `torch` is imported further down.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE: the captured training log reports WANDB_DISABLED as deprecated (removal in v5)
# and recommends `--report_to none` instead.
os.environ["WANDB_DISABLED"] = "true"
# Hard-coded to device index 1 -- machine-specific; adjust when running on another host.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
#####################################
############  CONSTANTS #############
#####################################
# Presumably the random seed shared by the helpers -- TODO confirm.
RS = 42

# Model label plus batch / sequence-length settings forwarded via FULL_TRAIN_ARGS.
MODEL = "CodeT5"
BATCH_SIZE = 16
DECODER_LENGTH = 15
ENCODER_LENGTH = 30
# NOTE: an int, not a string; wrap in str() wherever a textual date is needed.
DATE_STR = 20240721
model_name = "Salesforce/codet5-base-multi-sum"

# Training configuration handed to the project's training helpers.
FULL_TRAIN_ARGS = dict(
    BATCH_SIZE=BATCH_SIZE,
    DECODER_LENGTH=DECODER_LENGTH,
    ENCODER_LENGTH=ENCODER_LENGTH,
    MODEL=MODEL,
    SEQ_TRAINER_ARGS=dict(
        overwrite_output_dir=True,
        # A list (0..9) rather than a single int; the captured log shows
        # "TRAINING EPOCH SET 0" .. "9", so the helpers presumably iterate over it.
        num_train_epochs=[*range(10)],
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=6e-6,
        warmup_steps=500,
        weight_decay=0.1,
        label_smoothing_factor=0.1,
        predict_with_generate=True,
        logging_steps=100,
        save_total_limit=1,
        save_strategy="no",
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=False,
        output_dir='reports/results',
        logging_dir="reports/logs",
    ),
)

# Load the tokenizer for `model_name`.
# NOTE(review): `skip_special_tokens` is normally a decode-time argument; confirm it has
# any effect when passed to `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
# NOTE(review): the captured training log prints "LOADING MODEL ..." for every epoch set,
# so the helpers appear to load their own copy -- confirm this instance is still needed.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Use CUDA when available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ROUGE metric object from the `evaluate` package.
rouge = evaluate.load('rouge')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. CoNaLa data: preprocessing

In [2]:
# Central experiment configuration passed to the sampling / training / inference helpers.
#
# ANALYSIS_POSTFIX is embedded in every artifact file name under reports/results/.
# It defaults to today's date, exactly as before. Because that made artifacts written on
# one day unreachable when the notebook was resumed on another (FileNotFoundError on
# load), the date can now be pinned by exporting ANALYSIS_RUN_DATE (e.g. "2024-08-09")
# before starting the kernel.
run_date = os.environ.get("ANALYSIS_RUN_DATE") or str(date.today())

experiment_config = {
    # Reuse the constants from the settings cell instead of repeating their literals.
    "DATE_STR": str(DATE_STR),
    "RS": RS,
    "DRIFT_TYPE": "sudden",
    "NFOLD": 3,
    "FULL_TRAIN_ARGS": FULL_TRAIN_ARGS,
    "MODEL_NAME": model_name,
    "CLUSTER_EPOCHS": 2,
}
experiment_config["ANALYSIS_POSTFIX"] = f"mined_{experiment_config['DRIFT_TYPE']}_{run_date}"

In [3]:
# Build the train/test datasets with test=False, then derive the CV folds from the
# training part. The helpers print split sizes and fold ids (see the cell output).
sampling_dict = create_splits(
    tokenizer=tokenizer,
    experiment_config=experiment_config,
    test=False,
)
train_dataset, test_data, test_df = (
    sampling_dict[key] for key in ("train_data", "test_data", "test_df")
)

splits, questions_list = prep_cv_validation(
    experiment_config=experiment_config,
    train_dataset=train_dataset,
)

Train Data:  (7942, 11)
Test Data:  (2058, 11)
Train Data: Cluster cluster
2    3632
3    2204
1    1672
0     414
4      20
Name: count, dtype: int64
Test Data: Cluster cluster
4    1980
3      39
2      25
1      12
0       2
Name: count, dtype: int64


Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Map:   0%|          | 0/2058 [00:00<?, ? examples/s]

Fold 0
Fold 1
Fold 2


In [4]:
# Train every epoch set on every CV fold. The result is kept as `fold_results` and is
# extended by the cluster runs in the following cell.
fold_results = cv_training_epochs_sets(
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    questions_list=questions_list,
    splits=splits,
    experiment_config=experiment_config,
)

Fold 0


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5298 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5298 [00:00<?, ? examples/s]

Map:   0%|          | 0/5298 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Map:   0%|          | 0/2644 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1817,4.698771,0.2487,0.0616,0.2212,0.2213,14.1649,0.0215,1.0,1.1625,29444,25328


TRAINING EPOCH SET 2
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6863,4.596178,0.2591,0.0658,0.229,0.229,14.1532,0.0215,1.0,1.1576,29319,25328


TRAINING EPOCH SET 3
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.4871,4.561218,0.2602,0.0671,0.2295,0.2295,14.1978,0.0208,1.0,1.1583,29337,25328


TRAINING EPOCH SET 4
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.3097,4.553477,0.2606,0.0662,0.2285,0.2286,14.1392,0.0202,1.0,1.1513,29161,25328


TRAINING EPOCH SET 5
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.1325,4.565162,0.2601,0.0662,0.2288,0.2289,14.0866,0.02,1.0,1.1452,29005,25328


TRAINING EPOCH SET 6
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9463,4.592522,0.2558,0.0647,0.2251,0.2251,14.0064,0.0209,1.0,1.1361,28775,25328


TRAINING EPOCH SET 7
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7502,4.633582,0.2531,0.0631,0.2223,0.2223,13.8975,0.0196,1.0,1.1251,28497,25328


TRAINING EPOCH SET 8
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.5517,4.686911,0.2489,0.0608,0.2189,0.2189,13.7398,0.0183,1.0,1.1085,28075,25328


TRAINING EPOCH SET 9
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.3598,4.749295,0.2448,0.0585,0.2146,0.2147,13.6384,0.018,1.0,1.0993,27844,25328


Fold 1


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5295 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5295 [00:00<?, ? examples/s]

Map:   0%|          | 0/5295 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Map:   0%|          | 0/2647 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1708,4.710202,0.2492,0.061,0.2217,0.2219,14.0385,0.0195,1.0,1.1426,29242,25592


TRAINING EPOCH SET 2
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.677,4.602981,0.2689,0.0687,0.2358,0.2358,13.9158,0.0237,1.0,1.1267,28834,25592


TRAINING EPOCH SET 3
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.4786,4.567392,0.2731,0.0708,0.2386,0.2387,13.9203,0.0253,1.0,1.1243,28774,25592


TRAINING EPOCH SET 4
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.3009,4.560123,0.2722,0.0708,0.2375,0.2374,13.8451,0.025,1.0,1.1146,28525,25592


TRAINING EPOCH SET 5
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.122,4.572438,0.2693,0.0686,0.2346,0.2346,13.847,0.0237,1.0,1.1129,28481,25592


TRAINING EPOCH SET 6
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.9348,4.59957,0.2649,0.0669,0.231,0.2311,13.8425,0.0232,1.0,1.1104,28418,25592


TRAINING EPOCH SET 7
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.7388,4.642591,0.2614,0.0651,0.2283,0.2284,13.6351,0.022,1.0,1.091,27920,25592


TRAINING EPOCH SET 8
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.5395,4.693892,0.258,0.063,0.2253,0.2255,13.5164,0.0206,1.0,1.0789,27610,25592


TRAINING EPOCH SET 9
TRAINING EPOCHS 1
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,3.3475,4.754014,0.252,0.0589,0.2193,0.2192,13.326,0.0203,1.0,1.0625,27192,25592


Fold 2


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Map:   0%|          | 0/2651 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.18,4.713183,0.2403,0.0617,0.2159,0.216,14.017,0.0202,1.0,1.1579,29271,25280


TRAINING EPOCH SET 2
TRAINING EPOCHS 1


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Add one cluster-specific run per cluster id, feeding the accumulated results back in
# on every iteration.
for cluster_idx in (1, 4, 3):
    fold_results = cv_cluster_set(
        cluster_id=cluster_idx,
        fold_results=fold_results,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        questions_list=questions_list,
        splits=splits,
        experiment_config=experiment_config,
    )

# Convert the accumulated results into a DataFrame.
cv_df = results_dict_todf(fold_results)

########## SAVE THE FILE

cv_result_path = f'reports/results/cv_result_{experiment_config["ANALYSIS_POSTFIX"]}.pickle'
with open(cv_result_path, 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Spot check: mean ROUGE of the first entry stored under "cluster_1".
fold_results["cluster_1"][0]["rouge"].mean()

In [None]:

# List the entries collected in `fold_results` so far.
fold_results.keys()

In [None]:
# ROUGE mean and standard deviation per `model_set` in the CV results.
rouge_by_model_set = cv_df.groupby("model_set")["rouge"]

print("Mean")
print(rouge_by_model_set.mean())

print("STD")
print(rouge_by_model_set.std())

In [3]:
########## LOAD CV RESULTS
# The file name embeds ANALYSIS_POSTFIX, which carries the date of the current run; the
# captured FileNotFoundError shows this load failing when no file was saved under that date.
# pickle.load executes arbitrary code from the file -- only open artifacts you produced.

import pickle

cv_result_path = f'reports/results/cv_result_{experiment_config["ANALYSIS_POSTFIX"]}.pickle'
with open(cv_result_path, 'rb') as handle:
    cv_df = pickle.load(handle)

FileNotFoundError: [Errno 2] No such file or directory: 'reports/results/cv_result_mined_sudden_2024-08-10.pickle'

### Step 2. Learn performance

In [None]:
# Step 2 on the CV frame: rebinds `cv_df` and also yields the step-2 model results.
cv_df, model_results = cv_step_2(cv_df=cv_df, experiment_config=experiment_config)

# Persist the model results first, then the CV frame, under the current ANALYSIS_POSTFIX.
model_results_path = f'reports/results/s2_model_results_{experiment_config["ANALYSIS_POSTFIX"]}.pickle'
cv_with_s2_path = f'reports/results/cd_df_with_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle'
for target_path, payload in ((model_results_path, model_results), (cv_with_s2_path, cv_df)):
    with open(target_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:

# ROUGE mean and standard deviation per `model_set` after step 2.
rouge_by_model_set = cv_df.groupby("model_set")["rouge"]

print("Mean")
print(rouge_by_model_set.mean())

print("STD")
print(rouge_by_model_set.std())

### Reload the CV frame that was saved with the step-2 outputs.
### (Original note on this step: "TO SAVE THE VECTORIZER AND STEP 2 MODELS".)

cv_with_s2_path = f'reports/results/cd_df_with_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle'
with open(cv_with_s2_path, 'rb') as handle:
    cv_df = pickle.load(handle)

In [None]:
# Run the full step-2 routine on the reloaded CV frame.
full_step_2(
    experiment_config=experiment_config,
    cv_df=cv_df,
)

# TEST

In [None]:
# Rebuild the datasets with test=True; this rebinds `train_dataset`, `test_data`,
# `test_df`, `splits` and `questions_list` from the CV phase.
sampling_dict = create_splits(
    tokenizer=tokenizer,
    experiment_config=experiment_config,
    test=True,
)
train_dataset, test_data, test_df = (
    sampling_dict[key] for key in ("train_data", "test_data", "test_df")
)

splits, questions_list = prep_cv_validation(
    experiment_config=experiment_config,
    train_dataset=train_dataset,
)

In [None]:
# Load the CV frame that was saved together with the step-2 outputs.
cv_with_s2_path = f"reports/results/cd_df_with_s2_{experiment_config['ANALYSIS_POSTFIX']}.pickle"
with open(cv_with_s2_path, "rb") as handle:
    cv_resutls = pickle.load(handle)

# Distinct `model_set` labels. The last one is popped (and displayed as the cell output)
# so that it is not passed on as a base model name.
# NOTE(review): this relies on the label to drop being the last one returned by unique().
base_models_list = list(cv_resutls["model_set"].unique())
base_models_list.pop(-1)

In [None]:
# Step-2 predictions on the test frame for every base model, using the "svm" and
# "catboost" step-2 model types.
meta_preds_df = meta_predict(
    t_models=["svm", "catboost"],
    base_models_names=base_models_list,
    test_df=test_df,
    experiment_config=experiment_config,
)

########## SAVE THE FILE

meta_preds_path = f'reports/results/test_results_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle'
with open(meta_preds_path, 'wb') as handle:
    pickle.dump(meta_preds_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the step-2 test predictions saved under the current ANALYSIS_POSTFIX.
meta_preds_path = f'reports/results/test_results_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle'
with open(meta_preds_path, 'rb') as handle:
    meta_preds_df = pickle.load(handle)

In [3]:
# Pinned to the artifact written on 2024-08-09, independent of ANALYSIS_POSTFIX.
# Rebinds `meta_preds_df`, replacing whatever an earlier cell loaded.
pinned_meta_preds_path = 'reports/results/test_results_s2_mined_sudden_2024-08-09.pickle'
with open(pinned_meta_preds_path, 'rb') as handle:
    meta_preds_df = pickle.load(handle)

In [22]:
# Average SVM step-2 prediction per model set.
meta_preds_df.groupby("model_set")["svm_preds"].mean()

model_set
0            0.121347
1            0.253740
5            0.280409
8            0.276581
cluster_1    0.161217
cluster_3    0.159681
cluster_4    0.157979
Name: svm_preds, dtype: float64

In [19]:
# Derive the ensemble map (and its accompanying estimate) from the "svm" step-2 predictions.
optimal_ensemble_map, ensemble_val_estim = create_ensemble_map(
    t_model_name="svm",
    meta_preds_df=meta_preds_df,
)

In [6]:
# Train and evaluate the epoch sets on the test split.
# Needs `test_df`, `test_data` and `train_dataset` from the test-split cell; the captured
# NameError shows this cell being run in a kernel where that cell had not been executed.
test_result_df = test_training_epochs_sets(
    tokenizer=tokenizer,
    train_data=train_dataset,
    test_data=test_data,
    test_df=test_df,
    experiment_config=experiment_config,
)

NameError: name 'test_df' is not defined

In [None]:
# Inspect which columns the test results carry.
test_result_df.columns

In [None]:
# NOTE(review): `test_result_dict` is only assigned in the cluster-loop cell further down,
# so this raises NameError on a top-to-bottom run -- move this cell below that loop or
# confirm which object was meant to be inspected here.
test_result_dict.model_set.unique()

In [None]:
# Add one cluster-specific run per cluster id to the test results.
#
# The running result is threaded through the loop, mirroring the CV loop, which feeds
# `fold_results` back into `cv_cluster_set`. Previously every iteration was handed the
# original `test_result_df` and its return value overwrote `test_result_dict`, so unless
# the helper mutated its input in place, only the last cluster's output survived.
test_result_dict = test_result_df
for cluster_idx in [1, 4, 3]:
    test_result_dict = test_cluster_set(
        experiment_config=experiment_config,
        test_df=test_df,
        test_data=test_data,
        tokenizer=tokenizer,
        results=test_result_dict,
        cluster_id=cluster_idx,
    )

# Convert the accumulated results into a DataFrame (rebinds `test_result_df`).
test_result_df = results_dict_todf(test_result_dict)

########## SAVE THE FILE

with open(f'reports/results/test_results_df_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
### ENSEMBLE COMPUTE
# Applies the ensemble map built by `create_ensemble_map` to the test results and rebinds
# `test_result_df` to the outcome.
# NOTE(review): `ensemble_compute` is not imported in the visible import cell; if it lives
# in one of the `utils` modules it must be imported (or module-qualified) first -- confirm.
test_result_df = ensemble_compute(test_result_df=test_result_df,
                                  optimal_ensemble_map=optimal_ensemble_map)

In [None]:
########## ROUGE PER SETTING
# ROUGE mean and standard deviation per `epoch_set` in the test results.

rouge_by_epoch_set = test_result_df.groupby("epoch_set")["rouge"]

print("Mean")
print(rouge_by_epoch_set.mean())

print("STD")
print(rouge_by_epoch_set.std())

In [None]:
# Number of rows per `opt_es_id` value.
test_result_df["opt_es_id"].value_counts()