### 1. Settings

In [40]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
from tqdm import tqdm # type: ignore
from datetime import date

import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer
from utils.sampling import create_splits, prep_cv_validation
from utils.training import cv_cluster_set, cv_training_epochs_sets, test_cluster_set
from utils.training import results_dict_todf, cv_step_2, full_step_2, test_training_epochs_sets
from utils.inference import meta_predict, create_ensemble_map

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
BATCH_SIZE = 16
DECODER_LENGTH = 20
ENCODER_LENGTH = 30
DATE_STR = 20240721
model_name="Salesforce/codet5-base-multi-sum"

FULL_TRAIN_ARGS = {
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": [0, 1, 5, 8],
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 6e-6,
        "warmup_steps": 500,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
        "output_dir" : 'reports/results',
        "logging_dir" : "reports/logs",
    },
}

tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

### 2. Conala data. Preprocessing. 

In [41]:
experiment_config = {
    "DATE_STR" : "20240721",
    "RS" : 42,
    "DRIFT_TYPE" : "sudden",
    "NFOLD" : 3,
    "FULL_TRAIN_ARGS" : FULL_TRAIN_ARGS,
    "MODEL_NAME" : model_name,
    "CLUSTER_EPOCHS" : 2,
}
experiment_config["ANALYSIS_POSTFIX"] = f"mined_{experiment_config['DRIFT_TYPE']}_{str(date.today())}"

In [3]:
sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, test=False)
train_dataset, test_data, test_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"]

splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
                            experiment_config=experiment_config)

Train Data:  (7942, 11)
Test Data:  (2058, 11)
Train Data: Cluster cluster
2    3632
3    2204
1    1672
0     414
4      20
Name: count, dtype: int64
Test Data: Cluster cluster
4    1980
3      39
2      25
1      12
0       2
Name: count, dtype: int64


Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Map:   0%|          | 0/2058 [00:00<?, ? examples/s]

Fold 0
Fold 1
Fold 2


In [4]:
fold_results = cv_training_epochs_sets(experiment_config=experiment_config,
                            splits=splits,
                            questions_list=questions_list,
                            train_dataset=train_dataset,
                            tokenizer=tokenizer)

Fold 0


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5298 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5298 [00:00<?, ? examples/s]

Map:   0%|          | 0/5298 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Map:   0%|          | 0/2644 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1817,4.698771,0.2487,0.0616,0.2212,0.2213,14.1649,0.0215,1.0,1.1625,29444,25328


TRAINING EPOCH SET 5
TRAINING EPOCHS 4
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6798,4.561677,0.2599,0.0666,0.2298,0.2299,14.1528,0.0207,1.0,1.1572,29309,25328
2,4.5816,4.505205,0.265,0.0697,0.2336,0.2335,14.0564,0.0225,1.0,1.1441,28979,25328
3,4.4846,4.484755,0.2674,0.0715,0.2352,0.2352,14.0061,0.0233,1.0,1.1344,28732,25328
4,4.4337,4.477675,0.2687,0.0728,0.2366,0.2367,14.09,0.0243,1.0,1.1442,28981,25328


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2429,4.479573,0.2658,0.0693,0.2327,0.2327,14.1483,0.0232,1.0,1.1477,29068,25328
2,4.2185,4.470181,0.2703,0.0724,0.2367,0.2366,14.0344,0.0233,1.0,1.1356,28762,25328
3,4.2043,4.4659,0.2692,0.0716,0.2354,0.2353,14.166,0.0227,1.0,1.1473,29058,25328


Fold 1


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5295 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5295 [00:00<?, ? examples/s]

Map:   0%|          | 0/5295 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Map:   0%|          | 0/2647 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.1708,4.710202,0.2492,0.061,0.2217,0.2219,14.0385,0.0195,1.0,1.1426,29242,25592


TRAINING EPOCH SET 5
TRAINING EPOCHS 4


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6706,4.571948,0.2701,0.0716,0.2375,0.2374,14.0831,0.0246,1.0,1.1401,29178,25592
2,4.571,4.509912,0.2721,0.0733,0.2382,0.2383,13.9698,0.0262,1.0,1.1286,28884,25592
3,4.4792,4.48725,0.2734,0.0733,0.2394,0.2394,14.1073,0.0253,1.0,1.138,29124,25592
4,4.4222,4.483542,0.2755,0.0734,0.2405,0.2407,14.071,0.0256,1.0,1.1332,29000,25592


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2315,4.491459,0.2759,0.0734,0.2404,0.2406,14.1711,0.0264,1.0,1.1404,29184,25592
2,4.2073,4.473559,0.2752,0.0727,0.2402,0.24,14.0608,0.0266,1.0,1.1286,28884,25592
3,4.1976,4.472107,0.2762,0.0726,0.2411,0.2411,14.0257,0.026,1.0,1.1232,28746,25592


Fold 2


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Map:   0%|          | 0/2651 [00:00<?, ? examples/s]

TRAINING EPOCH SET 0
TRAINING EPOCHS 0


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda
TRAINING EPOCH SET 1
TRAINING EPOCHS 1
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.18,4.713183,0.2403,0.0617,0.2159,0.216,14.017,0.0202,1.0,1.1579,29271,25280


TRAINING EPOCH SET 5
TRAINING EPOCHS 4


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL ./tmp/
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.6788,4.577013,0.2608,0.0722,0.232,0.232,14.0219,0.0245,1.0,1.1485,29033,25280
2,4.5745,4.520506,0.2694,0.0738,0.238,0.2381,13.9415,0.0249,1.0,1.1382,28773,25280
3,4.4696,4.501167,0.2737,0.0753,0.2418,0.2417,14.143,0.0252,1.0,1.1571,29252,25280
4,4.4227,4.495306,0.2736,0.0751,0.2413,0.2414,14.0932,0.0245,1.0,1.1517,29114,25280


TRAINING EPOCH SET 8
TRAINING EPOCHS 3
LOADING MODEL ./tmp/
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.2411,4.499412,0.2682,0.0735,0.2372,0.2372,14.0034,0.0238,1.0,1.1372,28748,25280
2,4.2089,4.486858,0.271,0.0739,0.2392,0.2392,13.9694,0.0247,1.0,1.1362,28722,25280
3,4.1886,4.485295,0.2703,0.074,0.2382,0.2383,14.0645,0.0245,1.0,1.1451,28949,25280


In [5]:
for cluster_idx in [1, 4, 3]:
    fold_results = cv_cluster_set(experiment_config=experiment_config,
                                    splits=splits,
                                    questions_list=questions_list,
                                    train_dataset=train_dataset,
                                    tokenizer=tokenizer,
                                    fold_results=fold_results,
                                    cluster_id=cluster_idx)

cv_df = results_dict_todf(fold_results)

########## SAVE THE FILE

with open(f'reports/results/cv_result_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Fold 0


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Map:   0%|          | 0/2644 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 1 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.098,4.670476,0.2452,0.0594,0.2182,0.2181,12.9414,0.0207,1.0,1.0515,26632,25328
2,4.6446,4.617533,0.2573,0.065,0.2268,0.2271,13.2186,0.0223,1.0,1.0674,27034,25328


Fold 1


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Map:   0%|          | 0/2647 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 1 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Map:   0%|          | 0/2651 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 1 FOR EPOCHS2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda
Fold 0


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Map:   0%|          | 0/2644 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 4 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.0322,4.704625,0.2307,0.053,0.2045,0.2044,12.177,0.0182,0.9934,0.9934,25162,25328
2,4.5596,4.655423,0.2406,0.059,0.2133,0.2133,12.5794,0.019,1.0,1.0201,25837,25328


Fold 1


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Map:   0%|          | 0/2647 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 4 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 2


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Map:   0%|          | 0/2651 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 4 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Fold 0


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2644 [00:00<?, ? examples/s]

Map:   0%|          | 0/2644 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 3 FOR EPOCHS2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.0843,4.672104,0.2482,0.0621,0.2194,0.2196,13.5352,0.0209,1.0,1.1056,28003,25328
2,4.6218,4.619489,0.2546,0.0643,0.2242,0.2245,13.5299,0.0219,1.0,1.1085,28075,25328


Fold 1


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2647 [00:00<?, ? examples/s]

Map:   0%|          | 0/2647 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 3 FOR EPOCHS2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda
Fold 2


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5291 [00:00<?, ? examples/s]

Map:   0%|          | 0/5291 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2651 [00:00<?, ? examples/s]

Map:   0%|          | 0/2651 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 3 FOR EPOCHS2


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


NameError: name 'ANALYSIS_POSTFIX' is not defined

In [18]:
fold_results['cluster_1'][0]['rouge'].mean()

0.2592078728820197

In [7]:

fold_results.keys()

dict_keys([0, 1, 5, 8, 'cluster_1', 'cluster_4', 'cluster_3'])

In [8]:
print("Mean")
print(cv_df.groupby(["model_set"])["rouge"].mean())

print("STD")
print(cv_df.groupby("model_set")["rouge"].std())

Mean
model_set
0            0.113737
1            0.247452
5            0.273484
8            0.273170
cluster_1    0.162001
cluster_3    0.160304
cluster_4    0.156176
Name: rouge, dtype: float64
STD
model_set
0            0.124526
1            0.157347
5            0.158682
8            0.158973
cluster_1    0.153707
cluster_3    0.151417
cluster_4    0.149950
Name: rouge, dtype: float64


In [20]:
########## LOAD CV RESULTS

import pickle
with open(f'reports/results/cv_result_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

### Step 2. Learn performance

In [21]:
cv_df, model_results = cv_step_2(experiment_config=experiment_config, cv_df=cv_df)

with open(f'reports/results/s2_model_results_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(model_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/cd_df_with_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)


0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.262464 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14595
[LightGBM] [Info] Number of data points in the train set: 37086, number of used features: 1062
[LightGBM] [Info] Start training from score 0.178388
catboost
Learning rate set to 0.072463
0:	learn: 0.1572665	total: 83.2ms	remaining: 1m 23s
1:	learn: 0.1550724	total: 95.6ms	remaining: 47.7s
2:	learn: 0.1531492	total: 105ms	remaining: 35s
3:	learn: 0.1514378	total: 114ms	remaining: 28.3s
4:	learn: 0.1499549	total: 123ms	remaining: 24.4s
5:	learn: 0.1486640	total: 130ms	remaining: 21.5s
6:	learn: 0.1475208	total: 139ms	remaining: 19.7s
7:	learn: 0.1464843	total: 147ms	remaining: 18.2s
8:	learn: 0.1456481	total: 155ms	remaining: 17s
9:	learn: 0.1449033	total: 162ms	remaining: 16.1s
10:	learn: 0.1442216	total: 170ms	remai

In [None]:

print("Mean")
print(cv_df.groupby(["model_set"])["rouge"].mean())

print("STD")
print(cv_df.groupby("model_set")["rouge"].std())

### TO SAVE THE VECTORIZER AND STEP 2 MODELS

with open(f'reports/results/cd_df_with_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

In [23]:
full_step_2(cv_df=cv_df, 
            experiment_config=experiment_config)

lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21184
[LightGBM] [Info] Number of data points in the train set: 55594, number of used features: 1420
[LightGBM] [Info] Start training from score 0.198046
catboost
Learning rate set to 0.077249
0:	learn: 0.1609318	total: 23.3ms	remaining: 23.3s
1:	learn: 0.1594933	total: 37.1ms	remaining: 18.5s
2:	learn: 0.1582267	total: 48.6ms	remaining: 16.1s
3:	learn: 0.1571694	total: 59.6ms	remaining: 14.8s
4:	learn: 0.1562638	total: 71.6ms	remaining: 14.3s
5:	learn: 0.1554548	total: 81.3ms	remaining: 13.5s
6:	learn: 0.1547782	total: 91.5ms	remaining: 13s
7:	learn: 0.1541655	total: 103ms	remaining: 12.8s
8:	learn: 0.1536466	total: 114ms	remaining: 12.5s
9:	learn: 0.1532312	total: 124ms	remaining: 12.2s
10:	learn: 0.1528646	total: 135ms	r

# TEST

In [24]:
sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, test=True)
train_dataset, test_data, test_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"]

splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
                            experiment_config=experiment_config)

Train Data:  (7942, 11)
Test Data:  (2058, 11)
Train Data: Cluster cluster
2    3632
3    2204
1    1672
0     414
4      20
Name: count, dtype: int64
Test Data: Cluster cluster
4    1980
3      39
2      25
1      12
0       2
Name: count, dtype: int64


Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Map:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Map:   0%|          | 0/2058 [00:00<?, ? examples/s]

Fold 0
Fold 1
Fold 2


In [26]:
with open(f"reports/results/cd_df_with_s2_{experiment_config['ANALYSIS_POSTFIX']}.pickle", "rb") as handle:
    cv_resutls = pickle.load(handle)

base_models_list = list(cv_resutls.model_set.unique())
base_models_list.pop(-1)

'ensemble'

In [29]:
meta_preds_df = meta_predict(experiment_config=experiment_config, 
                    test_df=test_df,
                    base_models_names=base_models_list,
                    t_models=["svm", "catboost"])

########## SAVE THE FILE

with open(f'reports/results/test_results_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(meta_preds_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

svm
catboost


In [42]:
with open(f'reports/results/test_results_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'rb') as handle:
    meta_preds_df = pickle.load(handle)

FileNotFoundError: [Errno 2] No such file or directory: 'reports/results/test_results_s2_mined_sudden_2024-08-10.pickle'

In [62]:
with open(f'reports/results/test_results_s2_mined_sudden_2024-08-09.pickle', 'rb') as handle:
    meta_preds_df = pickle.load(handle)

In [63]:
meta_preds_df.groupby("model_set").svm_preds.mean()

model_set
0            0.121347
1            0.253740
5            0.280409
8            0.276581
cluster_1    0.161217
cluster_3    0.159681
cluster_4    0.157979
Name: svm_preds, dtype: float64

In [64]:
optimal_ensemble_map = create_ensemble_map(meta_preds_df=meta_preds_df, 
                                           t_model_name="catboost")

In [68]:
test_result_df = test_training_epochs_sets(experiment_config=experiment_config,
                            test_df=test_df,
                            test_data=test_data,
                            train_data=train_dataset,
                            tokenizer=tokenizer)

TRAINING EPOCH SET 0
TRAINING EPOCHS 0
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


TRAINING EPOCH SET 1
TRAINING EPOCHS 1
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,5.0402,4.602952,0.2643,0.0652,0.2301,0.2304,14.4606,0.0195,1.0,1.1878,23603,19871


OutOfMemoryError: CUDA out of memory. Tried to allocate 182.00 MiB. GPU 0 has a total capacity of 44.46 GiB of which 36.19 MiB is free. Process 1549668 has 4.78 GiB memory in use. Including non-PyTorch memory, this process has 36.85 GiB memory in use. Process 1463273 has 2.78 GiB memory in use. Of the allocated memory 32.09 GiB is allocated by PyTorch, and 4.44 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [61]:
test_result_df.columns

Index(['question_id', 'parent_answer_post_id', 'prob', 'input_sequence',
       'output_sequence', 'id', 'snippet_len', 'intent_len', 'snippet_token_n',
       'intent_token_n', 'cluster', 'input_ids', 'attention_mask', 'labels',
       'prediction', 'rouge', 'epoch_set', 'cluster_1'],
      dtype='object')

In [59]:
test_result_dict.model_set.unique()

AttributeError: 'DataFrame' object has no attribute 'model_set'

In [55]:
for cluster_idx in [1, 4, 3]:
    test_result_dict = test_cluster_set(experiment_config=experiment_config,
                                    test_df=test_df,
                                    test_data=test_data,
                                    tokenizer=tokenizer,
                                    results=test_result_df,
                                    cluster_id=cluster_idx)

test_result_df = results_dict_todf(test_result_dict)

########## SAVE THE FILE

with open(f'reports/results/test_results_df_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7942 [00:00<?, ? examples/s]

Map:   0%|          | 0/7942 [00:00<?, ? examples/s]

TRAINING CLUSTER SET 1 FOR EPOCHS2
LOADING MODEL Salesforce/codet5-base-multi-sum
cuda


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len,Bleu,Brevity Penalty,Length Ratio,Translation Length,Reference Length
1,4.9517,4.590503,0.2565,0.0635,0.2251,0.2254,14.5272,0.0213,1.0,1.1973,23792,19871
2,4.5697,4.547361,0.263,0.0671,0.2315,0.2318,14.38,0.023,1.0,1.1786,23420,19871


ValueError: Columns must be same length as key

In [None]:
### ENSEMBLE COMPUTE
test_result_df = ensemble_compute(test_result_df=test_result_df,
                                  optimal_ensemble_map=optimal_ensemble_map)

In [None]:
########## ROUGE PER SETTING

print("Mean")
print(test_result_df.groupby("epoch_set")["rouge"].mean())

print("STD")
print(test_result_df.groupby("epoch_set")["rouge"].std())

In [None]:
test_result_df.opt_es_id.value_counts()