### 1. Settings

In [18]:
#####################################
##########  DEPENDENCIES ############
#####################################

import os
import pickle
from tqdm import tqdm # type: ignore
from datetime import date

import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer
from utils.sampling import create_splits, prep_cv_validation
from utils.training import cv_cluster_set, cv_training_epochs_sets
from utils.training import results_dict_todf, cv_step_2, full_step_2

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
#####################################
############  CONSTANTS #############
#####################################
# --- Reproducibility ---
RS = 42  # presumably the random seed ("random state"); verify against the utils helpers

# --- Model identity ---
MODEL = "CodeT5"                                  # short label stored in FULL_TRAIN_ARGS
model_name = "Salesforce/codet5-base-multi-sum"   # checkpoint id given to from_pretrained

# --- Batch size and sequence lengths forwarded through FULL_TRAIN_ARGS ---
BATCH_SIZE = 16
ENCODER_LENGTH = 30
DECODER_LENGTH = 20

# --- Run labelling ---
# NOTE(review): the postfix embeds today's date and is used to build the pickle
# paths in the save/load cells, so a file written on one day is not found when
# the notebook is re-run on another day.
ANALYSIS_POSTFIX = f"mined_no_drift_{date.today()}"
# NOTE(review): stored as an int here, while experiment_config carries this
# value as a string.
DATE_STR = 20240721
# NOTE(review): the postfix says "no_drift" while this flag is True; confirm
# which one is intended.
SEMANTIC_DRIFT = True

# Training configuration bundle placed into experiment_config["FULL_TRAIN_ARGS"].
FULL_TRAIN_ARGS = dict(
    BATCH_SIZE=BATCH_SIZE,
    DECODER_LENGTH=DECODER_LENGTH,
    ENCODER_LENGTH=ENCODER_LENGTH,
    MODEL=MODEL,
    SEQ_TRAINER_ARGS=dict(
        overwrite_output_dir=True,
        # A list rather than a single int; the CV results printed later are
        # grouped by an `epoch_set` with exactly these values.
        num_train_epochs=[0, 1, 5, 8, 10, 16],
        do_train=True,
        do_eval=True,
        # NOTE(review): per-device batch size is 4 while BATCH_SIZE above is 16;
        # confirm which one the training helpers actually use.
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=6e-6,
        warmup_steps=500,
        weight_decay=0.1,
        label_smoothing_factor=0.1,
        predict_with_generate=True,
        logging_steps=100,
        save_total_limit=1,
        save_strategy="no",
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=False,
        output_dir="reports/results",
        logging_dir="reports/logs",
    ),
)

# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
# NOTE(review): `device` is only computed here; none of the cells visible in
# this notebook move `model` onto it.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# ROUGE metric object obtained through the `evaluate` library.
rouge = evaluate.load('rouge')

# Tokenizer and seq2seq model for the checkpoint named by `model_name`.
# NOTE(review): `skip_special_tokens` is normally a decode-time argument;
# confirm it has any effect when passed to from_pretrained.
tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### 2. CoNaLa data. Preprocessing.

In [2]:
# Experiment-wide configuration passed to the sampling, cross-validation and
# step-2 helpers in the cells below.
# The seed and the date stamp are derived from the constants of the settings
# cell instead of being re-typed, so the two copies cannot drift apart.
experiment_config = {
    "DATE_STR": str(DATE_STR),  # the constant is an int; this dict carries it as a string
    "RS": RS,
    "DRIFT_TYPE": "sudden",
    "NFOLD": 3,
    "FULL_TRAIN_ARGS": FULL_TRAIN_ARGS,
    "MODEL_NAME": model_name,
    "ANALYSIS_POSTFIX": ANALYSIS_POSTFIX,
    "CLUSTER_EPOCHS": 1,
}

In [3]:
# Build the train/test split and pull the pieces used below out of the
# returned dictionary.
sampling_dict = create_splits(
    experiment_config=experiment_config,
    tokenizer=tokenizer,
    test=False,
)
train_dataset = sampling_dict["train_data"]
test_data = sampling_dict["test_data"]
test_df = sampling_dict["test_df"]

# Prepare the cross-validation inputs from the training portion only.
splits, questions_list = prep_cv_validation(
    train_dataset=train_dataset,
    experiment_config=experiment_config,
)

Train Data:  (7942, 11)
Test Data:  (2058, 11)
Train Data: Cluster cluster
2    3632
3    2204
1    1672
0     414
4      20
Name: count, dtype: int64
Test Data: Cluster cluster
4    1980
3      39
2      25
1      12
0       2
Name: count, dtype: int64


Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2058 [00:00<?, ? examples/s]

Map:   0%|          | 0/2058 [00:00<?, ? examples/s]

Fold 0
Fold 1
Fold 2


In [None]:
# Cross-validated training runs; the returned `fold_results` is passed on to
# (and extended by) the cluster runs in the next cell.
# Presumably one run per entry of num_train_epochs; confirm in utils.training.
fold_results = cv_training_epochs_sets(
    experiment_config=experiment_config,
    splits=splits,
    questions_list=questions_list,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

In [None]:
# Add cluster-specific CV runs on top of the results collected so far; each
# call receives the running `fold_results` and returns the updated one.
# NOTE(review): the cluster counts printed by create_splits list ids 0-4 only;
# confirm that 5 is a valid `cluster_id` for cv_cluster_set.
for cluster_idx in [1, 4, 5]:
    fold_results = cv_cluster_set(experiment_config=experiment_config,
                                    splits=splits,
                                    questions_list=questions_list,
                                    train_dataset=train_dataset,
                                    tokenizer=tokenizer,
                                    fold_results=fold_results,
                                    cluster_id=cluster_idx)

# Turn the accumulated results into a DataFrame for analysis.
cv_df = results_dict_todf(fold_results)

########## SAVE THE FILE

# Create the output folder first so that a missing directory cannot discard
# the results of the CV runs at the very last step.
os.makedirs('reports/results', exist_ok=True)
with open(f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# Mean and standard deviation of ROUGE per `model_set`.
# NOTE(review): the output saved in this notebook is indexed by `epoch_set`,
# not `model_set`; confirm the column exists at this point of a top-to-bottom run.
rouge_by_model_set = cv_df.groupby("model_set")["rouge"]

print("Mean")
print(rouge_by_model_set.mean())

print("STD")
print(rouge_by_model_set.std())

In [4]:
########## LOAD CV RESULTS

import pickle

# Restore the CV results frame written by the save cell.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(f'reports/results/cv_result_{ANALYSIS_POSTFIX}.pickle', 'rb') as cv_file:
    cv_df = pickle.load(cv_file)

Mean
epoch_set
0     0.177599
1     0.364850
5     0.382583
8     0.381317
10    0.380107
16    0.378237
Name: rouge, dtype: float64
STD
epoch_set
0     0.155300
1     0.175505
5     0.171856
8     0.171687
10    0.171634
16    0.172638
Name: rouge, dtype: float64


### Step 2. Learn performance

In [6]:
# Step 2: cv_step_2 returns an updated CV frame together with the results of
# the second-stage models; `cv_df` is rebound to the updated frame.
cv_df, model_results = cv_step_2(cv_df=cv_df, experiment_config=experiment_config)

# Persist the step-2 model results.
with open(f'reports/results/s2_model_results_{ANALYSIS_POSTFIX}.pickle', 'wb') as results_file:
    pickle.dump(model_results, results_file, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the updated CV frame.
with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'wb') as frame_file:
    pickle.dump(cv_df, frame_file, protocol=pickle.HIGHEST_PROTOCOL)

# Mean and standard deviation of ROUGE per `model_set` after step 2.
rouge_by_model_set = cv_df.groupby("model_set")["rouge"]

print("Mean")
print(rouge_by_model_set.mean())

print("STD")
print(rouge_by_model_set.std())

0
lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 6.621671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14574
[LightGBM] [Info] Number of data points in the train set: 31806, number of used features: 836
[LightGBM] [Info] Start training from score 0.343661
catboost
Learning rate set to 0.070725
0:	learn: 0.1810157	total: 72.9ms	remaining: 1m 12s
1:	learn: 0.1790317	total: 82.4ms	remaining: 41.1s
2:	learn: 0.1773187	total: 92.5ms	remaining: 30.8s
3:	learn: 0.1757246	total: 99.9ms	remaining: 24.9s
4:	learn: 0.1743863	total: 106ms	remaining: 21.2s
5:	learn: 0.1732114	total: 112ms	remaining: 18.6s
6:	learn: 0.1721594	total: 118ms	remaining: 16.7s
7:	learn: 0.1712010	total: 123ms	remaining: 15.3s
8:	learn: 0.1703818	total: 129ms	remaining: 14.2s
9:	learn: 0.1696222	total: 134ms	remaining: 13.2s
10:	learn: 0.1689747	total: 139ms	

In [10]:
### TO SAVE THE VECTORIZER AND STEP 2 MODELS

# Reload the CV frame that was saved together with the step-2 results.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(f'reports/results/cd_df_with_s2_{ANALYSIS_POSTFIX}.pickle', 'rb') as frame_file:
    cv_df = pickle.load(frame_file)

lr
svm
lgbm
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 8.601641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 21483
[LightGBM] [Info] Number of data points in the train set: 47646, number of used features: 1092
[LightGBM] [Info] Start training from score 0.344116
catboost
Learning rate set to 0.075389
0:	learn: 0.1831371	total: 16.1ms	remaining: 16.1s
1:	learn: 0.1810476	total: 27.5ms	remaining: 13.7s
2:	learn: 0.1792138	total: 36.4ms	remaining: 12.1s
3:	learn: 0.1776325	total: 44.9ms	remaining: 11.2s
4:	learn: 0.1762803	total: 53.1ms	remaining: 10.6s
5:	learn: 0.1750655	total: 60.8ms	remaining: 10.1s
6:	learn: 0.1740201	total: 68.9ms	remaining: 9.78s
7:	learn: 0.1730877	total: 76.4ms	remaining: 9.47s
8:	learn: 0.1722715	total: 83.2ms	remaining: 9.16s
9:	learn: 0.1715376	total: 89.9ms	remaining: 8.9s
10:	learn: 0.1709201	total: 97.

In [None]:
# Run the full step-2 stage on the reloaded CV frame.
# Presumably this is what fits and saves the vectorizer and step-2 models; see utils.training.
full_step_2(
    cv_df=cv_df,
    experiment_config=experiment_config,
)