### 1. Settings

In [2]:
#####################################
##########  DEPENDECIES ############
#####################################

import os
import pickle
from tqdm import tqdm # type: ignore
from datetime import date

import evaluate

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

import utils.prep as pr
import utils.eval as ev
import utils.inference as infer
from utils.sampling import create_splits, prep_cv_validation
from utils.training import cv_cluster_set, cv_training_epochs_sets, test_cluster_set
from utils.training import results_dict_todf, cv_step_2, full_step_2, test_training_epochs_sets
from utils.inference import meta_predict, create_ensemble_map, ensemble_compute

tqdm.pandas()
import warnings
warnings.filterwarnings("ignore")

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
#####################################
############  CONSTANTS #############
#####################################
RS = 42

MODEL = "CodeT5"
BATCH_SIZE = 16
DECODER_LENGTH = 15
ENCODER_LENGTH = 15
DATE_STR = 20240721
model_name="Salesforce/codet5-base-multi-sum"

FULL_TRAIN_ARGS = {
    "BATCH_SIZE": BATCH_SIZE,
    "DECODER_LENGTH": DECODER_LENGTH,
    "ENCODER_LENGTH": ENCODER_LENGTH,
    "MODEL": MODEL,
    "SEQ_TRAINER_ARGS": {
        "overwrite_output_dir": True,
        "num_train_epochs": list(range(2)),
        "do_train": True,
        "do_eval": True,
        "per_device_train_batch_size": 4,
        "per_device_eval_batch_size": 4,
        "learning_rate": 6e-6,
        "warmup_steps": 500,
        "weight_decay": 0.1,
        "label_smoothing_factor": 0.1,
        "predict_with_generate": True,
        "logging_steps": 100,
        "save_total_limit": 1,
        "save_strategy": "no",
        "logging_strategy": "epoch",
        "evaluation_strategy": "epoch",
        "load_best_model_at_end": False,
        "output_dir" : 'reports/results',
        "logging_dir" : "reports/logs",
    },
}

tokenizer = AutoTokenizer.from_pretrained(model_name, skip_special_tokens=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rouge = evaluate.load('rouge')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/RDC/zinovyee.hub/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 2. Conala data. Preprocessing. 

In [3]:
experiment_config = {
    "DATE_STR" : "20240721",
    "RS" : 42,
    "DRIFT_TYPE" : "sudden",
    "NFOLD" : 3,
    "FULL_TRAIN_ARGS" : FULL_TRAIN_ARGS,
    "MODEL_NAME" : model_name,
    "CLUSTER_EPOCHS" : 2,
}
experiment_config["ANALYSIS_POSTFIX"] = f"mined_{experiment_config['DRIFT_TYPE']}_{str(date.today())}"

In [None]:
sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, test=False)
train_dataset, test_data, test_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"]

splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
                            experiment_config=experiment_config)

In [None]:
fold_results = cv_training_epochs_sets(experiment_config=experiment_config,
                            splits=splits,
                            questions_list=questions_list,
                            train_dataset=train_dataset,
                            tokenizer=tokenizer)

In [None]:
with open(f'reports/results/foldresult_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(fold_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
#with open(f'reports/results/foldresult_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'rb') as handle:
#    fold_results = pickle.load(handle)

with open(f'reports/results/foldresult_mined_sudden_2024-08-10.pickle', 'rb') as handle:
    fold_results = pickle.load(handle)

In [None]:
for cluster_idx in [1, 4, 3]:
    fold_results = cv_cluster_set(experiment_config=experiment_config,
                                    splits=splits,
                                    questions_list=questions_list,
                                    train_dataset=train_dataset,
                                    tokenizer=tokenizer,
                                    fold_results=fold_results,
                                    cluster_id=cluster_idx)

cv_df = results_dict_todf(fold_results)

########## SAVE THE FILE

with open(f'reports/results/cv_result_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
fold_results['cluster_1'][0]['rouge'].mean()

In [None]:
cv_df.groupby("model_set").rouge.mean()

In [None]:
fold_results.keys()

In [None]:
print("Mean")
print(cv_df.groupby(["model_set"])["rouge"].mean())

print("STD")
print(cv_df.groupby("model_set")["rouge"].std())

In [None]:
########## LOAD CV RESULTS

import pickle
with open(f'reports/results/cv_result_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

### Step 2. Learn performance

In [None]:
cv_df, model_results = cv_step_2(experiment_config=experiment_config, cv_df=cv_df)

with open(f'reports/results/s2_model_results_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(model_results, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(f'reports/results/cd_df_with_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(cv_df, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [5]:

print("Mean")
print(cv_df.groupby(["model_set"])["rouge"].mean())

print("STD")
print(cv_df.groupby("model_set")["rouge"].std())

### TO SAVE THE VECTORIZER AND STEP 2 MODELS

with open(f'reports/results/cd_df_with_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'rb') as handle:
    cv_df = pickle.load(handle)

FileNotFoundError: [Errno 2] No such file or directory: 'reports/results/cd_df_with_s2_mined_sudden_2024-08-15.pickle'

In [None]:
full_step_2(cv_df=cv_df, 
            experiment_config=experiment_config)

# TEST

In [None]:
sampling_dict = create_splits(experiment_config=experiment_config, tokenizer=tokenizer, test=True)
train_dataset, test_data, test_df = sampling_dict["train_data"], sampling_dict["test_data"], sampling_dict["test_df"]

splits, questions_list = prep_cv_validation(train_dataset=train_dataset, 
                            experiment_config=experiment_config)

In [None]:
with open(f"reports/results/cd_df_with_s2_{experiment_config['ANALYSIS_POSTFIX']}.pickle", "rb") as handle:
    cv_resutls = pickle.load(handle)

base_models_list = list(cv_resutls.model_set.unique())
base_models_list.pop(-1)

In [None]:
meta_preds_df = meta_predict(experiment_config=experiment_config, 
                    test_df=test_df,
                    base_models_names=base_models_list,
                    t_models=["svm", "catboost"])

########## SAVE THE FILE

with open(f'reports/results/test_results_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(meta_preds_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
with open(f'reports/results/test_results_s2_mined_sudden_2024-08-11.pickle', 'rb') as handle:
    meta_preds_df = pickle.load(handle)

with open(f'reports/results/test_results_s2_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'rb') as handle:
    meta_preds_df = pickle.load(handle)

In [None]:
meta_preds_df.groupby("model_set").svm_preds.mean()

In [None]:
optimal_ensemble_map, ensemble_val_estim = create_ensemble_map(meta_preds_df=meta_preds_df, 
                                                                t_model_name="svm")

In [None]:
test_result_df = test_training_epochs_sets(experiment_config=experiment_config,
                            test_df=test_df,
                            test_data=test_data,
                            train_data=train_dataset,
                            tokenizer=tokenizer)

In [None]:
test_result_df.columns

In [None]:
with open(f'reports/results/test_results_df_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
test_result_df.columns

In [None]:
#with open(f'reports/results/test_results_df_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'rb') as handle:
#    test_result_df = pickle.load(handle)

with open(f'reports/results/test_results_df_mined_sudden_2024-08-11.pickle', 'rb') as handle:
    test_result_df = pickle.load(handle)

test_result_df = test_result_df.rename(columns={"epoch_set": "model_set"})

for cluster_idx in [1, 4, 3]:
    test_result_dict = test_cluster_set(experiment_config=experiment_config,
                                    test_df=test_df,
                                    test_data=test_data,
                                    tokenizer=tokenizer,
                                    results_df=test_result_df,
                                    cluster_id=cluster_idx)

#test_result_df = results_dict_todf(test_result_dict)

########## SAVE THE FILE

with open(f'reports/results/test_results_df_{experiment_config["ANALYSIS_POSTFIX"]}.pickle', 'wb') as handle:
    pickle.dump(test_result_df, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
### ENSEMBLE COMPUTE
test_result_df = ensemble_compute(test_result_df=test_result_df,
                                  optimal_ensemble_map=optimal_ensemble_map)

In [None]:
########## ROUGE PER SETTING

print("Mean")
print(test_result_df.groupby("model_set")["rouge"].mean())

print("STD")
print(test_result_df.groupby("model_set")["rouge"].std())

In [None]:
test_result_df.opt_es_id.value_counts()