In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl

# Hugging Face
from transformers import (
    AutoTokenizer, 
    AutoConfig,
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding
)

from tokenizers import AddedToken

# PyTorch
import torch

from datasets import Dataset
# scikit-learn
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold,train_test_split

from typing import Tuple,List
import gc
import json
import os

import wandb
import time

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

2024-05-22 09:09:33.171945: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-22 09:09:33.172074: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-22 09:09:33.268979: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
print("===== Initialize the Environment for Training =====")
WANDB_PROJECT_NAME = "Kaggle_AES_HPS"
# Unix timestamp (whole seconds) taken when this cell runs; used as the run identifier.
RUN_NAME = int(time.time())
#! pip install optuna
#! pip install ray[tune]
#! pip install wandb --upgrade

# Weights & Biases settings are passed through environment variables.
os.environ["WANDB_PROJECT"] = WANDB_PROJECT_NAME
os.environ["WANDB_WATCH"] = "false"
# WANDB_LOG_MODEL used to be assigned twice ("true", then "checkpoint"); only the
# last assignment ever took effect, so the dead first one has been removed.
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
# wandb.login()
print(f"===== Everything is Settled,WandB Run Name :{RUN_NAME} =====")

===== Initialize the Environment for Training =====
===== Everything is Settled,WandB Run Name :1716368981 =====


In [3]:
class Tokenize(object):
    """Turn a train table and a validation table into tokenized datasets.

    The instance stores the two tables, the tokenizer and the truncation
    length; calling the instance performs the conversion and tokenization.
    Both tables must be indexable by the column names ``essay_id``,
    ``full_text`` and ``label``.
    """

    def __init__(self, train, valid, tokenizer,max_length):
        """Remember the inputs; no work is done until the instance is called."""
        self.train, self.valid = train, valid
        self.tokenizer = tokenizer
        self.max_length = max_length

    def get_dataset(self, df):
        """Copy the three essay columns of ``df`` into a new ``Dataset``."""
        columns = {}
        for name in ('essay_id', 'full_text', 'label'):
            columns[name] = list(df[name])
        return Dataset.from_dict(columns)

    def tokenize_function(self, example):
        """Tokenize ``example['full_text']``, truncating to ``self.max_length``."""
        return self.tokenizer(
            example['full_text'],
            truncation=True,
            max_length=self.max_length,
        )

    def __call__(self):
        """Return ``(tokenized_train, tokenized_valid, tokenizer)``."""
        # Build both raw datasets first, then tokenize them in the same order.
        raw_train, raw_valid = (
            self.get_dataset(frame) for frame in (self.train, self.valid)
        )
        tokenized_train, tokenized_valid = (
            dataset.map(self.tokenize_function, batched=True)
            for dataset in (raw_train, raw_valid)
        )
        return tokenized_train, tokenized_valid, self.tokenizer


def fold_df(path:str="/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv", 
        n_splits:int = 5, 
        size:int = None,
        label_col_name: str = "score",
        random_state: int = 42) ->pd.DataFrame:
    """Load the training CSV and assign every row to a stratified CV fold.

    Args:
        path: CSV file to read.
        n_splits: number of stratified folds.
        size: optional number of rows to keep. When given, a stratified
            subsample of that size is drawn first. A falsy value, or a value
            that is not smaller than the table, keeps every row.
        label_col_name: column holding the score; used both for the ``label``
            column and for stratification.
        random_state: seed for the subsample and for the fold shuffling.

    Returns:
        The DataFrame with two extra columns: ``label`` (score minus one) and
        ``fold`` (integer fold id in ``0 .. n_splits - 1``).
    """
    data = pd.read_csv(path)
    # Shift the score down by one to get the label.
    data['label'] = data[label_col_name] - 1

    # train_test_split raises when asked to keep the whole table, so only
    # subsample when the request is actually smaller than the data.
    if size and size < len(data):
        data, _ = train_test_split(data, train_size=size, stratify=data[label_col_name], random_state=random_state)
        data = data.reset_index(drop=True)

    # Fill the fold ids positionally in an integer array. Assigning through
    # ``.loc`` into a not-yet-existing column produced a float column and
    # relied on index labels matching positions.
    fold_ids = np.full(len(data), -1, dtype=int)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold, (_, val_index) in enumerate(skf.split(data, data[label_col_name])):
        fold_ids[val_index] = fold
    data["fold"] = fold_ids
    return data

def compute_metrics(eval_pred):
    """Score an evaluation pass with quadratic weighted kappa.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    is the argmax over the last axis of ``predictions``.

    Returns:
        dict with the single key ``'qwk'``.
    """
    logits, references = eval_pred
    predicted_classes = logits.argmax(-1)
    return {
        'qwk': cohen_kappa_score(references, predicted_classes, weights='quadratic'),
    }

In [4]:
def search_hp(data,
              hp_space,
              backend,
              opti_n_trials,
             run_name):
    """Run one hyperparameter search per cross-validation fold.

    For every fold id ``0 .. n_folds - 1`` the rows of that fold form the
    validation set and all remaining rows the training set. A ``Trainer`` is
    built for the fold and ``Trainer.hyperparameter_search`` is run on it.

    Args:
        data: DataFrame with ``essay_id``, ``full_text``, ``label`` and
            ``fold`` columns.
        hp_space: search-space callable, forwarded to
            ``Trainer.hyperparameter_search``.
        backend: search backend name, forwarded unchanged.
        opti_n_trials: number of trials to run for each fold.
        run_name: prefix of the per-fold run name (``f"{run_name}_{fold}"``).

    Returns:
        List with the best hyperparameters found for each fold, in fold order.

    Side effects:
        Writes ``best_params_fold_{fold}.json`` for every fold and
        ``qwk_scores.json`` into ``/kaggle/working/output``.
    """
    model_path = '/kaggle/input/huggingfacedebertav3variants/deberta-v3-base'
    token_max_len = 1024
    num_labels = 6
    output_dir = "/kaggle/working/output"

    # The JSON results are written here; do not rely on the Trainer having
    # created the directory.
    os.makedirs(output_dir, exist_ok=True)

    best_params_list = []
    qwk_scores = []

    # The tokenizer does not depend on the fold, so load and extend it once.
    # NOTE(review): tokens are added to the tokenizer but the model embeddings
    # are never resized here - confirm the checkpoint's embedding matrix
    # already covers the new token ids.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.add_tokens([AddedToken("\n", normalized=False)])
    tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def init_model():
        # Passed as model_init so that a model can be created from the
        # checkpoint on demand instead of being shared.
        return AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)

    for fold in range(len(data['fold'].unique())):

        train = data[data['fold'] != fold].copy()
        valid = data[data['fold'] == fold].copy()

        tokenized_train, tokenized_valid, _ = Tokenize(train, valid, tokenizer, token_max_len)()

        # Baseline arguments for the fold.
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            save_strategy="no",
            learning_rate=2e-5,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=8,
            num_train_epochs=1,
            weight_decay=0.01,
            load_best_model_at_end=False,
            metric_for_best_model="qwk",
            push_to_hub=False,
            optim="adamw_torch",
            report_to="wandb",  # enable logging to W&B
            run_name=f"{run_name}_{fold}",  # name of the W&B run (optional)
            logging_steps=1,
        )

        trainer = Trainer(
            model=None,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_valid,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer,
            model_init=init_model,
            data_collator=data_collator,
        )

        # Pin the search objective to the QWK metric explicitly so that
        # best_run.objective is exactly the QWK of the winning trial.
        best_run = trainer.hyperparameter_search(
            n_trials=opti_n_trials,
            direction="maximize",
            backend=backend,
            hp_space=hp_space,
            compute_objective=lambda metrics: metrics["eval_qwk"],
        )
        best_params_list.append(best_run.hyperparameters)

        # Record the score that belongs to the reported best hyperparameters.
        # Evaluating the trainer after the search would score whatever model
        # the final trial left behind, which need not be the best trial.
        qwk_scores.append(best_run.objective)

        with open(os.path.join(output_dir, f"best_params_fold_{fold}.json"), "w") as f:
            json.dump(best_run.hyperparameters, f, indent=4)

        # Release the fold's trainer and datasets before the next fold starts.
        del trainer, tokenized_train, tokenized_valid
        torch.cuda.empty_cache()
        gc.collect()

    # Save the QWK scores
    with open(os.path.join(output_dir, "qwk_scores.json"), "w") as f:
        json.dump(qwk_scores, f, indent=4)

    return best_params_list

In [5]:
# W&B sweep definition: random search that maximizes the mean cross-validated
# QWK logged under the key "eval_qwk".
sweep_configuration = {
    'method': 'random',
    'metric': {
        'name': 'eval_qwk',
        'goal': 'maximize',
    },
    'parameters': {
        'learning_rate': {
            'distribution': 'log_uniform_values',
            'min': 1e-6,
            'max': 1e-4,
        },
        'num_train_epochs': {
            #'values': [5, 8, 9, 10,12,15,20]
            # NOTE(review): this distribution yields fractional epoch counts -
            # confirm that is intended.
            'distribution': 'log_uniform_values',
            'min': 4,
            'max': 20,
        },
        'per_device_train_batch_size': {
            'values': [1, 2, 3],
        },
        'per_device_eval_batch_size': {
            'values': [1, 2, 3, 8],
        },
        'warmup_steps': {
            # Warm-up is a whole number of optimizer steps, so sample integers
            # rather than from the continuous `uniform` distribution.
            'distribution': 'int_uniform',
            'min': 0,
            'max': 500,
        },
        'weight_decay': {
            'distribution': 'uniform',
            'min': 0.0,
            'max': 0.3,
        },
        'adam_beta1': {
            'distribution': 'log_uniform_values',
            'min': 0.85,
            'max': 0.95,
        },
        'adam_beta2': {
            'distribution': 'log_uniform_values',
            'min': 0.98,
            'max': 0.999,
        },
        'adam_epsilon': {
            'distribution': 'log_uniform_values',
            'min': 1e-8,
            'max': 1e-6,
        },
        'max_grad_norm': {
            'distribution': 'uniform',
            'min': 0.0,
            'max': 1.0,
        },
        'lr_scheduler_type': {
            'values': ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"],
        },
    },
}

# 

In [6]:
def wandb_search_hp():
    """Run one sweep trial: train on every fold and log the mean metrics.

    Takes no arguments and returns nothing. It reads the module-level
    ``k_fold_df`` (table with a ``fold`` column), ``range_k`` (fold ids to
    iterate) and ``RUN_NAME``, and increments the module-level counter ``i``.
    Hyperparameters are read from ``wandb.config`` after ``wandb.init()``.

    For each fold a model is trained on the other folds and evaluated on the
    held-out fold. The mean ``eval_loss`` and mean ``eval_qwk`` over all folds
    are logged with ``wandb.log`` under the keys ``eval_loss`` / ``eval_qwk``.
    """
    # Only `i` is rebound here; the other module-level names are just read.
    global i
    i += 1
    print(f"========= wandb_search_hp:{i} started =========")
    wandb.init()
    config = wandb.config

    model_path = '/kaggle/input/huggingfacedebertav3variants/deberta-v3-base'
    token_max_len = 1024
    num_labels = 6
    output_dir = "/kaggle/working/output"

    qwk_scores = []
    eval_losses = []

    # The tokenizer does not depend on the fold, so load and extend it once.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.add_tokens([AddedToken("\n", normalized=False)])
    tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def init_model():
        # Fresh model from the checkpoint for every fold.
        return AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)

    for fold in range_k:

        print(f"========= wandb_search_hp:{i},fold{fold} =========")

        train = k_fold_df[k_fold_df['fold'] != fold].copy()
        valid = k_fold_df[k_fold_df['fold'] == fold].copy()

        tokenized_train, tokenized_valid, _ = Tokenize(train, valid, tokenizer, token_max_len)()

        training_args = TrainingArguments(
            output_dir=output_dir,
            logging_dir="/kaggle/working/logs",
            evaluation_strategy="epoch",
            save_strategy="no",  # Change to "epoch" if needed
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.per_device_train_batch_size,
            per_device_eval_batch_size=config.per_device_eval_batch_size,
            num_train_epochs=config.num_train_epochs,
            weight_decay=config.weight_decay,
            lr_scheduler_type=config.lr_scheduler_type,
            max_grad_norm=config.max_grad_norm,
            adam_epsilon=config.adam_epsilon,
            adam_beta1=config.adam_beta1,
            adam_beta2=config.adam_beta2,
            # A continuous sweep distribution can hand back a float; the
            # warm-up length is a whole number of steps.
            warmup_steps=int(config.warmup_steps),
            load_best_model_at_end=False,
            metric_for_best_model="qwk",
            push_to_hub=False,
            optim="adamw_torch",
            report_to="wandb",  # enable logging to W&B
            run_name=f"{RUN_NAME}_{fold}",  # name of the W&B run (optional)
            logging_steps=1,
        )

        trainer = Trainer(
            model_init=init_model,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_valid,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer,
            data_collator=data_collator,
        )

        trainer.train()

        # Evaluate the model on the validation set and get QWK score
        metrics = trainer.evaluate(eval_dataset=tokenized_valid)
        print(f"\n\n ====metrics:{metrics}\n\n=====")
        qwk_scores.append(metrics['eval_qwk'])
        eval_losses.append(metrics["eval_loss"])

        # Release the fold's trainer and datasets before the next fold starts.
        del trainer, tokenized_train, tokenized_valid
        torch.cuda.empty_cache()
        gc.collect()

    # Average over the folds; "eval_qwk" is the key the sweep metric refers to.
    wandb.log({
        "eval_loss": np.mean(eval_losses),
        "eval_qwk": np.mean(qwk_scores),
    })
    print(f"========= wandb_search_hp:{i} has finished ========= ")

In [7]:
# sweep_id = wandb.sweep(sweep=sweep_configuration, project=WANDB_PROJECT_NAME)

# Cross-validation setup: number of folds, the fold ids to iterate over, and
# how many rows to subsample from the training CSV.
k = 5
range_k = range(k)
size = 500

# Sweep bookkeeping: `i` counts the trials started by wandb_search_hp and
# `agent_trail_count` is the number of trials the agent is asked to run.
i = 0
agent_trail_count = 1

# Subsampled table with a `fold` column, shared by every sweep trial.
k_fold_df: pd.DataFrame = fold_df(size=size, n_splits=k)
# wandb.agent(sweep_id, wandb_search_hp, count=agent_trail_count) 

In [8]:
# Show the sweep-trial counter `i`; each wandb_search_hp call increments it by one.
print(i)

0


In [9]:
# Remove the output left over from previous training runs.
! rm -rf /kaggle/working/output
# Check the disk usage of the working folder to avoid running out of disk space during training.
! du -h --max-depth=1

28K	.
