In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import polars as pl

# Hugging Face
from transformers import (
    AutoTokenizer, 
    AutoConfig,
    AutoModelForSequenceClassification, 
    Trainer, 
    TrainingArguments, 
    DataCollatorWithPadding
)

from tokenizers import AddedToken

# PyTorch
import torch

from datasets import Dataset
# scikit-learn
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold,train_test_split

from typing import Tuple,List
import gc
import json
import os

import wandb

2024-05-16 10:02:48.995495: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-16 10:02:48.995593: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-16 10:02:49.124203: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
print("===== Initialize the Environment for Training =====")
! pip install optuna
! pip install ray[tune]
! pip install wandb --upgrade
os.environ["WANDB_PROJECT"]="Kaggle_AES"
os.environ["WANDB_LOG_MODEL"]="true"
os.environ["WANDB_WATCH"]="false"
print("===== Everything is Settled =====")


===== Initialize the Environment for Training =====
Collecting wandb
  Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading wandb-0.17.0-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wandb
  Attempting uninstall: wandb
    Found existing installation: wandb 0.16.6
    Uninstalling wandb-0.16.6:
      Successfully uninstalled wandb-0.16.6
Successfully installed wandb-0.17.0
===== Everything is Settled =====


In [3]:
class Tokenize:
    """Convert a train/validation pair of frames into tokenized datasets.

    Calling an instance returns ``(tokenized_train, tokenized_valid, tokenizer)``.
    """

    def __init__(self, train, valid, tokenizer,max_length):
        self.train, self.valid = train, valid
        self.tokenizer = tokenizer
        self.max_length = max_length

    def get_dataset(self, df):
        """Build a ``Dataset`` from the id, text and label columns of ``df``."""
        columns = {
            name: list(df[name])
            for name in ('essay_id', 'full_text', 'label')
        }
        return Dataset.from_dict(columns)

    def tokenize_function(self, example):
        """Run the tokenizer on ``example['full_text']`` with truncation at ``max_length``."""
        return self.tokenizer(
            example['full_text'],
            truncation=True,
            max_length=self.max_length,
        )

    def __call__(self):
        """Tokenize both splits and return them together with the tokenizer."""
        raw_train = self.get_dataset(self.train)
        raw_valid = self.get_dataset(self.valid)

        # `batched=True` hands the tokenizer whole batches of rows at a time.
        encoded_train = raw_train.map(self.tokenize_function, batched=True)
        encoded_valid = raw_valid.map(self.tokenize_function, batched=True)

        return encoded_train, encoded_valid, self.tokenizer

In [4]:
 def fold_df(
     path: str = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv",
     n_splits: int = 5,
     size: int = None,
     label_col_name: str = "score",
     random_state: int = 42,
 ) -> pd.DataFrame:
     """Load the training CSV and assign every row to a stratified CV fold.

     Args:
         path: CSV file to read.
         n_splits: Number of stratified folds.
         size: Optional number of rows to keep. When given (and smaller than
             the data), a subsample of that size, stratified on the label
             column, is drawn. ``None``/``0`` or a value not smaller than the
             number of rows keeps every row.
         label_col_name: Column holding the score; used for stratification and
             as the source of the ``label`` column.
         random_state: Seed shared by the subsampling and the fold shuffling.

     Returns:
         The frame with two extra columns: ``label`` (score minus one) and
         ``fold`` (integer id of the fold in which the row is a validation row).
     """
     data = pd.read_csv(path)
     # Shift the scores down by one to obtain the label column.
     data['label'] = data[label_col_name] - 1

     # `train_test_split` raises when asked for the whole data set (or more),
     # so only subsample when `size` is a strict reduction.
     if size and size < len(data):
         data, _ = train_test_split(
             data,
             train_size=size,
             stratify=data[label_col_name],
             random_state=random_state,
         )
         data = data.reset_index(drop=True)

     skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
     # `split` yields positional indices, so fill a positional integer array
     # rather than writing through `.loc` (which depends on the index labels
     # and produces a float column initialised with NaN).
     fold_ids = np.full(len(data), -1, dtype=int)
     for i, (_, val_index) in enumerate(skf.split(data, data[label_col_name])):
         fold_ids[val_index] = i
     data["fold"] = fold_ids
     return data
    
# Number of essays kept for the hyper-parameter search; set to None to use
# the full training set.
SUBSAMPLE_SIZE = 1000
data: pd.DataFrame = fold_df(size=SUBSAMPLE_SIZE)
# Every row must carry a fold id before the frame is split fold by fold.
assert data["fold"].notna().all(), "fold_df left rows without a fold assignment"
# data[['fold','label']].groupby(['fold','label',]).size().reset_index(name="Count")

In [5]:
def compute_metrics(eval_pred):
    """Score predictions with quadratic weighted kappa.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    is the arg-max of ``predictions`` along the last axis. Returns a dict with
    the single key ``'qwk'``.
    """
    scores, targets = eval_pred
    predicted_classes = scores.argmax(-1)
    return {
        'qwk': cohen_kappa_score(targets, predicted_classes, weights='quadratic'),
    }

def optuna_hp_space(trial):
    """Sample one hyper-parameter configuration from ``trial``.

    Returns a dict mapping each hyper-parameter name to the value suggested
    by the trial; the names are sampled in the order they appear below.
    """
    batch_size_choices = [1,2,3]
    scheduler_choices = ["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"]

    space = {}

    # Learning-rate, schedule length and batch size.
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 1, 10)
    space["per_device_train_batch_size"] = trial.suggest_categorical("per_device_train_batch_size", batch_size_choices)
    space["warmup_steps"] = trial.suggest_int("warmup_steps", 0, 500)

    # Regularisation and Adam settings.
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.3)
    space["adam_beta1"] = trial.suggest_float("adam_beta1", 0.85, 0.95)
    space["adam_beta2"] = trial.suggest_float("adam_beta2", 0.98, 0.999)
    space["adam_epsilon"] = trial.suggest_float("adam_epsilon", 1e-8, 1e-6, log=True)
    space["max_grad_norm"] = trial.suggest_float("max_grad_norm", 0.0, 1.0)

    # Learning-rate scheduler family.
    space["lr_scheduler_type"] = trial.suggest_categorical("lr_scheduler_type", scheduler_choices)

    return space

def search_hp(
    model_path: str = '/kaggle/input/huggingfacedebertav3variants/deberta-v3-base',
    token_max_len: int = 1024,
    num_labels: int = 6,
    output_dir: str = "/kaggle/working/output",
    optuna_n_trials: int = 10,
):
    """Run an Optuna hyper-parameter search on every cross-validation fold.

    Reads the module-level ``data`` frame and, for each value of its ``fold``
    column, searches on the remaining folds while evaluating on that fold.

    Side effects:
        * writes ``best_params_fold_<fold>.json`` into ``output_dir`` per fold;
        * writes ``qwk_scores.json`` (one score per fold) into ``output_dir``.

    Args:
        model_path: Checkpoint used for both the tokenizer and the model.
        token_max_len: Truncation length passed to the tokenizer.
        num_labels: Number of classes of the classification head.
        output_dir: Directory for the trainer output and the JSON reports.
        optuna_n_trials: Number of Optuna trials per fold.

    Returns:
        List with the best hyper-parameter dict of each fold, in fold order.
    """
    # The JSON reports are written with plain `open`, so make sure the
    # directory exists regardless of what the trainer creates.
    os.makedirs(output_dir, exist_ok=True)

    # The tokenizer and collator are identical for every fold: build them once.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.add_tokens([AddedToken("\n", normalized=False)])
    tokenizer.add_tokens([AddedToken(" "*2, normalized=False)])
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def init_model():
        """Create a fresh model for a trial."""
        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
        # Tokens were added to the tokenizer above; grow the embedding matrix
        # only if the new ids would not fit into it.
        if len(tokenizer) > model.get_input_embeddings().num_embeddings:
            model.resize_token_embeddings(len(tokenizer))
        return model

    best_params_list = []
    qwk_scores = []
    for fold in range(len(data['fold'].unique())):

        train = data[data['fold'] != fold].copy()
        valid = data[data['fold'] == fold].copy()

        tokenize = Tokenize(train, valid, tokenizer, token_max_len)
        tokenized_train, tokenized_valid, _ = tokenize()

        # Built per fold so that every search starts from the same base arguments.
        training_args = TrainingArguments(
            output_dir=output_dir,
            evaluation_strategy="epoch",
            save_strategy="no",
            learning_rate=2e-5,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=8,
            num_train_epochs=1,
            weight_decay=0.01,
            load_best_model_at_end=False,
            metric_for_best_model="qwk",
            push_to_hub=False,
            optim="adamw_torch",
        )

        trainer = Trainer(
            model=None,
            args=training_args,
            train_dataset=tokenized_train,
            eval_dataset=tokenized_valid,
            compute_metrics=compute_metrics,
            tokenizer=tokenizer,
            model_init=init_model,
            data_collator=data_collator,
        )

        best_run = trainer.hyperparameter_search(
            n_trials=optuna_n_trials,
            direction="maximize",
            backend="optuna",
            hp_space=optuna_hp_space,
            # Make the optimised quantity explicit: the validation QWK.
            compute_objective=lambda metrics: metrics["eval_qwk"],
        )
        best_params_list.append(best_run.hyperparameters)

        # Record the objective of the best trial. Re-evaluating `trainer` here
        # would score the model left over from the *last* trial, which is not
        # necessarily the one that produced `best_run.hyperparameters`.
        qwk_scores.append(best_run.objective)

        with open(os.path.join(output_dir, f"best_params_fold_{fold}.json"), "w") as f:
            json.dump(best_run.hyperparameters, f, indent=4)

        # Drop the references first, collect them, then release cached GPU memory.
        del trainer, tokenized_train, tokenized_valid
        gc.collect()
        torch.cuda.empty_cache()

    # Save the QWK scores
    with open(os.path.join(output_dir, "qwk_scores.json"), "w") as f:
        json.dump(qwk_scores, f, indent=4)

    return best_params_list
    

In [6]:
#hps = search_hp()
# wandb.finish()

In [7]:
# ! rm -rf ./output/runs/
# ! du -h --max-depth=1

In [8]:
#with open(f"/kaggle/working/output/best_params_fold_1.json", 'r') as file:
#    j = json.load(file)
    
    
#print(j)