<a id="1"></a>
# <div style="border: 2px solid #555; color:black; border-radius: 10px; background-color: #0074D9; padding: 10px; font-size: 20px; text-align: center;">Introduction</div>

**Table of Contents:**
* [Introduction](#1)
* [Refactor and Define utils](#2)
* [Refactor Train](#3)
* [Sweeps](#4)


<a id="2"></a>
# <div style="border: 2px solid #555; color:black; border-radius: 10px; background-color: #0074D9; padding: 10px; font-size: 20px; text-align: center;">Refactor and Define utils</div>
* [Return to top](#1)

In [None]:
%%writefile utils.py
import json
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from datasets import Dataset
import io
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import torch.nn.functional as F
import wandb

def _load_table_json(path):
    """Read a table JSON file with top-level "columns" and "data" keys into a DataFrame."""
    # The context manager closes the file; an explicit close() is unnecessary.
    with open(path, encoding="utf-8") as json_file:
        table = json.load(json_file)
    return pd.DataFrame(data=table["data"], columns=table["columns"])


def read_data(data_dir="/kaggle/working/artifacts/detect_llm_raw_data:v1"):
    """
    Load the train and test tables from ``data_dir``.

    Args:
        data_dir: directory holding ``train_df.table.json`` and
            ``test_df.table.json``. Defaults to the artifact location this
            notebook used previously, so existing callers are unaffected.

    Raises:
        FileNotFoundError: if either JSON file is missing.
        KeyError: if a file lacks the "data" or "columns" key.

    return train , test
    """
    train = _load_table_json(os.path.join(data_dir, "train_df.table.json"))
    test = _load_table_json(os.path.join(data_dir, "test_df.table.json"))
    return train, test

def preprocess(train=None,test=None):
    """
    Turn the raw train/test DataFrames into HuggingFace ``Dataset`` objects.

    Both frames are modified in place: missing values become a single space
    and a "text" column is added as "Question" + " " + "Response".
    The train dataset keeps the "target" and "text" columns, the test
    dataset keeps only "text".

    return dataset_train, dataset_test
    """
    frames = (train, test)

    # Fill gaps first so the string concatenation below never meets NaN.
    for frame in frames:
        frame.fillna(" ", inplace=True)
    for frame in frames:
        frame["text"] = frame["Question"] + " " + frame["Response"]

    train_columns = ["target", "text"]
    test_columns = ["text"]
    hf_train = Dataset.from_pandas(train[train_columns])
    hf_test = Dataset.from_pandas(test[test_columns])
    return hf_train, hf_test


def dataset_tokenize_n_split(train, dataset_train, dataset_test,model_name, n_splits=10, random_state=42):
    """
    Tokenize both datasets and hold out one stratified fold of train for validation.

    Args:
        train: DataFrame with a ``target`` column; used for stratification and
            to return the validation rows. Presumably row-aligned with
            ``dataset_train`` (the positional fold indices are applied to both).
        dataset_train: dataset with "text" and "target" columns.
        dataset_test: dataset with a "text" column.
        model_name: checkpoint name handed to ``AutoTokenizer.from_pretrained``.
        n_splits: number of stratified folds; only the first fold is used, so
            the validation share is roughly ``1 / n_splits``.
        random_state: seed for the shuffled fold assignment.

    return split_train_dataset, split_eval_dataset, tokenized_test, tokenizer, df_val
        (df_val is ``train.iloc[val_idx]``, the validation rows of the DataFrame)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def tokenize_function(examples):
        # Fixed-length padding keeps every row the same size, so downstream
        # code can stack the ids into a single tensor.
        return tokenizer(examples["text"], padding="max_length", truncation=True)

    tokenized_train = (
        dataset_train.map(tokenize_function, batched=True)
        .remove_columns(['text'])
        .rename_column("target", "labels")
    )
    tokenized_test = dataset_test.map(tokenize_function, batched=True).remove_columns(['text'])

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Only the first fold is needed; take it directly instead of a loop + break.
    fold = 0
    tr_idx, val_idx = next(kf.split(train, train.target))
    print(f"Fold : {fold}")
    print(f"shape train : {tr_idx.shape}")
    print(f"shape val : {val_idx.shape}")

    split_train_dataset = tokenized_train.select(tr_idx)
    split_eval_dataset = tokenized_train.select(val_idx)

    return split_train_dataset, split_eval_dataset, tokenized_test, tokenizer, train.iloc[val_idx]

def predict_fn(model,dataset_ = None, batch_size = 8):
    """
    Run batched inference and return per-sample class probabilities.

    Args:
        model: torch module whose forward accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object with a
            ``.logits`` tensor of shape (batch, num_classes).
        dataset_: mapping-style dataset exposing 'input_ids' and
            'attention_mask' as equally long rows of ints.
        batch_size: number of rows per forward pass.

    Returns:
        np.ndarray of shape (num_samples, num_classes) holding softmax
        probabilities; an empty (0, 0) array when the dataset has no rows.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Keep the full tensors on the CPU; only the current batch is moved to the
    # model's device, so large datasets do not have to fit in GPU memory.
    input_ids = torch.tensor(dataset_['input_ids'])
    attention_mask = torch.tensor(dataset_['attention_mask'])

    num_samples = len(input_ids)
    if num_samples == 0:
        return np.empty((0, 0))

    # Follow the model instead of assuming 'cuda:0', so CPU-only runs work too.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device("cpu")

    # Dropout must be disabled for inference; restore the previous mode afterwards.
    was_training = model.training
    model.eval()

    all_probabilities = []
    try:
        with torch.no_grad():
            for start_idx in range(0, num_samples, batch_size):
                end_idx = start_idx + batch_size
                outputs = model(input_ids=input_ids[start_idx:end_idx].to(device),
                                attention_mask=attention_mask[start_idx:end_idx].to(device))

                # Apply softmax over the class axis to get probabilities
                probabilities = F.softmax(outputs.logits, dim=1)
                all_probabilities.extend(probabilities.tolist())
    finally:
        model.train(was_training)

    # One row per sample; the class count comes from the logits, not a constant.
    return np.asarray(all_probabilities, dtype=np.float64)


def conf_mat(df_val = None,preds_val = None):
    """
    Plot the validation confusion matrix, save it as a PNG and log it to W&B.

    Args:
        df_val: DataFrame whose ``target`` column holds the true labels.
        preds_val: array of per-class scores, one row per sample; the
            predicted label is the argmax over axis 1.

    no return
    """
    # Draw on an explicit axes: without ``ax`` the display opens its own
    # figure, leaving a blank 8x8 figure behind and ignoring the requested size.
    fig, ax = plt.subplots(figsize=(8,8))
    ConfusionMatrixDisplay.from_predictions(df_val.target,np.argmax(preds_val,axis=1), ax=ax)
    fig.savefig("val_conf_matrix.png", format="png")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
    conf = wandb.Image(data_or_path="val_conf_matrix.png")
    wandb.log({"val_conf_matrix": conf})
def create_model(model_name = "distilroberta-base",num_labels = 7):
    """
    Load a pretrained sequence-classification model and place it on a device.

    The model goes to "cuda:0" when CUDA is available, otherwise to the CPU.

    return model
    """
    # Pick the target device up front.
    if torch.cuda.is_available():
        target = torch.device("cuda:0")
    else:
        target = torch.device("cpu")

    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    classifier.to(target)
    return classifier

<a id="3"></a>
# <div style="border: 2px solid #555; color:black; border-radius: 10px; background-color: #0074D9; padding: 10px; font-size: 20px; text-align: center;">Refactor Train</div>
* [Return to top](#1)

In [None]:
%%writefile sweeps_result.py
# Fixed hyperparameters for a single retraining run. The module object is
# passed to train() as `config` and read attribute-style.
epochs = 16  # number of training epochs (config.epochs)
seed = 42  # random seed
batch_size = 8  # per-device train and eval batch size (config.batch_size)
learning_rate = 0.0001405  # optimizer learning rate (config.learning_rate)
weight_decay = 0.2  # weight decay (config.weight_decay)
learning_sch = 'linear'  # learning-rate scheduler type (config.learning_sch)
architecture = "distilroberta-base"  # checkpoint used for tokenizer and model (config.architecture)

In [None]:
import sweeps_result

In [None]:
%%writefile requirements.txt


In [None]:
# %%writefile train.py
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from datasets import Dataset
import json
from IPython.display import display
import wandb
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from transformers import AutoModelForSequenceClassification,TrainerCallback
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import torch.nn.functional as F
from utils import *
import io
import matplotlib.pyplot as plt

class WandbMetricsLogger(TrainerCallback):
    """Trainer callback that forwards evaluation metrics to Weights & Biases."""

    def on_evaluate(self, args, state, control, model=None, metrics=None, **kwargs):
        """
        Log the metrics of an evaluation pass with ``wandb.log``.

        ``model`` and ``metrics`` are optional and any further keyword
        arguments are accepted and ignored: the Trainer hands callback events
        more keyword arguments than this hook uses, and a signature without
        ``**kwargs`` would raise ``TypeError`` on them.
        """
        if metrics is not None:
            # Log metrics to Wandb
            wandb.log(metrics)
        
# Hyperparameter search space: a single fixed "value", a discrete list of
# "values", or a distribution with "min"/"max" bounds per entry.
parameters_dict = {
    "epochs": {"value": 2},
    "seed": {"value": 42},
    "batch_size": {"values": [4, 8, 16]},
    "learning_rate": {
        "distribution": "log_uniform_values",
        "min": 1e-4,
        "max": 2e-3,
    },
    "weight_decay": {"values": [0.0, 0.2]},
    "learning_sch": {"values": ["linear", "polynomial", "cosine"]},
    "architecture": {
        "values": [
            "distilroberta-base",
            "bert-base-uncased",
            "distilbert-base-uncased",
        ],
    },
}

# Sweep definition: random search that minimizes eval_loss over the space above.
# "parameters" refers to the very same dict object as parameters_dict.
default_config = {
    "method": "random",
    "metric": {"goal": "minimize", "name": "eval_loss"},
    "parameters": parameters_dict,
}

def compute_metrics_fn(eval_preds):
    """
    Return the evaluation loss as a one-entry metrics dict.

    Reads ``eval_preds.loss`` and reports it under "validation_loss".

    NOTE(review): this requires the argument to expose a ``loss`` attribute;
    confirm that the object the Trainer passes to ``compute_metrics`` has one
    before wiring this function in.
    """
    return {'validation_loss': eval_preds.loss}

def parse_args(argv=None):
    """
    Parse command-line hyperparameter overrides and merge them into ``default_config``.

    Defaults come from the ``default_config["parameters"]`` search space:
    the last listed option for ``batch_size`` and ``weight_decay``, the fixed
    value for ``epochs`` and ``seed``, and the lower bound for the learning rate.

    Args:
        argv: list of argument strings to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``; pass an explicit list (e.g. ``[]``)
            when running inside a notebook, where ``sys.argv`` carries
            kernel-specific arguments.

    Returns:
        The parsed ``argparse.Namespace`` with ``batch_size``, ``epochs``,
        ``lr``, ``seed`` and ``weight_decay``.
    """
    # Local import: argparse is not among the imports visible at the top of this cell.
    import argparse

    params = default_config["parameters"]

    argparser = argparse.ArgumentParser(description='Process hyper-parameters')
    argparser.add_argument('--batch_size', type=int, default=params["batch_size"]["values"][-1],
                           help='batch size')
    argparser.add_argument('--epochs', type=int, default=params["epochs"]["value"],
                           help='number of training epochs')
    argparser.add_argument('--lr', type=float, default=params["learning_rate"]["min"],
                           help='learning rate')
    argparser.add_argument('--seed', type=int, default=params["seed"]["value"],
                           help='random seed')
    argparser.add_argument('--weight_decay', type=float, default=params["weight_decay"]["values"][-1],
                           help='weight decay')

    args = argparser.parse_args(argv)

    # default_config is a plain dict, so vars(default_config) raises TypeError;
    # update the dict directly instead.
    # NOTE(review): the parsed values land as flat top-level keys next to
    # 'method' / 'metric' / 'parameters' — confirm that is the intended target.
    default_config.update(vars(args))
    return args



def train(config=None):
    """
    Fine-tune a sequence classifier for one W&B run and log a validation confusion matrix.

    Steps: start a W&B run, fetch the raw-data artifact when no local
    "artifacts" directory exists, build and tokenize the datasets, train with
    early stopping on eval_loss, then predict on the held-out fold and log the
    confusion matrix.

    Args:
        config: object exposing ``architecture``, ``epochs``, ``learning_rate``,
            ``learning_sch``, ``weight_decay`` and ``batch_size`` as attributes
            (optionally ``seed``). When ``None``, ``wandb.config`` of the run
            started here is used.

    no return
    """
    run = wandb.init(
        project="h2o-ai-predict-the-llm-kaggle-competition",
        entity=None,
        job_type="train",
        name="04-Retrain",
        tags=["RETRAIN"],
    )

    # Download the raw data only when it is not already present locally.
    if "artifacts" not in os.listdir():
        raw_data_at = run.use_artifact('mustafakeser/h2o-ai-predict-the-llm-kaggle-competition/detect_llm_raw_data:v1',
                                       type='raw_data')
        raw_data_at.download()

    if config is None:
        config = wandb.config

    # Seed from the active config so the configured seed is the one applied;
    # fall back to the sweep default when the config carries no seed.
    try:
        seed = config.seed
    except (AttributeError, KeyError):
        seed = default_config.get("parameters").get("seed").get("value")
    torch.manual_seed(seed)

    # Local names avoid shadowing this function's own name.
    train_df, test_df = read_data()
    dataset_train, dataset_test = preprocess(train=train_df, test=test_df)
    split_train_dataset, split_eval_dataset, tokenized_test, tokenizer, df_val = dataset_tokenize_n_split(
        train_df, dataset_train, dataset_test, config.architecture
    )

    model = create_model(model_name=config.architecture, num_labels=7)

    training_args = TrainingArguments(
        output_dir='distilroberta-retrain',
        report_to='wandb',  # Turn on Weights & Biases logging
        num_train_epochs=config.epochs,
        learning_rate=config.learning_rate,
        lr_scheduler_type=config.learning_sch,
        metric_for_best_model="eval_loss",
        load_best_model_at_end=True,
        remove_unused_columns=True,
        greater_is_better=False,
        weight_decay=config.weight_decay,
        evaluation_strategy="steps",
        logging_steps=100,
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size,
        seed=seed,
    )
    # Stop once eval_loss has failed to improve for two consecutive evaluations.
    early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split_train_dataset,
        eval_dataset=split_eval_dataset,
        callbacks=[early_stopping],
        tokenizer=tokenizer,
    )
    trainer.train()

    val_pred = predict_fn(model, split_eval_dataset)
    conf_mat(df_val, val_pred)
    
# if __name__=="__main__":
# #     wandb.agent(sweep_id, train, count=20)
#     parse_args()
#     train(default_config)

<a id="4"></a>
# <div style="border: 2px solid #555; color:black; border-radius: 10px; background-color: #0074D9; padding: 10px; font-size: 20px; text-align: center;">Sweeps</div>
* [Return to top](#1)

In [None]:
# Authenticate with Weights & Biases before training.
# NOTE(review): relogin=True presumably forces a fresh credential prompt — confirm against the wandb docs.
wandb.login(relogin=True)

In [None]:
# Run one training job with the constants from sweeps_result.py; the module
# object itself serves as the config and is read attribute-style inside train().
train(sweeps_result)

In [None]:
# Finish the W&B run (presumably the one started by wandb.init inside train()).
wandb.finish()