LoRA

In [1]:
import optuna
import pandas as pd
import os
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, LlamaForCausalLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
)
from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
import wandb
from rouge import Rouge

In [2]:
# Hugging Face checkpoint id; reused for both the tokenizer and the base model.
model_name = "beomi/Llama-3-Open-Ko-8B"

# Single nested settings mapping read (by section) by the helper functions below.
config_data = dict(
    # Data directory, checkpoint id and Trainer output directory.
    general=dict(
        data_path="../data/",
        model_name=model_name,
        output_dir="./",
    ),
    # Tokenization limits and marker strings.
    # encoder_max_len / decoder_max_len are the max_length values used when
    # tokenizing dialogues / summaries; bos_token / eos_token are the literal
    # strings Preprocess.make_input glues onto each summary.
    # NOTE(review): "<s>" / "</s>" may not be the checkpoint tokenizer's own
    # special tokens - confirm they are tokenized as intended.
    tokenizer=dict(
        encoder_max_len=256,
        decoder_max_len=50,
        bos_token="<s>",
        eos_token="</s>",
        special_tokens=[
            "#Person1#",
            "#Person2#",
            "#Person3#",
            "#PhoneNumber#",
            "#Address#",
            "#PassportNumber#",
        ],
    ),
    # Values forwarded to Seq2SeqTrainingArguments / EarlyStoppingCallback.
    # "max_new_tokens" is not read by any code visible in this notebook.
    training=dict(
        overwrite_output_dir=True,
        num_train_epochs=20,
        learning_rate=1e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=32,
        warmup_ratio=0.1,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        optim="adamw_torch",
        gradient_accumulation_steps=1,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=5,
        fp16=False,
        load_best_model_at_end=True,
        seed=42,
        logging_dir="./logs",
        logging_strategy="epoch",
        predict_with_generate=True,
        generation_max_length=100,
        max_new_tokens=256,
        do_train=True,
        do_eval=True,
        early_stopping_patience=3,
        early_stopping_threshold=0.001,
        report_to="wandb",
    ),
    # Arguments passed to wandb.init.
    wandb=dict(
        entity="legendki",
        project="NLP-Summarization",
        name="Llama-LoRA-Optuna",
    ),
    # Inference-time settings. "ckt_path" is a placeholder that the final cell
    # overwrites with the best checkpoint path; "remove_tokens" are the marker
    # strings compute_metrics blanks out of decoded text.
    inference=dict(
        ckt_path="model ckt path",
        result_path="./prediction/",
        no_repeat_ngram_size=2,
        early_stopping=True,
        generate_max_length=100,
        num_beams=4,
        batch_size=32,
        remove_tokens=["<s>", "</s>", "<pad>"],
    ),
)

In [3]:
def compute_metrics(config, tokenizer, pred):
    """Score generated summaries against the references with ROUGE.

    Args:
        config: Nested settings dict; ``config['inference']['remove_tokens']``
            lists marker strings that are blanked out of the decoded text.
        tokenizer: Tokenizer used to decode the id arrays. Its ``pad_token_id``
            replaces the ``-100`` placeholders before decoding, and its own
            bos/eos/pad strings are blanked out as well.
        pred: Object exposing ``predictions`` and ``label_ids`` id arrays
            (each must support ``.copy()`` and boolean-mask assignment).

    Returns:
        dict with keys ``rouge1``, ``rouge2`` and ``rougeL`` holding the
        averaged F-scores reported by ``Rouge.get_scores``.
    """
    rouge = Rouge()
    pad_id = tokenizer.pad_token_id

    # Work on copies so the arrays owned by ``pred`` are not modified in place.
    predictions = pred.predictions.copy()
    labels = pred.label_ids.copy()

    # -100 is not a valid vocabulary id, so swap it for the pad id before decoding.
    predictions[predictions == -100] = pad_id
    labels[labels == -100] = pad_id

    decoded_preds = tokenizer.batch_decode(predictions, clean_up_tokenization_spaces=True)
    decoded_labels = tokenizer.batch_decode(labels, clean_up_tokenization_spaces=True)

    # Decoding keeps special tokens, so strip the markers by hand. Besides the
    # configured strings, also strip the tokenizer's own bos/eos/pad strings:
    # they need not match the configured list (the loader falls back to the eos
    # token as pad), and pad text would otherwise leak into the scored strings.
    markers = list(config['inference']['remove_tokens'])
    for attr in ('bos_token', 'eos_token', 'pad_token'):
        marker = getattr(tokenizer, attr, None)
        if isinstance(marker, str) and marker and marker not in markers:
            markers.append(marker)

    def _strip_markers(texts):
        # Replace (rather than delete) each marker with a space so that words
        # on either side of it are not fused together.
        cleaned = list(texts)
        for marker in markers:
            cleaned = [text.replace(marker, " ") for text in cleaned]
        return cleaned

    results = rouge.get_scores(
        _strip_markers(decoded_preds),
        _strip_markers(decoded_labels),
        avg=True,
    )

    return {
        'rouge1': results['rouge-1']['f'],
        'rouge2': results['rouge-2']['f'],
        'rougeL': results['rouge-l']['f'],
    }


In [4]:
def load_trainer_for_train(config, generate_model, tokenizer, train_inputs_dataset, val_inputs_dataset):
    """Build the Seq2SeqTrainer used for fine-tuning.

    Creates the training arguments from ``config['training']``, starts a wandb
    run from ``config['wandb']``, sets the wandb environment switches, and wires
    early stopping plus ROUGE metrics into the trainer.

    Args:
        config: Nested settings dict with ``general``, ``training`` and ``wandb`` sections.
        generate_model: Model handed to the trainer.
        tokenizer: Tokenizer forwarded to ``compute_metrics`` for decoding.
        train_inputs_dataset: Dataset used for training.
        val_inputs_dataset: Dataset used for evaluation.

    Returns:
        The configured ``Seq2SeqTrainer``.
    """
    print('-'*10, 'Make training arguments', '-'*10,)

    # Keys of config['training'] that are passed through unchanged, under the
    # same name, to Seq2SeqTrainingArguments.
    # NOTE(review): the default config pairs generation_max_length=100 with
    # prompts of up to encoder_max_len=256 tokens; for a decoder-only model the
    # max length presumably counts prompt tokens too - confirm evaluation
    # generates as intended.
    forwarded_keys = (
        'overwrite_output_dir',
        'num_train_epochs',
        'learning_rate',
        'per_device_train_batch_size',
        'per_device_eval_batch_size',
        'warmup_ratio',
        'weight_decay',
        'lr_scheduler_type',
        'optim',
        'gradient_accumulation_steps',
        'evaluation_strategy',
        'save_strategy',
        'save_total_limit',
        'fp16',
        'load_best_model_at_end',
        'seed',
        'logging_dir',
        'logging_strategy',
        'predict_with_generate',
        'generation_max_length',
        'do_train',
        'do_eval',
        'report_to',
    )
    training_args = Seq2SeqTrainingArguments(
        output_dir=config['general']['output_dir'],
        **{key: config['training'][key] for key in forwarded_keys},
    )

    wandb.init(**{key: config['wandb'][key] for key in ('entity', 'project', 'name')})

    # Environment switches for the wandb integration.
    os.environ["WANDB_LOG_MODEL"] = "true"
    os.environ["WANDB_WATCH"] = "false"

    early_stopper = EarlyStoppingCallback(
        early_stopping_patience=config['training']['early_stopping_patience'],
        early_stopping_threshold=config['training']['early_stopping_threshold'],
    )
    print('-'*10, 'Make training arguments complete', '-'*10,)
    print('-'*10, 'Make trainer', '-'*10,)

    def _metrics(pred):
        # The trainer calls this with a single argument; bind config and tokenizer here.
        return compute_metrics(config, tokenizer, pred)

    trainer = Seq2SeqTrainer(
        model=generate_model,
        args=training_args,
        train_dataset=train_inputs_dataset,
        eval_dataset=val_inputs_dataset,
        compute_metrics=_metrics,
        callbacks=[early_stopper],
    )
    print('-'*10, 'Make trainer complete', '-'*10,)

    return trainer

In [5]:
def load_tokenizer_and_model_for_train(config, device):
    """Load the tokenizer and the 4-bit quantized, LoRA-wrapped causal LM.

    Args:
        config: Nested settings dict; reads ``config['general']['model_name']``
            and ``config['tokenizer']['special_tokens']``.
        device: Kept for call-site compatibility. The model is placed by
            ``device_map="auto"`` at load time and is not moved afterwards.

    Returns:
        Tuple ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model.
    """
    print('-'*10, 'Load tokenizer & model', '-'*10,)

    tokenizer = AutoTokenizer.from_pretrained(config['general']['model_name'])
    # Pad on the left so that, for a decoder-only model, each prompt ends where
    # generation starts.
    tokenizer.padding_side = 'left'
    tokenizer.add_special_tokens({'additional_special_tokens': config['tokenizer']['special_tokens']})

    # Fall back to the eos token when the checkpoint ships without a pad token.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=False,
    )

    model = LlamaForCausalLM.from_pretrained(
        config['general']['model_name'],
        quantization_config=quant_config,
        device_map="auto",
    )
    model.config.use_cache = False
    # The tokenizer gained extra special tokens above; grow the embedding
    # matrix so their ids are valid.
    model.resize_token_embeddings(len(tokenizer))
    # Keep the model config in sync with the tokenizer so generation and any
    # trainer-side padding use the same pad id instead of an unset value.
    model.config.pad_token_id = tokenizer.pad_token_id

    # NOTE(review): the adapters target only q_proj / v_proj, so the embedding
    # rows added for the new special tokens presumably stay at their initial
    # values - confirm whether embed_tokens / lm_head should be trainable.
    peft_config = LoraConfig(
        r=4,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, peft_config)
    # Deliberately no model.to(device): device_map="auto" has already placed
    # the quantized weights, and moving a bitsandbytes-quantized / dispatched
    # model afterwards is unsupported or undoes that placement.
    print('-'*10, 'Load tokenizer & model complete', '-'*10,)
    return model, tokenizer


In [6]:
def load_data(config):
    """Read ``train.csv`` and ``dev.csv`` from ``config['general']['data_path']``.

    Returns:
        Tuple ``(train_df, val_df)`` of pandas DataFrames.
    """
    root = config['general']['data_path']
    frames = [pd.read_csv(os.path.join(root, filename)) for filename in ('train.csv', 'dev.csv')]
    return frames[0], frames[1]

In [7]:
class Preprocess:
    """Turns the dialogue/summary CSV rows into plain-text model inputs."""

    def __init__(self, bos_token, eos_token):
        # Literal strings glued to the start / end of each summary.
        self.bos_token, self.eos_token = bos_token, eos_token

    @staticmethod
    def make_set_as_df(file_path, is_train=True):
        """Read a CSV and keep only the columns needed downstream.

        With ``is_train`` the ``summary`` column is kept next to ``fname`` and
        ``dialogue``; otherwise only ``fname`` and ``dialogue`` are returned.
        """
        wanted = ['fname', 'dialogue', 'summary'] if is_train else ['fname', 'dialogue']
        return pd.read_csv(file_path)[wanted]

    def make_input(self, dataset, is_test=False):
        """Build the text lists fed to the tokenizer.

        Returns:
            ``(dialogues, decoder_inputs)`` when ``is_test`` is true, where every
            decoder input is just the bos string; otherwise
            ``(dialogues, bos + summary, summary + eos)``.
        """
        dialogues = dataset['dialogue']
        if is_test:
            bos_only = [self.bos_token for _ in range(len(dialogues))]
            return dialogues.tolist(), bos_only

        summaries = dataset['summary']
        with_bos = [self.bos_token + str(text) for text in summaries]
        with_eos = [str(text) + self.eos_token for text in summaries]
        return dialogues.tolist(), with_bos, with_eos

In [8]:
def prepare_train_dataset(config, preprocessor, data_path, tokenizer):
    """Read train/dev CSVs, print one sample of each, tokenize, and wrap as datasets.

    NOTE: a later cell of this notebook defines ``prepare_train_dataset`` again
    (without the sample printing); once that cell has run, it replaces this one.

    Args:
        config: Nested settings dict; reads ``encoder_max_len`` and
            ``decoder_max_len`` from ``config['tokenizer']``.
        preprocessor: Object providing ``make_set_as_df`` and ``make_input``.
        data_path: Directory containing ``train.csv`` and ``dev.csv``.
        tokenizer: Callable tokenizer applied to the text lists.

    Returns:
        Tuple ``(train_inputs_dataset, val_inputs_dataset)``.
    """
    train_file_path = os.path.join(data_path, 'train.csv')
    val_file_path = os.path.join(data_path, 'dev.csv')
    train_data = preprocessor.make_set_as_df(train_file_path)
    val_data = preprocessor.make_set_as_df(val_file_path)

    # Show the first example of each split as a quick sanity check.
    print('-'*150)
    print(f'train_data:\n {train_data["dialogue"][0]}')
    print(f'train_label:\n {train_data["summary"][0]}')
    print('-'*150)
    print(f'val_data:\n {val_data["dialogue"][0]}')
    print(f'val_label:\n {val_data["summary"][0]}')

    train_texts = preprocessor.make_input(train_data)
    val_texts = preprocessor.make_input(val_data)
    print('-'*10, 'Load data complete', '-'*10, )

    def _encode(texts, length_key):
        # One tokenizer call shape for every text list; only max_length varies.
        return tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            add_special_tokens=True,
            truncation=True,
            max_length=config['tokenizer'][length_key],
            return_token_type_ids=False,
        )

    # Each texts tuple is (dialogues, bos + summary, summary + eos).
    length_keys = ('encoder_max_len', 'decoder_max_len', 'decoder_max_len')

    train_encoded = [_encode(texts, key) for texts, key in zip(train_texts, length_keys)]
    train_inputs_dataset = DatasetForTrain(*train_encoded, len(train_texts[0]))

    val_encoded = [_encode(texts, key) for texts, key in zip(val_texts, length_keys)]
    val_inputs_dataset = DatasetForVal(*val_encoded, len(val_texts[0]))

    print('-'*10, 'Make dataset complete', '-'*10, )
    return train_inputs_dataset, val_inputs_dataset


In [9]:
class DatasetForTrain(Dataset):
    """Training examples assembled from pre-tokenized dialogue/summary batches.

    Args:
        encoder_input: Mapping of tensors for the dialogues (must contain ``input_ids``).
        decoder_input: Mapping of tensors for the bos-prefixed summaries
            (must contain ``input_ids`` and ``attention_mask``).
        labels: Mapping of tensors for the eos-suffixed summaries (must contain
            ``input_ids``; ``attention_mask`` is used when present).
        length: Number of examples reported by ``len()``.
        pad_token_id: Optional id used to extend ``input_ids`` in the rare case
            that a label row is longer than the input row.
    """

    def __init__(self, encoder_input, decoder_input, labels, length, pad_token_id=None):
        self.encoder_input = encoder_input
        self.decoder_input = decoder_input
        self.labels = labels
        self.length = length
        self.pad_token_id = pad_token_id

    def __getitem__(self, idx):
        """Return one example as a dict of 1-D tensors.

        The dict holds the encoder tensors under their own keys, the decoder
        tensors as ``decoder_input_ids`` / ``decoder_attention_mask``, and
        ``labels`` with the same length as ``input_ids``.

        Raises:
            ValueError: if the label row is longer than the input row and no
                ``pad_token_id`` was supplied.
        """
        item = {key: val[idx].clone().detach() for key, val in self.encoder_input.items()}
        decoder_item = {key: val[idx].clone().detach() for key, val in self.decoder_input.items()}
        decoder_item['decoder_input_ids'] = decoder_item.pop('input_ids')
        decoder_item['decoder_attention_mask'] = decoder_item.pop('attention_mask')
        item.update(decoder_item)

        # Clone so the masking below never writes into the stored batch.
        label = self.labels['input_ids'][idx].clone().detach()
        if 'attention_mask' in self.labels:
            # Positions the tokenizer filled with padding are not real targets;
            # -100 makes the loss skip them.
            label[self.labels['attention_mask'][idx] == 0] = -100

        gap = item['input_ids'].shape[-1] - label.shape[-1]
        if gap > 0:
            # Extend the labels with ignored positions up to the input length.
            label = torch.cat([label, label.new_full((gap,), -100)], dim=0)
        elif gap < 0:
            # Labels are longer than the input: extend the input instead of
            # failing on a negative-size tensor.
            if self.pad_token_id is None:
                raise ValueError(
                    "labels are longer than input_ids; pass pad_token_id to DatasetForTrain "
                    "so the inputs can be padded"
                )
            input_ids = item['input_ids']
            item['input_ids'] = torch.cat([input_ids, input_ids.new_full((-gap,), self.pad_token_id)], dim=0)
            if 'attention_mask' in item:
                mask = item['attention_mask']
                item['attention_mask'] = torch.cat([mask, mask.new_zeros((-gap,))], dim=0)

        item['labels'] = label
        return item

    def __len__(self):
        return self.length


In [10]:
class DatasetForVal(Dataset):
    """Validation examples assembled from pre-tokenized dialogue/summary batches.

    Args:
        encoder_input: Mapping of tensors for the dialogues (must contain ``input_ids``).
        decoder_input: Mapping of tensors for the bos-prefixed summaries
            (must contain ``input_ids`` and ``attention_mask``).
        labels: Mapping of tensors for the eos-suffixed summaries (must contain
            ``input_ids``; ``attention_mask`` is used when present).
        length: Number of examples reported by ``len()``.
        pad_token_id: Optional id used to extend ``input_ids`` when a label row
            is longer than the input row. When omitted, a module-level
            ``tokenizer`` is looked up as a fallback, which is what earlier
            versions of this class always did.
    """

    def __init__(self, encoder_input, decoder_input, labels, length, pad_token_id=None):
        self.encoder_input = encoder_input
        self.decoder_input = decoder_input
        self.labels = labels
        self.length = length
        self.pad_token_id = pad_token_id

    def _resolve_pad_id(self):
        """Return the pad id, preferring the explicit argument over the legacy global."""
        if self.pad_token_id is not None:
            return self.pad_token_id
        legacy_tokenizer = globals().get('tokenizer')
        if legacy_tokenizer is None:
            raise ValueError(
                "labels are longer than input_ids; pass pad_token_id to DatasetForVal "
                "so the inputs can be padded"
            )
        return legacy_tokenizer.pad_token_id

    def __getitem__(self, idx):
        """Return one example as a dict of 1-D tensors.

        The dict holds the encoder tensors under their own keys, the decoder
        tensors as ``decoder_input_ids`` / ``decoder_attention_mask``, and
        ``labels`` with the same length as ``input_ids``.

        Raises:
            ValueError: if inputs must be padded and no pad id can be determined.
        """
        item = {key: val[idx].clone().detach() for key, val in self.encoder_input.items()}
        decoder_item = {key: val[idx].clone().detach() for key, val in self.decoder_input.items()}
        decoder_item['decoder_input_ids'] = decoder_item.pop('input_ids')
        decoder_item['decoder_attention_mask'] = decoder_item.pop('attention_mask')
        item.update(decoder_item)

        # Clone so the masking below never writes into the stored batch.
        label = self.labels['input_ids'][idx].clone().detach()
        if 'attention_mask' in self.labels:
            # Padding positions are not real targets; -100 makes the loss skip them.
            label[self.labels['attention_mask'][idx] == 0] = -100

        gap = item['input_ids'].shape[-1] - label.shape[-1]
        if gap > 0:
            # Extend the labels with ignored positions up to the input length.
            label = torch.cat([label, label.new_full((gap,), -100)], dim=0)
        elif gap < 0:
            # Labels are longer: extend the input with pad ids, and keep the
            # attention mask the same length so the added positions are ignored.
            # NOTE(review): padding is appended on the right as before, while the
            # loader configures a left-padding tokenizer - confirm which side is wanted.
            pad_id = self._resolve_pad_id()
            input_ids = item['input_ids']
            item['input_ids'] = torch.cat([input_ids, input_ids.new_full((-gap,), pad_id)], dim=0)
            if 'attention_mask' in item:
                mask = item['attention_mask']
                item['attention_mask'] = torch.cat([mask, mask.new_zeros((-gap,))], dim=0)

        item['labels'] = label

        return item

    def __len__(self):
        return self.length


In [11]:
def prepare_train_dataset(config, preprocessor, data_path, tokenizer):
    """Read train/dev CSVs, tokenize them, and wrap the tensors as datasets.

    Args:
        config: Nested settings dict; reads ``encoder_max_len`` and
            ``decoder_max_len`` from ``config['tokenizer']``.
        preprocessor: Object providing ``make_set_as_df`` and ``make_input``.
        data_path: Directory containing ``train.csv`` and ``dev.csv``.
        tokenizer: Callable tokenizer applied to the text lists.

    Returns:
        Tuple ``(train_inputs_dataset, val_inputs_dataset)``.
    """
    train_file_path = os.path.join(data_path, 'train.csv')
    val_file_path = os.path.join(data_path, 'dev.csv')
    train_data = preprocessor.make_set_as_df(train_file_path)
    val_data = preprocessor.make_set_as_df(val_file_path)

    # Each result is (dialogues, bos + summary, summary + eos).
    train_texts = preprocessor.make_input(train_data)
    val_texts = preprocessor.make_input(val_data)

    def _tokenize(texts, length_key):
        # Shared tokenizer settings; only the max_length config key differs.
        return tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            add_special_tokens=True,
            truncation=True,
            max_length=config['tokenizer'][length_key],
            return_token_type_ids=False,
        )

    # Dialogues use the encoder limit; both summary variants use the decoder limit.
    limits = ('encoder_max_len', 'decoder_max_len', 'decoder_max_len')

    train_tensors = [_tokenize(texts, key) for texts, key in zip(train_texts, limits)]
    train_inputs_dataset = DatasetForTrain(*train_tensors, len(train_texts[0]))

    val_tensors = [_tokenize(texts, key) for texts, key in zip(val_texts, limits)]
    val_inputs_dataset = DatasetForVal(*val_tensors, len(val_texts[0]))

    print('-'*10, 'Make dataset complete', '-'*10,)
    return train_inputs_dataset, val_inputs_dataset

In [12]:
def objective(trial):
    """Train once with a fixed set of hyperparameters and return validation ROUGE-L.

    The overrides are written into the module-level ``config_data``, so they
    remain in effect for any code that runs after this function.

    Args:
        trial: Trial object supplied by the caller.
            NOTE(review): it is not used - every hyperparameter below is
            hard-coded rather than suggested by the trial; confirm this is intended.

    Returns:
        The ROUGE-L F-score from the final evaluation, or ``0.0`` if the metric is absent.
    """
    config_data['training'].update({
        'learning_rate': 1e-5,
        'per_device_train_batch_size': 4,
        'num_train_epochs': 26,
        'warmup_ratio': 0.09577831393575928,
        'optim': 'adamw_hf',
        'gradient_accumulation_steps': 3,
        'lr_scheduler_type': 'cosine',
        'fp16': True,
    })

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f"Running on: {device}")

    generate_model, tokenizer = load_tokenizer_and_model_for_train(config_data, device)
    print("Model and Tokenizer Loaded.")

    preprocessor = Preprocess(config_data['tokenizer']['bos_token'], config_data['tokenizer']['eos_token'])
    train_inputs_dataset, val_inputs_dataset = prepare_train_dataset(config_data, preprocessor, config_data['general']['data_path'], tokenizer)

    trainer = load_trainer_for_train(config_data, generate_model, tokenizer, train_inputs_dataset, val_inputs_dataset)

    trainer.train()

    eval_metrics = trainer.evaluate(eval_dataset=val_inputs_dataset)
    # The trainer reports metrics under an "eval_" prefix, so the value that
    # compute_metrics returns as 'rougeL' arrives as 'eval_rougeL'. Looking up
    # the bare name alone would always yield the 0.0 default.
    rougeL = eval_metrics.get('eval_rougeL', eval_metrics.get('rougeL', 0.0))

    return rougeL

In [None]:
# Training driver: load model and data, train once, and remember the best checkpoint.

# Prefer the first CUDA device when one is available, otherwise use the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"Running on: {device}")

# NOTE: DatasetForVal may look up this module-level `tokenizer` when it has to
# pad inputs, so the name must stay bound under exactly this name.
generate_model, tokenizer = load_tokenizer_and_model_for_train(config_data, device)
print("Model and Tokenizer Loaded.")

# Build the text preprocessor from the configured bos/eos strings, then the datasets.
preprocessor = Preprocess(config_data['tokenizer']['bos_token'], config_data['tokenizer']['eos_token'])
train_inputs_dataset, val_inputs_dataset = prepare_train_dataset(config_data, preprocessor, config_data['general']['data_path'], tokenizer)

trainer = load_trainer_for_train(config_data, generate_model, tokenizer, train_inputs_dataset, val_inputs_dataset)

trainer.train()

# Replace the placeholder inference checkpoint path with the best checkpoint
# recorded in the trainer state.
best_model_path = trainer.state.best_model_checkpoint
config_data['inference']['ckt_path'] = best_model_path