### 모델 및 라이브러리 불러오기

In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer
import torch
import os


# Load the pretrained DialoGPT-medium tokenizer and LM-head model by their hub identifier.
# NOTE(review): AutoModelWithLMHead is reported as deprecated in newer transformers
# releases (AutoModelForCausalLM is the suggested replacement) — confirm against the installed version.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium")

In [None]:
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple
import json

import pandas as pd
import numpy as np
import torch
import random

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Configs
# Module-level logger shared by the training and evaluation helpers in this file.
logger = logging.getLogger(__name__)

# Config classes that have an LM-head model registered, and the model_type string of each.
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

### 데이터 불러오기

#### 대화 데이터

In [None]:
# Load the dialogue dataset. Later cells read its 'Name', 'Question' and 'Answer' columns.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
harry = pd.read_csv('/home/ldy/kdms2023/harry_persona/dataset/harrypotter_series.csv')

In [None]:
# Keep only the rows whose speaker is HARRY and renumber them from 0 so that
# later positional loops can index the frame by 0..n-1.
is_harry_row = harry['Name'] == 'HARRY'
text_harry = harry[is_harry_row].reset_index(drop=True)
text_harry

#### 페르소나 데이터

In [None]:
# Load the persona sheet. Later cells read row 0 of columns 1..16 as candidate persona sentences.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
persona = pd.read_excel('/home/ldy/kdms2023/harry_persona/dataset/harrypotter_persona.xlsx', engine='openpyxl')
persona

In [None]:
# Column labels of the persona sheet, in their original order.
harry_persona_column = list(persona.columns)

harry_persona_column

### 데이터셋 형변환

In [None]:
# Build [response, context] records: Harry's answer first, then the question it replies to.
train_contexted = []

for answer, question in tqdm(zip(text_harry['Answer'], text_harry['Question']), total=len(text_harry)):
    train_contexted.append([answer, question])

In [None]:
# Column layout of each record: the reply first, then its context.
# range(0) is empty, so no extra 'context/<i>' history columns are added.
extra_context_columns = ['context/'+str(i) for i in range(0)]
columns = ['response', 'context'] + extra_context_columns
columns

In [None]:
# Turn the [response, context] records into a DataFrame and preview the first rows.
data_df = pd.DataFrame.from_records(train_contexted, columns=columns)
data_df.head(5)

context와 가장 유사한 페르소나를 코사인 유사도를 사용하여 구함

In [None]:
# Create the 'persona' column with a placeholder value of 0; a later cell fills it
# with the best-matching persona sentence for each row.
data_df['persona'] = 0

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def get_cosine_similarity(sentence1, sentence2):
    """Return the cosine similarity of two sentences' bag-of-words count vectors.

    A fresh CountVectorizer is fitted on just the two sentences, so the
    vocabulary is the union of their tokens.
    """
    # Vectorize both sentences together so they share one vocabulary.
    counts = CountVectorizer().fit_transform([sentence1, sentence2]).toarray()
    first = counts[0].reshape(1, -1)
    second = counts[1].reshape(1, -1)

    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    similarity = cosine_similarity(first, second)
    return similarity[0][0]

In [None]:
# For every context, pick the persona sentence with the highest cosine similarity.
# Candidates are row 0 of persona columns 1..16; they do not change between rows,
# so they are looked up once instead of inside the inner loop.
persona_candidates = [persona[harry_persona_column[j]][0] for j in range(1, 17)]

best_personas = []
for context in tqdm(data_df['context']):
    max_score = 0
    # Default to the first candidate when no candidate scores above 0.
    best_persona = persona_candidates[0]
    for candidate in persona_candidates:
        sub_score = get_cosine_similarity(context, candidate)
        # Strict '>' keeps the earliest candidate on ties.
        if sub_score > max_score:
            max_score = sub_score
            best_persona = candidate
    best_personas.append(best_persona)

# Assign the whole column at once. The previous element-wise chained assignment
# (data_df['persona'][i] = ...) raises SettingWithCopyWarning and does not update
# the frame when pandas Copy-on-Write is enabled.
data_df['persona'] = best_personas

In [None]:
data_df

## Persona concat (페르소나와 대화 텍스트 연결 / [SEP] O)

페르소나와 대화 데이터를 concat

In [None]:
# Preview for the first row: the pieces that will be joined as "<context> [SEP] <persona>".
sample_data = (data_df["context"][0], "[SEP]", data_df['persona'][0])
sample_data

In [None]:
# Join the preview pieces with single spaces.
joined_str = " ".join(sample_data)
joined_str

In [None]:
# Build "<context> [SEP] <persona>" strings for every row, then pair each one
# with its response as a [response, context] record.
persona_concat = []
persona_contexted = []

for context, persona_text in tqdm(zip(data_df["context"], data_df['persona']), total=len(data_df)):
    sample_data = (context, "[SEP]", persona_text)
    joined_str = " ".join(sample_data)
    persona_concat.append(joined_str)

for response, joined_context in tqdm(zip(data_df["response"], persona_concat), total=len(data_df)):
    persona_contexted.append([response, joined_context])

In [None]:
# Column names for the persona-augmented records.
columns = ['response', 'context'] 
columns

# DataFrame of [response, persona-augmented context] records; preview the first rows.
pdata_df = pd.DataFrame.from_records(persona_contexted, columns=columns)
pdata_df.head(5)

### Train / Test split

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed so the split is reproducible.
X_train, X_test = train_test_split(pdata_df, test_size=0.2, random_state=812)

In [None]:
# Renumber both splits from 0 so later code can index rows by position.
trn_df = X_train.reset_index(drop=True)
val_df = X_test.reset_index(drop=True)
trn_df

## Persona concat (페르소나와 대화 텍스트 연결, test에서 X / [SEP] O)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split the un-augmented frame (context without persona text) with the same seed and ratio.
# The returned frames keep the original row labels of data_df (the index is not reset here).
X1_train, X1_test = train_test_split(data_df, test_size=0.2, random_state=812)
X1_train

In [None]:
# Validation set for this variant: contexts are left without persona text.
val_df = X1_test.reset_index(drop=True)

In [None]:
# Build "<context> [SEP] <persona>" strings for the training split only, then pair
# each one with its response as a [response, context] record.
#
# X1_train comes straight from train_test_split, so its index holds the shuffled
# original row labels. Indexing with X1_train[col][i] for i in range(len(...)) is a
# label lookup: it raises KeyError for labels that went to the test split and does
# not follow the frame's row order. Iterating the columns directly walks the rows
# by position instead.
persona_concat = []
persona_contexted = []

for context, persona_text in tqdm(zip(X1_train["context"], X1_train['persona']), total=len(X1_train)):
    sample_data = (context, "[SEP]", persona_text)
    joined_str = " ".join(sample_data)
    persona_concat.append(joined_str)

for response, joined_context in tqdm(zip(X1_train["response"], persona_concat), total=len(X1_train)):
    persona_contexted.append([response, joined_context])

In [None]:
# Column names for the persona-augmented training records.
columns = ['response', 'context'] 
columns

# Training frame for this variant: responses paired with persona-augmented contexts.
trn_df = pd.DataFrame.from_records(persona_contexted, columns=columns)
trn_df.head(5)

## Persona concat (페르소나와 대화 텍스트 연결 / [SEP] X)

In [None]:
# Build "<context> <persona>" strings (no [SEP] marker) for every row, then pair
# each one with its response as a [response, context] record.
persona_concat = []
persona_contexted = []

for context, persona_text in tqdm(zip(data_df["context"], data_df['persona']), total=len(data_df)):
    sample_data = (context, persona_text)
    joined_str = " ".join(sample_data)
    persona_concat.append(joined_str)

for response, joined_context in tqdm(zip(data_df["response"], persona_concat), total=len(data_df)):
    persona_contexted.append([response, joined_context])

In [None]:
# Column names for the persona-augmented records.
columns = ['response', 'context'] 
columns

# DataFrame of [response, persona-augmented context] records; preview the first rows.
pdata_df = pd.DataFrame.from_records(persona_contexted, columns=columns)
pdata_df.head(5)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed so the split is reproducible.
X_train, X_test = train_test_split(pdata_df, test_size=0.2, random_state=812)

In [None]:
# Renumber both splits from 0 so later code can index rows by position.
trn_df = X_train.reset_index(drop=True)
val_df = X_test.reset_index(drop=True)
trn_df

## 공통 Fine-tuning 과정

### Fine-tuning parameters

In [None]:
# Plain attribute container standing in for argparse so the training script can run in a notebook.
class Args:
    """Hyper-parameters and run settings read by the training/evaluation helpers."""

    def __init__(self):
        # --- model / tokenizer sources and on-disk locations ---
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-medium'
        self.config_name = 'microsoft/DialoGPT-medium'
        self.tokenizer_name = 'microsoft/DialoGPT-medium'
        self.output_dir = 'output-small-save'
        self.cache_dir = 'cached'
        self.block_size = 512

        # --- which phases to run ---
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.eval_all_checkpoints = False
        self.should_continue = False

        # --- batching ---
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1

        # --- optimisation ---
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0

        # --- logging / checkpointing cadence ---
        self.logging_steps = 100
        self.save_steps = 350
        self.save_total_limit = None
        self.overwrite_output_dir = True
        self.overwrite_cache = True

        # --- hardware / reproducibility ---
        self.no_cuda = False
        self.seed = 812
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'


args = Args()

### Fine-tuning

In [None]:
def construct_conv(row, tokenizer, eos = True):
    """Encode one record into a single flat list of token ids.

    Every field of ``row`` is encoded and followed by the tokenizer's EOS id;
    the fields are emitted in reverse order (last field first). ``eos`` is
    accepted but not read.
    """
    encoded_turns = [tokenizer.encode(text) + [tokenizer.eos_token_id] for text in row]
    return [token_id for turn in reversed(encoded_turns) for token_id in turn]

class ConversationDataset(Dataset):
    """Dataset of tokenized conversations, one flat token-id sequence per DataFrame row.

    Each row is encoded with ``construct_conv`` and the resulting list is cached
    as a pickle under ``args.cache_dir``. Items are returned as 1-D ``torch.long``
    tensors of varying length.

    Args:
        tokenizer: tokenizer used to encode every field of a row.
        args: settings object; ``cache_dir``, ``model_type`` and
            ``overwrite_cache`` are read.
        df: DataFrame whose rows are the records to encode.
        block_size: only used to build the cache file name; examples are not
            truncated to this length.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        # NOTE(review): the file name does not depend on ``df``, so the train and
        # validation datasets share one cache file; this is only safe while
        # args.overwrite_cache is True.
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            # NOTE: pickle must only be used on cache files this code wrote itself.
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = [construct_conv(row, tokenizer) for _, row in df.iterrows()]

            # The cache directory may not exist yet; create it so the dump below
            # does not fail with FileNotFoundError.
            if directory:
                os.makedirs(directory, exist_ok=True)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of encoded conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return the ``item``-th conversation as a 1-D long tensor."""
        return torch.tensor(self.examples[item], dtype=torch.long)

In [None]:
# Caching and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    """Build a ConversationDataset from the validation frame if ``evaluate`` is truthy, else from the training frame."""
    if evaluate:
        frame = df_val
    else:
        frame = df_trn
    return ConversationDataset(tokenizer, args, frame)


def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch (plus all CUDA devices when ``args.n_gpu > 0``) with ``args.seed``."""
    seed = args.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    """List checkpoint paths under ``args.output_dir`` in ascending order.

    Paths matching ``<prefix>-*`` are ordered by modification time when
    ``use_mtime`` is set, otherwise by the integer after ``<prefix>-``; in the
    latter mode paths without such a number are left out.
    """
    glob_pattern = os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix))
    step_pattern = ".*{}-([0-9]+)".format(checkpoint_prefix)

    keyed_paths = []
    for candidate in glob.glob(glob_pattern):
        if use_mtime:
            keyed_paths.append((os.path.getmtime(candidate), candidate))
            continue
        step_match = re.match(step_pattern, candidate)
        if step_match and step_match.groups():
            keyed_paths.append((int(step_match.groups()[0]), candidate))

    keyed_paths.sort()
    return [candidate for _, candidate in keyed_paths]


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    """Delete the oldest checkpoints so that at most ``args.save_total_limit`` remain.

    Does nothing when the limit is unset, zero or negative, or when the number
    of existing checkpoints does not exceed it.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    # Oldest first, so the surplus to remove sits at the front of the list.
    existing = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(existing) - limit
    if surplus <= 0:
        return

    for stale_checkpoint in existing[:surplus]:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(stale_checkpoint))
        shutil.rmtree(stale_checkpoint)

In [None]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn=None, df_val=None) -> Tuple[int, float]:
    """Fine-tune ``model`` on ``train_dataset`` with a language-modeling loss.

    Batches are padded per batch, optimized with AdamW and a linear
    warmup/decay schedule, logged to TensorBoard every ``args.logging_steps``
    optimizer steps and checkpointed every ``args.save_steps`` steps under
    ``args.output_dir``.

    Args:
        args: settings object; must already carry ``n_gpu`` and ``device``.
            ``train_batch_size`` (and ``num_train_epochs`` when ``max_steps`` is
            positive) are written back onto it.
        train_dataset: dataset yielding 1-D token-id tensors.
        model: model to train; its embeddings are resized to ``len(tokenizer)``.
        tokenizer: tokenizer matching the model; saved next to each checkpoint.
        df_trn: optional training DataFrame, forwarded to ``evaluate`` when
            ``args.evaluate_during_training`` is set.
        df_val: optional validation DataFrame, forwarded to ``evaluate``.
            Evaluation during training is skipped when this is ``None``.

    Returns:
        ``(global_step, average_loss)``; the average is 0.0 when no optimizer
        step was taken.

    Raises:
        ImportError: if ``args.fp16`` is set and apex is not installed.
    """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        # NOTE(review): without a pad token, pad_sequence pads with 0, and the
        # padded batch is also used as labels below — so padding positions
        # contribute to the loss. Confirm this is intended.
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            # The path does not end in "-<step>": treat it as a fresh fine-tuning run.
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            # Language modeling: the padded batch serves as both inputs and labels.
            inputs, labels = (batch, batch)
            # Batches padded beyond 1024 tokens are skipped.
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        if df_val is None:
                            # evaluate() needs the validation frame; without it the
                            # call cannot be made, so skip instead of crashing.
                            logger.warning(
                                "evaluate_during_training is set but no df_val was passed to train(); "
                                "skipping evaluation."
                            )
                        else:
                            results = evaluate(args, model, tokenizer, df_trn, df_val)
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    # get_last_lr() is the supported accessor for the current learning rate.
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    if global_step == 0:
        # No optimizer step was taken (e.g. fewer examples than one full batch
        # with drop_last=True); avoid dividing by zero.
        logger.warning("No optimization step was performed; reporting an average loss of 0.0")
        return global_step, 0.0

    return global_step, tr_loss / global_step

# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    """Compute perplexity of ``model`` on the validation frame and write it to disk.

    The mean language-modeling loss over all evaluation batches is
    exponentiated to obtain the perplexity, which is logged and written to
    ``<args.output_dir>/<prefix>/eval_results.txt``.

    Args:
        args: settings object; must already carry ``n_gpu`` and ``device``.
            ``eval_batch_size`` is written back onto it.
        model: model to evaluate (wrapped in DataParallel when ``n_gpu > 1``).
        tokenizer: tokenizer used to encode and pad the data.
        df_trn: training DataFrame (only passed through to the dataset loader).
        df_val: validation DataFrame that is actually evaluated.
        prefix: sub-directory name under ``args.output_dir`` for the result file.

    Returns:
        ``{"perplexity": tensor}``.

    Raises:
        ValueError: if the validation data yields no full batch.
    """
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    # NOTE(review): drop_last=True discards the final partial batch, so up to
    # eval_batch_size - 1 validation examples are not scored.
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        # Language modeling: the padded batch serves as both inputs and labels.
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    if nb_eval_steps == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError(
            "Evaluation produced no batches: the validation set has fewer examples "
            "than eval_batch_size ({}).".format(args.eval_batch_size)
        )

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    # A non-empty prefix adds a sub-directory that may not exist yet.
    os.makedirs(os.path.dirname(output_eval_file), exist_ok=True)
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [None]:
def main(df_trn, df_val):
    """Run the full fine-tuning pipeline: setup, training, saving and evaluation.

    Args:
        df_trn: training DataFrame, passed to the dataset loader.
        df_val: validation DataFrame, passed to the dataset loader for evaluation.

    Returns:
        Dict of evaluation results; keys are the metric name suffixed with
        ``_<global_step>`` (the suffix is empty when a single checkpoint is evaluated).

    Raises:
        ValueError: if ``should_continue`` is set but no checkpoint exists, or if the
            output directory is non-empty and overwriting is not allowed.
    """
    # A fresh Args instance is built here; any module-level ``args`` object is not used.
    args = Args()
    
    # When resuming, start from the most recent checkpoint in the output directory.
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    # Refuse to write into a non-empty output directory unless explicitly allowed.
    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training
    # NOTE(review): the device is hard-coded to the third GPU ("cuda:2") and
    # args.no_cuda is not consulted — this presumably fails on machines with
    # fewer than three visible GPUs; confirm.
    device = torch.device("cuda:2")
    torch.cuda.set_device("cuda:2")
    args.n_gpu = 1
    args.device = device
    # NOTE(review): CUDA_VISIBLE_DEVICES is set after the device was already
    # selected above, so it presumably has no effect on this process — confirm.
    os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # index of the CUDA device to use

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load config, tokenizer and model weights, then move the model to the chosen device.
    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)
    
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        # NOTE(review): df_trn/df_val are not forwarded to train(), and train()'s own
        # call to evaluate() omits them, so evaluate_during_training=True would fail there.
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        # By default only the final model in output_dir is evaluated.
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            # Every directory under output_dir that contains a weights file.
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Suffix result keys with the step number only when several checkpoints are compared.
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

GPU 번호 설정

In [None]:
# Select the third GPU for this notebook session and record it on the module-level args.
# NOTE(review): main() builds its own Args instance, so the two attributes set on
# `args` here are not the ones main() reads.
device = torch.device("cuda:2")
torch.cuda.set_device("cuda:2")
args.n_gpu = 1
args.device = device
# NOTE(review): set after the device was already selected above, so it presumably
# does not re-map devices for this process — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # index of the CUDA device to use
# Make CUDA kernel launches synchronous so errors are reported at the failing call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
# Train and evaluate on whichever trn_df / val_df variant was prepared last.
main(trn_df, val_df)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split the un-augmented frame (context without persona text) with the same seed and ratio.
# The returned frames keep the original row labels of data_df (the index is not reset here).
X1_train, X1_test = train_test_split(data_df, test_size=0.2, random_state=812)
X1_train

In [None]:
# Validation set for this variant: contexts are left without persona text.
val_df = X1_test.reset_index(drop=True)

In [None]:
# Build "<context> [SEP] <persona>" strings for the training split only, then pair
# each one with its response as a [response, context] record.
#
# X1_train comes straight from train_test_split, so its index holds the shuffled
# original row labels. Indexing with X1_train[col][i] for i in range(len(...)) is a
# label lookup: it raises KeyError for labels that went to the test split and does
# not follow the frame's row order. Iterating the columns directly walks the rows
# by position instead.
persona_concat = []
persona_contexted = []

for context, persona_text in tqdm(zip(X1_train["context"], X1_train['persona']), total=len(X1_train)):
    sample_data = (context, "[SEP]", persona_text)
    joined_str = " ".join(sample_data)
    persona_concat.append(joined_str)

for response, joined_context in tqdm(zip(X1_train["response"], persona_concat), total=len(X1_train)):
    persona_contexted.append([response, joined_context])

In [None]:
# Column names for the persona-augmented training records.
columns = ['response', 'context'] 
columns

# Training frame for this variant: responses paired with persona-augmented contexts.
trn_df = pd.DataFrame.from_records(persona_contexted, columns=columns)
trn_df.head(5)

In [None]:
# Build "<context> <persona>" strings (no [SEP] marker) for every row, then pair
# each one with its response as a [response, context] record.
persona_concat = []
persona_contexted = []

for context, persona_text in tqdm(zip(data_df["context"], data_df['persona']), total=len(data_df)):
    sample_data = (context, persona_text)
    joined_str = " ".join(sample_data)
    persona_concat.append(joined_str)

for response, joined_context in tqdm(zip(data_df["response"], persona_concat), total=len(data_df)):
    persona_contexted.append([response, joined_context])

In [None]:
# Column names for the persona-augmented records.
columns = ['response', 'context'] 
columns

# DataFrame of [response, persona-augmented context] records; preview the first rows.
pdata_df = pd.DataFrame.from_records(persona_contexted, columns=columns)
pdata_df.head(5)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# 80/20 train/validation split with a fixed seed so the split is reproducible.
X_train, X_test = train_test_split(pdata_df, test_size=0.2, random_state=812)

In [None]:
# Renumber both splits from 0 so later code can index rows by position.
trn_df = X_train.reset_index(drop=True)
val_df = X_test.reset_index(drop=True)
trn_df

### Evaluation

In [None]:
# Validation inputs (contexts) and their reference replies, used for generation below.
test_query = val_df['context']
test_response = val_df['response']

In [None]:
# Generate one sampled reply per validation query with the fine-tuned model.
#
# The tokenizer and model are loaded once, before the loop; reloading them from
# disk for every query repeats identical work on each iteration.
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
# NOTE(review): Args.output_dir is 'output-small-save' while this loads
# 'output-medium-save' — confirm the directory holds the intended fine-tuned weights.
model = AutoModelWithLMHead.from_pretrained('output-medium-save')

test_chatbot = []

for i in range(len(test_query)):
    # Encode the query followed by the EOS token that marks the end of the turn.
    bot_input_ids = tokenizer.encode(test_query[i] + tokenizer.eos_token, return_tensors='pt')
    print("Patient: {} \n".format(test_query[i]))
    print("Reference:  {} \n".format(test_response[i]))

    # Sample a reply; prompt plus reply are capped at 100 tokens in total.
    chat_history_ids = model.generate(
        bot_input_ids, max_length=100,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=10,
        top_p=0.7,
        temperature = 0.8
    )

    # Keep only the newly generated tokens (everything after the prompt) and decode them.
    test_chatbot.append(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True))


In [None]:
# Pair every reference reply with the reply the model generated for the same query.
concat_contexted = [
    [test_response[i], test_chatbot[i]]
    for i in range(len(test_response))
]

concat_contexted

In [None]:
# Name of the prediction column for this run; the trailing comment lists the names
# used for the other experiment variants.
columns_name = ['ground_truth', 'pred_persona_conc'] # pred_persona_baseline, pred_persona_conc_test_concX, pred_persona_conc_notoken
# Table of reference vs. generated replies; preview the first rows.
test_result = pd.DataFrame.from_records(concat_contexted, columns=columns_name)
test_result.head(5)

In [None]:
# Load the saved prediction tables of the four experiment variants.
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere.
persona_baseline = pd.read_csv('/home/ldy/kdms2023/harry_persona/dataset/persona_baseline.csv')
persona_conc_notoken = pd.read_csv('/home/ldy/kdms2023/harry_persona/dataset/persona_conc_notoken.csv')
persona_conc_test_concX = pd.read_csv('/home/ldy/kdms2023/harry_persona/dataset/persona_conc_test_concX.csv')
persona_conc = pd.read_csv('/home/ldy/kdms2023/harry_persona/dataset/persona_conc.csv')

In [None]:
# Pull the ground-truth column and one prediction column from each variant's table.
p = persona_baseline["ground_truth"]
p1 = persona_baseline["pred_base"]
p2 = persona_conc_notoken["pred_persona_conc_notoken"]
# NOTE(review): this reads the same column name as p2 from a different file;
# the file name suggests "pred_persona_conc_test_concX" — verify against the CSV header.
p3 = persona_conc_test_concX["pred_persona_conc_notoken"]
p4 = persona_conc["pred_persona_conc"]

In [None]:
# Side-by-side table of the ground truth and every variant's predictions.
# The fourth column previously referenced `p5`, which is never defined (NameError);
# `p3` is the only prediction series not otherwise used, so it is the one meant here.
# NOTE(review): the METEOR helper later in this file reads a column named
# 'pred_persona_conc_test_concX', which this table does not provide under that
# name — confirm which column name downstream code expects.
datasets = pd.DataFrame({
    'GT': p,
    'pred_base': p1,
    'pred_persona_conc_notoken': p2,
    'pred_persona_conc_notoken_3': p3,
    'pred_persona_conc': p4,
})
datasets

In [None]:
import argparse
import logging
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelWithLMHead, AutoTokenizer
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
import torch.nn as nn # Dataparallel 때문에 추가함..

from tqdm import tqdm   # cs_dataloader 반복문 때문에 추가함..

from nltk.translate.bleu_score import sentence_bleu

from torch.utils import data

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import normalize
import torch.nn.functional as F
import torch.nn as nn

import torch
from transformers import AutoModel, AutoTokenizer
import csv
import gc
from transformers import BertModel
import pickle
import random

import evaluate
import rouge_score
from nltk.translate.meteor_score import meteor_score
from nltk.translate.meteor_score import single_meteor_score
from nltk.metrics.scores import precision
from nltk.metrics.scores import recall
from konlpy.tag import Mecab
from konlpy.tag import Komoran   
import nltk

# nltk.download('wordnet')

In [None]:
def meteor_score(predictions):
    """Print the mean per-row METEOR score of each prediction column.

    Each row is scored on its own against the ``GT`` text with the
    ``evaluate`` METEOR metric, and the per-row scores are averaged per column.

    NOTE(review): this definition shadows the ``meteor_score`` imported from
    ``nltk.translate.meteor_score`` in the import cell of this notebook.

    Args:
        predictions: table exposing the columns ``GT``, ``pred_base``,
            ``pred_persona_conc_notoken``, ``pred_persona_conc_test_concX``
            and ``pred_persona_conc`` (assumed to be a pandas DataFrame —
            TODO confirm).

    Returns:
        None. One line per column is printed; an empty table prints ``nan``
        for every column.
    """
    # Printed label -> column that holds the predictions reported under it.
    systems = {
        "meteor_bart: ": "pred_base",
        "meteor_bart2: ": "pred_persona_conc_notoken",
        "meteor_kcc: ": "pred_persona_conc_test_concX",
        "meteor_kobert: ": "pred_persona_conc",
    }

    # tolist() reads the rows by position, so a frame whose index is not
    # 0..n-1 (for example after filtering rows) is still walked row by row.
    references = predictions["GT"].tolist()
    candidates = [predictions[column].tolist() for column in systems.values()]

    if not references:
        # Nothing to average: report NaN instead of dividing by zero.
        for label in systems:
            print(label, float("nan"))
        return

    meteor = evaluate.load('meteor')
    totals = [0.0] * len(systems)
    rows = zip(references, *candidates)
    for reference, *row in tqdm(rows, total=len(references)):
        for slot, candidate in enumerate(row):
            result = meteor.compute(references=[reference], predictions=[candidate])
            totals[slot] += result['meteor']

    for label, total in zip(systems, totals):
        print(label, total / len(references))
    

def bert_score_f1(predictions):
    """Print the mean BERTScore F1 of each prediction column against ``GT``.

    Args:
        predictions: table exposing the columns ``GT``, ``pred_base``,
            ``pred_persona_conc_notoken``, ``pred_persona_conc_test_concX``
            and ``pred_persona_conc`` (assumed to be a pandas DataFrame —
            TODO confirm).

    Returns:
        None. One line per column is printed; an empty table prints ``nan``
        for every column.
    """
    # Printed label -> column that holds the predictions reported under it.
    systems = {
        "avg_bertscore_bart_f1_sum": "pred_base",
        "avg_bertscore_bart_f1_sum2": "pred_persona_conc_notoken",
        "avg_bertscore_kcc_f1_sum": "pred_persona_conc_test_concX",
        "avg_bertscore_kobert_f1_sum": "pred_persona_conc",
    }

    # tolist() reads the rows by position, so a frame whose index is not
    # 0..n-1 is still paired row by row with its reference.
    references = predictions['GT'].tolist()
    candidates = [predictions[column].tolist() for column in systems.values()]

    if not references:
        # Nothing to average: report NaN instead of dividing by zero.
        for label in systems:
            print(label, float("nan"))
        return

    bertscore = evaluate.load('bertscore')
    # Score every system first, then print, so the output lines stay together.
    results = [
        bertscore.compute(predictions=texts, references=references, lang="en")
        for texts in candidates
    ]
    for label, result in zip(systems, results):
        print(label, sum(result['f1']) / len(references))


def bert_score_recall(predictions):
    """Print the mean BERTScore recall of each prediction column against ``GT``.

    Args:
        predictions: table exposing the columns ``GT``, ``pred_base``,
            ``pred_persona_conc_notoken``, ``pred_persona_conc_test_concX``
            and ``pred_persona_conc`` (assumed to be a pandas DataFrame —
            TODO confirm).

    Returns:
        None. One line per column is printed; an empty table prints ``nan``
        for every column.
    """
    # Printed label -> column that holds the predictions reported under it.
    systems = {
        "avg_bertscore_bart_recall_sum": "pred_base",
        "avg_bertscore_bart_recall_sum2": "pred_persona_conc_notoken",
        "avg_bertscore_kcc_recall_sum": "pred_persona_conc_test_concX",
        "avg_bertscore_kobert_recall_sum": "pred_persona_conc",
    }

    # tolist() reads the rows by position, so a frame whose index is not
    # 0..n-1 is still paired row by row with its reference.
    references = predictions['GT'].tolist()
    candidates = [predictions[column].tolist() for column in systems.values()]

    if not references:
        # Nothing to average: report NaN instead of dividing by zero.
        for label in systems:
            print(label, float("nan"))
        return

    bertscore = evaluate.load('bertscore')
    # Score every system first, then print, so the output lines stay together.
    results = [
        bertscore.compute(predictions=texts, references=references, lang="en")
        for texts in candidates
    ]
    for label, result in zip(systems, results):
        print(label, sum(result['recall']) / len(references))


def bert_score_precision(predictions):
    """Print the mean BERTScore precision of each prediction column against ``GT``.

    Args:
        predictions: table exposing the columns ``GT``, ``pred_base``,
            ``pred_persona_conc_notoken``, ``pred_persona_conc_test_concX``
            and ``pred_persona_conc`` (assumed to be a pandas DataFrame —
            TODO confirm).

    Returns:
        None. One line per column is printed; an empty table prints ``nan``
        for every column.
    """
    # Printed label -> column that holds the predictions reported under it.
    systems = {
        "avg_bertscore_bart_precision_sum": "pred_base",
        "avg_bertscore_bart_precision_sum2": "pred_persona_conc_notoken",
        "avg_bertscore_kcc_precision_sum": "pred_persona_conc_test_concX",
        "avg_bertscore_kobert_precision_sum": "pred_persona_conc",
    }

    # tolist() reads the rows by position, so a frame whose index is not
    # 0..n-1 is still paired row by row with its reference.
    references = predictions['GT'].tolist()
    candidates = [predictions[column].tolist() for column in systems.values()]

    if not references:
        # Nothing to average: report NaN instead of dividing by zero.
        for label in systems:
            print(label, float("nan"))
        return

    bertscore = evaluate.load('bertscore')
    # Score every system first, then print, so the output lines stay together.
    results = [
        bertscore.compute(predictions=texts, references=references, lang="en")
        for texts in candidates
    ]
    for label, result in zip(systems, results):
        print(label, sum(result['precision']) / len(references))


def bleu_score(predictions):
    """Print the BLEU score of each prediction column against ``GT``.

    Each column is scored in a single ``evaluate`` BLEU call over all rows,
    with smoothing enabled and exactly one reference per prediction.

    Args:
        predictions: table exposing the columns ``GT``, ``pred_base``,
            ``pred_persona_conc_notoken``, ``pred_persona_conc_test_concX``
            and ``pred_persona_conc`` (assumed to be a pandas DataFrame —
            TODO confirm).

    Returns:
        None. One line per column is printed; an empty table prints ``nan``
        for every column.
    """
    # Printed label -> column that holds the predictions reported under it.
    systems = {
        "results_bart": "pred_base",
        "results_bart2": "pred_persona_conc_notoken",
        "results_kcc": "pred_persona_conc_test_concX",
        "results_kobert": "pred_persona_conc",
    }

    # tolist() reads the rows by position, so a frame whose index is not
    # 0..n-1 is still paired row by row with its reference.
    ground_truth = predictions['GT'].tolist()
    candidates = [predictions[column].tolist() for column in systems.values()]

    if not ground_truth:
        # No rows to score: report NaN rather than calling the metric on
        # empty input.
        for label in systems:
            print(label, float("nan"))
        return

    # The metric is given a list of references per prediction; each row has
    # exactly one.
    references = [[text] for text in ground_truth]

    bleu = evaluate.load("bleu")
    # `smooth` is a flag, so pass a real boolean rather than the string 'True'.
    results = [
        bleu.compute(predictions=texts, references=references, smooth=True)
        for texts in candidates
    ]
    for label, result in zip(systems, results):
        print(label, result['bleu'])



def rouge_score(predictions):
    """Print the ROUGE results of each prediction column against ``GT``.

    Each column is scored in a single ``evaluate`` ROUGE call over all rows
    with stemming enabled, and the returned result is printed as-is.

    NOTE(review): this definition shadows the ``rouge_score`` module imported
    in the import cell of this notebook.

    Args:
        predictions: table exposing the columns ``GT``, ``pred_base``,
            ``pred_persona_conc_notoken``, ``pred_persona_conc_test_concX``
            and ``pred_persona_conc`` (assumed to be a pandas DataFrame —
            TODO confirm).

    Returns:
        None. One line per column is printed.
    """
    # Printed label -> column that holds the predictions reported under it.
    systems = {
        "rouge_bart: ": "pred_base",
        "rouge_bart2: ": "pred_persona_conc_notoken",
        "rouge_kcc: ": "pred_persona_conc_test_concX",
        "rouge_kobert: ": "pred_persona_conc",
    }

    # tolist() reads the rows by position, so a frame whose index is not
    # 0..n-1 is still paired row by row with its reference.
    references = predictions['GT'].tolist()
    candidates = [predictions[column].tolist() for column in systems.values()]

    # Load the metric once and reuse it for every system.
    rouge = evaluate.load('rouge')
    results = [
        rouge.compute(predictions=texts, references=references, use_stemmer=True)
        for texts in candidates
    ]
    for label, result in zip(systems, results):
        print(label, result)




In [None]:
# NOTE(review): the metric cells below pass `predictions`, but no visible cell
# assigns it; define it (for example by re-enabling the load below) before
# running them.
# predictions = pd.read_csv("/home/ldy/kdms2023/harry_persona/dataset/results.csv")

In [None]:
# Print the averaged METEOR score of every prediction column.
meteor_score(predictions)

In [None]:
# Print the averaged BERTScore F1 of every prediction column.
bert_score_f1(predictions)

In [None]:
# Print the averaged BERTScore precision of every prediction column.
bert_score_precision(predictions)

In [None]:
# Print the BLEU score of every prediction column.
bleu_score(predictions)