## 모델 학습
- data_file_name_dict의 키 중 학습할 데이터셋 이름을 mode에 지정한 뒤, [ 1. Data Load, 2. Train, 3. Model HuggingFace Upload ] 과정을 데이터셋마다 반복
- Merging 하고자 하는 모델을 모두 학습

## 모델 병합
- 위 모델 학습 과정으로 얻은 모델 중 Merging 하고자 하는 각 모델의 (모델 이름, weight, density)를 yaml_data에 입력
- 모델 Merging 후 HuggingFace에 업로드

# 1. Data Load

In [None]:
from IPython.display import clear_output
import os
import sys

# Working directory of the notebook; also passed to data_load below.
path = os.path.abspath(os.getcwd())
# Search the working directory first when resolving imports (data_utils below).
sys.path.insert(0, path)

from data_utils import data_load, data_download

# GPUs made visible to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1,2,3'
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 forces synchronous kernel launches, which
# is normally a debugging aid and slows training — confirm it is still wanted.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Dataset name (the value assigned to `mode`) -> list of JSONL files for it.
data_file_name_dict = {
    'mcqa': ['data/mcqa_vf_data.jsonl'],
    'long': ['data/long_v6_data.jsonl'],
}
# Download every file of every dataset, not only the first entry of each list,
# because data_load below is given the complete list for the chosen mode.
data_download(
    [file_name for file_names in data_file_name_dict.values() for file_name in file_names]
)

model_id = "Qwen/Qwen2.5-7B"
# Which entry of data_file_name_dict to load in this run.
mode = 'mcqa'

train_dataset, valid_dataset = data_load(path, data_file_name_dict[mode], model_id)

# 2. Train

In [None]:
from parallel_model_utils import train_args_load, trainer_load

# Values handed to train_args_load below.
max_seq_length = 1200
epochs = 5

# Batch settings, passed to train_args_load as one dict.
batch_size = dict(
    per_device_train_batch_size=3,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=3,
)

# Build the training arguments first, then the trainer over the datasets
# produced in "1. Data Load".
training_args = train_args_load(
    path,
    batch_size,
    epochs,
    max_seq_length,
)
trainer = trainer_load(
    model_id,
    train_dataset,
    valid_dataset,
    training_args,
)

# Start training; kept as the last expression of the cell.
trainer.train()

# 3. Model HuggingFace Upload

In [None]:
import os

from parallel_model_merged import model_hf_upload

# Local path of the trained model to upload (placeholder checkpoint name —
# presumably replaced with the real checkpoint directory; confirm before running).
trained_model_path = 'save/checkpoint-00'
# Name under which model_hf_upload publishes the model.
hf_save_model_name = 'T-EVEN/finals_krx_Qwen2.5-7B-mcqa-VF'
# Read the token from the HF_TOKEN environment variable so a real credential is
# never saved inside the notebook; fall back to the original placeholder when
# the variable is not set.
hf_token = os.environ.get('HF_TOKEN', 'api_key')

model_hf_upload(trained_model_path, hf_save_model_name, hf_token=hf_token)

# 4. Model Merging and Upload

In [None]:
import os

from parallel_model_merged import create_yaml_file, model_merged, model_hf_upload

# One (model name, weight, density) tuple per model to merge; edit this list
# to change what goes into the merge.
merge_models = [
    ('Qwen/Qwen2.5-7B-Instruct', 0.3, 0.3),
    ('T-EVEN/finals_krx_Qwen2.5-7B-mcqa-VF', 0.4, 0.5),
    ("T-EVEN/finals_krx_Qwen2.5-7B-long-V4", 0.4, 0.5),
]

# Merge configuration written out by create_yaml_file. The "models" entries
# are expanded from merge_models so the per-model structure is spelled out once.
yaml_data = {
    "models": [
        {
            "model": model_name,
            "parameters": {
                "weight": weight,
                "density": density,
            },
        }
        for model_name, weight, density in merge_models
    ],
    "merge_method": "dare_ties",
    "base_model": "Qwen/Qwen2.5-7B",
    "parameters": {
        "normalize": True,
        "int8_mask": True,
    },
    "dtype": "bfloat16",
}

# Config file name, path passed to model_merged for the merged model, and the
# name used for the upload.
yaml_file_name = 'Mergekit_qwen.yaml'
merging_model_path = 'merged_model_v9'
hf_save_model_name = 'T-EVEN/finals_krx_Qwen2.5-7B-merged-V9'
# Read the token from the HF_TOKEN environment variable so a real credential is
# never saved inside the notebook; fall back to the original placeholder when
# the variable is not set.
hf_token = os.environ.get('HF_TOKEN', 'api_key')

# Write the config, run the merge from it, then upload the result.
create_yaml_file(yaml_data, yaml_file_name)
model_merged(merging_model_path, yaml_file_name)

model_hf_upload(merging_model_path, hf_save_model_name, hf_token=hf_token)

In [None]:
import json, torch, re, os, random
import pandas as pd
import numpy as np
from IPython.display import clear_output
from tqdm import tqdm

from data_utils import load_from_jsonl, save_to_jsonl

# Prefix prepended to each relative file path below.
path = '/home/ec2-user/SageMaker/'

# Input JSONL files, given relative to `path`.
long_unethical_file_path_lst = ['data_1124/long_unethical_1128.jsonl']

# Collect the items load_from_jsonl yields for each file into one flat list,
# preserving file order.
long_unethical_data = []
for rel_path in long_unethical_file_path_lst:
    long_unethical_data.extend(load_from_jsonl(path + rel_path))

# Hand the combined list to save_to_jsonl under the output file name.
save_to_jsonl('n_data/unethical_long_v1.jsonl', long_unethical_data)