In [None]:
from _init import *

import os, torch, random
# Limit the CUDA devices visible to this process to the one with index "1".
# NOTE(review): this assignment runs after `import torch` on the line above;
# presumably that is fine as long as CUDA has not been initialised yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from typing import List

from ranger.utils import json_utils
from ranger.vllm.vllm_engine import VllmEngine
from ranger.corag.corag_agent import CoragAgent
from ranger.corag.corag_result import ChainResult, QueryResult

In [None]:
def set_seed(seed: int):
    """Seed the random number generators used by this notebook.

    Seeds, in order: torch's default generator, torch's CUDA generator for
    the current device, and Python's built-in ``random`` module. A one-line
    confirmation is printed afterwards.

    Args:
        seed: Integer seed handed unchanged to every generator.
    """
    # Same order as before: torch -> torch.cuda -> random.
    for seeder in (torch.manual_seed, torch.cuda.manual_seed, random.seed):
        seeder(seed)

    print(f'set_seed() seed : {seed}')

# Single seed value for the notebook; later cells reuse `seed` for the
# engine (`vllm_engine._seed`) and for shuffling the loaded data.
seed = 42
set_seed(seed)

In [None]:
# --- Tunable generation settings -------------------------------------------
model_name = 'meta-llama/Llama-3.2-3B-Instruct'
# Index used to build the 'cuda:<n>' device string below.
# NOTE(review): presumably relative to the devices left visible by
# CUDA_VISIBLE_DEVICES, not the physical GPU number — confirm.
device = 0
dtype = 'float16'
max_seq_length = 4096
max_new_tokens = 128
temperature = 0.0
gpu_memory_utilization = 0.3

# Keyword arguments later forwarded one-to-one to VllmEngine(...).
vllm_config = dict(
    model_name=model_name,
    device=f'cuda:{device}',
    dtype=dtype,
    max_seq_length=max_seq_length,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    gpu_memory_utilization=gpu_memory_utilization,
    n_log_prob=20,
)

# Keyword arguments later forwarded to CoragAgent(...) alongside the engine.
corag_config = dict(
    top_k_query=20,
    top_k_sub_query=5,
    task_desc="answer multi-hop questions",
)

In [None]:
# Build the engine from the entries of `vllm_config`; each listed key is
# passed as the keyword argument of the same name, in this order.
vllm_engine = VllmEngine(**{
    key: vllm_config[key]
    for key in (
        'model_name',
        'device',
        'dtype',
        'max_seq_length',
        'max_new_tokens',
        'temperature',
        'gpu_memory_utilization',
        'n_log_prob',
    )
})

# Overwrite the engine's private `_seed` attribute with the notebook seed.
# NOTE(review): presumably the engine reads `_seed` when generating — confirm.
vllm_engine._seed = seed

In [None]:
# Create the agent on top of the engine; the three `corag_config` entries are
# passed as keyword arguments of the same name.
corag_agent = CoragAgent(
    engine=vllm_engine,
    **{
        key: corag_config[key]
        for key in ('top_k_query', 'top_k_sub_query', 'task_desc')
    }
)

In [None]:
def datas_shuffle(datas: list, seed: int):
    """Shuffle ``datas`` in place using a private, seeded generator.

    A dedicated ``random.Random(seed)`` instance is used, so the module-level
    ``random`` state is left untouched and the same seed always produces the
    same ordering for a list of a given length.

    Args:
        datas: List to reorder; it is modified in place.
        seed: Seed for the private generator.

    Returns:
        None.
    """
    random.Random(seed).shuffle(datas)


def load_datas(train_data_path: str, test_data_path: str, seed: int, do_print=False):
    """Load the train and test files and shuffle each one with ``seed``.

    Both files are read through ``json_utils.load_file`` (train first, then
    test); afterwards each loaded object is shuffled in place by
    ``datas_shuffle`` using the same seed.

    Args:
        train_data_path: Path handed to ``json_utils.load_file`` for the train set.
        test_data_path: Path handed to ``json_utils.load_file`` for the test set.
        seed: Seed forwarded to ``datas_shuffle`` for both sets.
        do_print: Accepted but not read anywhere in this function.

    Returns:
        Tuple ``(train_datas, test_datas)`` of the shuffled data.
    """
    # Read both files before shuffling either, matching the original order.
    train_datas, test_datas = (
        json_utils.load_file(path)
        for path in (train_data_path, test_data_path)
    )

    for loaded in (train_datas, test_datas):
        datas_shuffle(loaded, seed)

    return train_datas, test_datas

In [None]:
# Project locations as absolute paths.
# NOTE(review): these are hard-coded to one user's home directory; adjust
# them when running the notebook on another machine.
work_dir = f'/home/nlpshlee/dev_env/git/repos/ranger'
data_dir = f'{work_dir}/data'
out_dir = f'{work_dir}/output'

# Train / eval files (.jsonl) inside `data_dir`.
train_data_path = f'{data_dir}/custom_musique_train_5000_final.jsonl'
test_data_path = f'{data_dir}/custom_multihopqa_eval_1000.jsonl'
# Load both files and shuffle each with the notebook seed.
train_datas, test_datas = load_datas(train_data_path, test_data_path, seed, do_print=False)

In [None]:
def test_generate_batch(datas, n_chains, chain_depth, adapter_path=''):
    """Run ``corag_agent.generate_batch`` and hand back its result.

    Thin wrapper around the module-level ``corag_agent``: every argument is
    forwarded unchanged as the keyword argument of the same name.

    Args:
        datas: Input examples forwarded as ``datas``.
        n_chains: Forwarded as ``n_chains``.
        chain_depth: Forwarded as ``chain_depth``.
        adapter_path: Forwarded as ``adapter_path``; defaults to ``''``.

    Returns:
        Whatever ``generate_batch`` returns (annotated in the original code
        as ``List[QueryResult]``).
    """
    return corag_agent.generate_batch(
        datas=datas,
        n_chains=n_chains,
        chain_depth=chain_depth,
        adapter_path=adapter_path,
    )

In [None]:
def print_query_results(query_results: List[QueryResult]):
    """Print a human-readable dump of every query result and its chains.

    The total count is printed first. For each query result: its id, query
    text, answers, document ids and documents (one per line, 1-based index,
    newlines replaced by spaces). Then, tab-indented, each chain result:
    1-based chain index, sub-queries, sub-answers, document id lists, the
    documents per depth (indexed ``[depth][doc_idx]``, both 1-based, newlines
    replaced by spaces) and the final answers.

    Args:
        query_results: Objects exposing ``_query_id``, ``_query``,
            ``_answers``, ``_doc_ids``, ``_docs`` and ``_chain_results``.

    Returns:
        None; all output goes to stdout.
    """
    def _one_line(text):
        # Keep every document on a single output line.
        return text.replace('\n', ' ')

    print(f'query_results size : {len(query_results)}\n')

    for result in query_results:
        print(f'query_id : {result._query_id}')
        print(f'query : {result._query}')
        print(f'answers (len:{len(result._answers)}) : {result._answers}')
        print(f'doc_ids (len:{len(result._doc_ids)}) : {result._doc_ids}')
        print('docs :')
        for doc_no, text in enumerate(result._docs, start=1):
            print(f'[{doc_no}] : {_one_line(text)}')

        chains: List[ChainResult] = result._chain_results
        print(f'\n\tchain_results size : {len(chains)}\n')

        for chain_no, chain in enumerate(chains, start=1):
            print(f'\tchain_idx : {chain_no}')
            print(f'\tsub_querys (len:{len(chain._sub_querys)}) : {chain._sub_querys}')
            print(f'\tsub_answers (len:{len(chain._sub_answers)}) : {chain._sub_answers}')
            print(f'\tdoc_ids_list : {chain._doc_ids_list}')

            print('\tdocs_list :\n\t[depth][doc_idx]')
            for depth_no, depth_docs in enumerate(chain._docs_list, start=1):
                for doc_no, text in enumerate(depth_docs, start=1):
                    print(f'\t[{depth_no}][{doc_no}] : {_one_line(text)}')
            print(f'\tfinal_answers (len:{len(chain._final_answers)}) : {chain._final_answers}\n')

In [None]:
# Values forwarded as `n_chains` / `chain_depth` to generate_batch (both 5).
n_chains, chain_depth = 5, 5

# Baseline run on the first 10 shuffled training examples; `adapter_path`
# is omitted, so the wrapper's default '' is used.
query_results = test_generate_batch(train_datas[:10], n_chains, chain_depth)

In [None]:
# Directory forwarded as `adapter_path`; its name suggests a LoRA adapter — TODO confirm.
# NOTE(review): this path is under `outputs/`, while `out_dir` is defined
# with `output` — verify the difference is intended.
adapter_path = '/home/nlpshlee/dev_env/git/repos/ranger/outputs/test/lora_adapter_2025-12-03-07-13-16/'

# Same 10 examples and chain settings as the baseline run, now with `adapter_path` set.
query_results_adapter = test_generate_batch(train_datas[:10], n_chains, chain_depth, adapter_path)

In [None]:
# Save this output to a file so the runs can be compared with diff.
print_query_results(query_results_adapter)