In [1]:
import os
import numpy as np
import json

# 검사할 상위 폴더들
gpu_dirs = [
    "vector_paragraphs_gpu0",
    "vector_paragraphs_gpu1",
    "vector_paragraphs_gpu2",
    "vector_paragraphs_gpu3",
]

def count_jsonl(path):
    with open(path, "r") as f:
        return sum(1 for _ in f)

def check_folder(folder):
    print(f"\n=== Checking folder: {folder} ===")

    files = os.listdir(folder)

    # basename 목록 생성 (예: alex_morgan)
    basenames = set()
    for f in files:
        if f.endswith("_embeddings.npy"):
            basenames.add(f.replace("_embeddings.npy", ""))
        if f.endswith("_metadata.jsonl"):
            basenames.add(f.replace("_metadata.jsonl", ""))

    for name in basenames:
        npy_path = os.path.join(folder, f"{name}_embeddings.npy")
        jsonl_path = os.path.join(folder, f"{name}_metadata.jsonl")

        if not os.path.exists(npy_path) or not os.path.exists(jsonl_path):
            print(f"[MISSING] {name}: one of the files is missing")
            continue

        # Load lengths
        npy_len = np.load(npy_path).shape[0]
        jsonl_len = count_jsonl(jsonl_path)

        if npy_len == jsonl_len:
            print(f"[OK] {name} -> npy={npy_len}, jsonl={jsonl_len}")
        else:
            print(f"[MISMATCH] {name} -> npy={npy_len}, jsonl={jsonl_len}")

# 전체 폴더 검사
for d in gpu_dirs:
    if os.path.exists(d):
        check_folder(d)
    else:
        print(f"Folder not found: {d}")



=== Checking folder: vector_paragraphs_gpu0 ===
[OK] kerry_james_marshall -> npy=3629, jsonl=3629
[OK] donald_trump -> npy=26516, jsonl=26516
[OK] guillermo_del_toro -> npy=2965, jsonl=2965
[OK] barbara_rae_venter -> npy=2635, jsonl=2635
[OK] julian_assange -> npy=4010, jsonl=4010
[OK] ashley_graham -> npy=5068, jsonl=5068
[OK] barbara_lynch -> npy=3256, jsonl=3256
[OK] chip_gaines -> npy=3115, jsonl=3115
[OK] alicia_keys -> npy=2544, jsonl=2544
[OK] bernard_tyson -> npy=2711, jsonl=2711
[OK] andres_manuel_lopez -> npy=3244, jsonl=3244
[OK] juan_guaido -> npy=2652, jsonl=2652
[OK] emilia_clarke -> npy=3856, jsonl=3856
[OK] robert_mueller -> npy=2974, jsonl=2974
[OK] ruth_davidson -> npy=3696, jsonl=3696
[OK] john_legend -> npy=5281, jsonl=5281
[OK] glenn_close -> npy=2644, jsonl=2644
[OK] jian_wei_pan -> npy=3397, jsonl=3397
[OK] justin_trudeau -> npy=4983, jsonl=4983
[OK] chuck_schumer -> npy=2254, jsonl=2254
[OK] glenda_gray -> npy=2277, jsonl=2277
[OK] david_hogg -> npy=2043, jsonl

In [2]:
import os
import json
import shutil

gpu_dirs = [
    "vector_paragraphs_gpu0",
    "vector_paragraphs_gpu1",
    "vector_paragraphs_gpu2",
    "vector_paragraphs_gpu3",
]

target_dir = "vector_paragraphs_gpu0"

# -----------------------------
# 1. checkpoint.json 병합
# -----------------------------
merged_processed = set()

for d in gpu_dirs:
    ckpt_path = os.path.join(d, "checkpoint.json")
    if not os.path.exists(ckpt_path):
        print(f"[SKIP] {ckpt_path} 없음.")
        continue

    with open(ckpt_path, "r") as f:
        data = json.load(f)

    processed = data.get("processed_files", [])
    merged_processed.update(processed)
    print(f"[LOAD] {d}: {len(processed)} files")

# 정렬하여 병합 파일 생성
merged_list = sorted(list(merged_processed))
output_ckpt = os.path.join(target_dir, "checkpoint.json")

with open(output_ckpt, "w") as f:
    json.dump({"processed_files": merged_list}, f, indent=2)

print(f"\n[WRITE] 병합 checkpoint 저장 완료: {output_ckpt}")
print(f"총 파일 개수: {len(merged_list)}")

# -----------------------------
# 2. gpu1/2/3의 checkpoint.json 삭제
# -----------------------------
for d in gpu_dirs[1:]:
    ckpt_path = os.path.join(d, "checkpoint.json")
    if os.path.exists(ckpt_path):
        os.remove(ckpt_path)
        print(f"[DELETE] {ckpt_path} 삭제 완료")
    else:
        print(f"[SKIP] {ckpt_path} 없음")

# -----------------------------
# 3. gpu1/2/3의 모든 파일을 gpu0으로 이동
# -----------------------------
for d in gpu_dirs[1:]:
    for fname in os.listdir(d):
        src = os.path.join(d, fname)
        dst = os.path.join(target_dir, fname)

        # 충돌 방지를 위해 중복 파일이 있을 경우 이름 변경
        if os.path.exists(dst):
            base, ext = os.path.splitext(fname)
            new_name = f"{base}_from_{os.path.basename(d)}{ext}"
            dst = os.path.join(target_dir, new_name)
            print(f"[RENAME] {fname} → {new_name}")

        shutil.move(src, dst)
        print(f"[MOVE] {src} → {dst}")

    print(f"[CLEAR] {d} 폴더 파일 이동 완료")

print("\n[FINISHED] 모든 병합·삭제·이동 완료")


[LOAD] vector_paragraphs_gpu0: 15 files
[LOAD] vector_paragraphs_gpu1: 10 files
[LOAD] vector_paragraphs_gpu2: 14 files
[LOAD] vector_paragraphs_gpu3: 9 files

[WRITE] 병합 checkpoint 저장 완료: vector_paragraphs_gpu0/checkpoint.json
총 파일 개수: 48
[DELETE] vector_paragraphs_gpu1/checkpoint.json 삭제 완료
[DELETE] vector_paragraphs_gpu2/checkpoint.json 삭제 완료
[DELETE] vector_paragraphs_gpu3/checkpoint.json 삭제 완료
[MOVE] vector_paragraphs_gpu1/emilia_clarke_embeddings.npy → vector_paragraphs_gpu0/emilia_clarke_embeddings.npy
[MOVE] vector_paragraphs_gpu1/emilia_clarke_metadata.jsonl → vector_paragraphs_gpu0/emilia_clarke_metadata.jsonl
[MOVE] vector_paragraphs_gpu1/emily_comer_embeddings.npy → vector_paragraphs_gpu0/emily_comer_embeddings.npy
[MOVE] vector_paragraphs_gpu1/emily_comer_metadata.jsonl → vector_paragraphs_gpu0/emily_comer_metadata.jsonl
[MOVE] vector_paragraphs_gpu1/emma_gonzalez_embeddings.npy → vector_paragraphs_gpu0/emma_gonzalez_embeddings.npy
[MOVE] vector_paragraphs_gpu1/emma_gonzal