In [3]:
import os
import numpy as np
import json

# 검사할 상위 폴더들
gpu_dirs = [
    "vector_paragraphs"
]

def count_jsonl(path):
    with open(path, "r") as f:
        return sum(1 for _ in f)

def check_folder(folder):
    print(f"\n=== Checking folder: {folder} ===")

    files = os.listdir(folder)

    # basename 목록 생성 (예: alex_morgan)
    basenames = set()
    for f in files:
        if f.endswith("_embeddings.npy"):
            basenames.add(f.replace("_embeddings.npy", ""))
        if f.endswith("_metadata.jsonl"):
            basenames.add(f.replace("_metadata.jsonl", ""))

    for name in basenames:
        npy_path = os.path.join(folder, f"{name}_embeddings.npy")
        jsonl_path = os.path.join(folder, f"{name}_metadata.jsonl")

        if not os.path.exists(npy_path) or not os.path.exists(jsonl_path):
            print(f"[MISSING] {name}: one of the files is missing")
            continue

        # Load lengths
        npy_len = np.load(npy_path).shape[0]
        jsonl_len = count_jsonl(jsonl_path)

        if npy_len == jsonl_len:
            print(f"[OK] {name} -> npy={npy_len}, jsonl={jsonl_len}")
        else:
            print(f"[MISMATCH] {name} -> npy={npy_len}, jsonl={jsonl_len}")

# 전체 폴더 검사
for d in gpu_dirs:
    if os.path.exists(d):
        check_folder(d)
    else:
        print(f"Folder not found: {d}")



=== Checking folder: vector_paragraphs ===
[OK] roger_federer -> npy=4370, jsonl=4370
[OK] mohamed_salah -> npy=2666, jsonl=2666
[OK] bob_bland -> npy=3107, jsonl=3107
[OK] emmanuel_macron -> npy=3863, jsonl=3863
[OK] andres_manuel_lopez -> npy=3244, jsonl=3244
[OK] maxine_waters -> npy=5047, jsonl=5047
[OK] guillermo_del_toro -> npy=2965, jsonl=2965
[OK] john_legend -> npy=5281, jsonl=5281
[OK] chip_gaines -> npy=3115, jsonl=3115
[OK] ann_mckee -> npy=2466, jsonl=2466
[OK] mahershala_ali -> npy=4835, jsonl=4835
[OK] jesmyn_ward -> npy=3750, jsonl=3750
[OK] julian_assange -> npy=4010, jsonl=4010
[OK] prince_harry -> npy=6205, jsonl=6205
[OK] jordan_peele -> npy=5471, jsonl=5471
[OK] judy_chicago -> npy=4164, jsonl=4164
[OK] glenn_close -> npy=2644, jsonl=2644
[OK] moon_jae_in -> npy=4308, jsonl=4308
[OK] sean_hannity -> npy=5331, jsonl=5331
[OK] christine_blasey_ford -> npy=5055, jsonl=5055
[OK] jennifer_hyman -> npy=3197, jsonl=3197
[OK] mohamed_bin_zayed -> npy=5971, jsonl=5971
[OK]

In [2]:
import os
import json
import shutil

gpu_dirs = [
    "vector_paragraphs_gpu0",
    "vector_paragraphs_gpu1",
    "vector_paragraphs_gpu2",
    "vector_paragraphs_gpu3",
]

target_dir = "vector_paragraphs_gpu0"

# -----------------------------
# 1. checkpoint.json 병합
# -----------------------------
merged_processed = set()

for d in gpu_dirs:
    ckpt_path = os.path.join(d, "checkpoint.json")
    if not os.path.exists(ckpt_path):
        print(f"[SKIP] {ckpt_path} 없음.")
        continue

    with open(ckpt_path, "r") as f:
        data = json.load(f)

    processed = data.get("processed_files", [])
    merged_processed.update(processed)
    print(f"[LOAD] {d}: {len(processed)} files")

# 정렬하여 병합 파일 생성
merged_list = sorted(list(merged_processed))
output_ckpt = os.path.join(target_dir, "checkpoint.json")

with open(output_ckpt, "w") as f:
    json.dump({"processed_files": merged_list}, f, indent=2)

print(f"\n[WRITE] 병합 checkpoint 저장 완료: {output_ckpt}")
print(f"총 파일 개수: {len(merged_list)}")

# -----------------------------
# 2. gpu1/2/3의 checkpoint.json 삭제
# -----------------------------
for d in gpu_dirs[1:]:
    ckpt_path = os.path.join(d, "checkpoint.json")
    if os.path.exists(ckpt_path):
        os.remove(ckpt_path)
        print(f"[DELETE] {ckpt_path} 삭제 완료")
    else:
        print(f"[SKIP] {ckpt_path} 없음")

# -----------------------------
# 3. gpu1/2/3의 모든 파일을 gpu0으로 이동
# -----------------------------
for d in gpu_dirs[1:]:
    for fname in os.listdir(d):
        src = os.path.join(d, fname)
        dst = os.path.join(target_dir, fname)

        # 충돌 방지를 위해 중복 파일이 있을 경우 이름 변경
        if os.path.exists(dst):
            base, ext = os.path.splitext(fname)
            new_name = f"{base}_from_{os.path.basename(d)}{ext}"
            dst = os.path.join(target_dir, new_name)
            print(f"[RENAME] {fname} → {new_name}")

        shutil.move(src, dst)
        print(f"[MOVE] {src} → {dst}")

    print(f"[CLEAR] {d} 폴더 파일 이동 완료")

print("\n[FINISHED] 모든 병합·삭제·이동 완료")


[LOAD] vector_paragraphs_gpu0: 91 files
[SKIP] vector_paragraphs_gpu1/checkpoint.json 없음.
[SKIP] vector_paragraphs_gpu2/checkpoint.json 없음.
[SKIP] vector_paragraphs_gpu3/checkpoint.json 없음.

[WRITE] 병합 checkpoint 저장 완료: vector_paragraphs_gpu0/checkpoint.json
총 파일 개수: 91
[SKIP] vector_paragraphs_gpu1/checkpoint.json 없음
[SKIP] vector_paragraphs_gpu2/checkpoint.json 없음
[SKIP] vector_paragraphs_gpu3/checkpoint.json 없음


FileNotFoundError: [Errno 2] No such file or directory: 'vector_paragraphs_gpu1'