In [None]:
!pip install -r requirements.txt

In [7]:
"""A package for downloading and processing Objaverse."""
"""Thanks Objaverse"""
"""
Paper:
Deitke M, Schwenk D, Salvador J, et al. Objaverse: A universe of annotated 3d objects[C]//Proceedings of the IEEE/CVF conference on computer vision and pattern recognition. 2023: 13142-13153.
"""
import glob
import gzip
import json
import multiprocessing
import os
import urllib.request
import warnings
from typing import Any, Dict, List, Optional, Tuple

from tqdm import tqdm

# Root directory for everything this notebook downloads. expanduser() only
# expands a leading "~", so "./" is left untouched and the resulting path is
# relative to the current working directory.
BASE_PATH = os.path.join(os.path.expanduser("./"), "myObjaverse")

# Placeholder value; no real version string has been substituted in.
__version__ = "<REPLACE_WITH_VERSION>"
# Every cached file (metadata shards, the object-path index, the LVIS
# annotations and the downloaded objects) is stored below this directory.
_VERSIONED_PATH = os.path.join(BASE_PATH, "hf-objaverse-v1")


def load_annotations(uids: Optional[List[str]] = None) -> Dict[str, Any]:
    """Load the metadata of objects in the dataset.

    Metadata is stored as gzipped JSON shards under
    ``<_VERSIONED_PATH>/metadata``. A shard that is not present locally is
    downloaded from the Hugging Face repo before it is read.

    Args:
        uids: A list of uids with which to load metadata. If None, it loads
            the metadata for all uids. Duplicates are ignored, and uids that
            are missing from the object-path index are skipped with a
            warning (the same policy ``load_objects`` applies).

    Returns:
        A dictionary mapping the uid to the metadata.
    """
    metadata_path = os.path.join(_VERSIONED_PATH, "metadata")

    wanted: Optional[List[str]] = None
    if uids is None:
        dir_ids = [f"{i // 1000:03d}-{i % 1000:03d}" for i in range(160)]
    else:
        # The object-path index is only needed to map uids to their shard.
        object_paths = _load_object_paths()
        wanted = []
        # dict.fromkeys de-duplicates while keeping the caller's order, so the
        # early-exit length check below is reliable.
        for uid in dict.fromkeys(uids):
            if uid not in object_paths:
                warnings.warn(f"Could not find object with uid {uid}. Skipping it.")
                continue
            wanted.append(uid)
        # The second path component of an object path names its shard.
        dir_ids = list(
            dict.fromkeys(object_paths[uid].split("/")[1] for uid in wanted)
        )

    if len(dir_ids) > 10:
        dir_ids = tqdm(dir_ids)

    out: Dict[str, Any] = {}
    for i_id in dir_ids:
        json_file = f"{i_id}.json.gz"
        local_path = os.path.join(metadata_path, json_file)
        if not os.path.exists(local_path):
            hf_url = f"https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/{i_id}.json.gz"
            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            # Download to a temporary name and move it into place afterwards,
            # so an interrupted download never leaves a truncated shard that
            # later runs would mistake for a complete one.
            tmp_path = local_path + ".tmp"
            urllib.request.urlretrieve(hf_url, tmp_path)
            os.replace(tmp_path, local_path)
        with gzip.open(local_path, "rb") as f:
            data = json.load(f)
        if wanted is not None:
            data = {uid: data[uid] for uid in wanted if uid in data}
        out.update(data)
        # Stop as soon as every requested uid has been found.
        if wanted is not None and len(out) == len(wanted):
            break
    return out


def _load_object_paths() -> Dict[str, str]:
    """Return the uid -> repo path index, downloading it on first use.

    The index is cached as a gzipped JSON file directly under
    ``_VERSIONED_PATH``; each value is the location of the object inside the
    Hugging Face repo.

    Returns:
        A dictionary mapping the uid to the object path.
    """
    filename = "object-paths.json.gz"
    cached_file = os.path.join(_VERSIONED_PATH, filename)
    if not os.path.exists(cached_file):
        # Fetch the index into the local cache.
        os.makedirs(os.path.dirname(cached_file), exist_ok=True)
        urllib.request.urlretrieve(
            f"https://huggingface.co/datasets/allenai/objaverse/resolve/main/{filename}",
            cached_file,
        )
    with gzip.open(cached_file, "rb") as fh:
        return json.load(fh)


def load_uids() -> List[str]:
    """Return every uid listed in the object-path index.

    Returns:
        A list of uids.
    """
    # Iterating a dict yields its keys.
    return list(_load_object_paths())


def _download_object(
    uid: str,
    object_path: str,
    total_downloads: float,
    start_file_count: int,
) -> Tuple[str, str]:
    """Download the object for the given uid and report progress.

    The file is first written next to its destination with a ``.tmp`` suffix
    and then moved into place, so a partially downloaded file never appears
    under the final name.

    Args:
        uid: The uid of the object to load.
        object_path: The path to the object in the Hugging Face repo.
        total_downloads: The number of objects in the current download batch;
            only used for the progress message.
        start_file_count: The number of ``.glb`` files that were already on
            disk before the batch started; only used for the progress message.

    Returns:
        A ``(uid, local_path)`` tuple, where ``local_path`` is where the
        object was downloaded.
    """
    local_path = os.path.join(_VERSIONED_PATH, object_path)
    tmp_local_path = os.path.join(_VERSIONED_PATH, object_path + ".tmp")
    hf_url = (
        f"https://huggingface.co/datasets/allenai/objaverse/resolve/main/{object_path}"
    )
    os.makedirs(os.path.dirname(tmp_local_path), exist_ok=True)
    urllib.request.urlretrieve(hf_url, tmp_local_path)

    # os.replace overwrites an existing destination on every platform, whereas
    # os.rename raises on Windows when the target already exists.
    os.replace(tmp_local_path, local_path)

    # Progress is derived from the files on disk, so the number is also
    # correct when several worker processes download concurrently.
    files = glob.glob(os.path.join(_VERSIONED_PATH, "glbs", "*", "*.glb"))
    print(
        "Downloaded",
        len(files) - start_file_count,
        "/",
        total_downloads,
        "objects",
    )

    return uid, local_path


def load_objects(uids: List[str], download_processes: int = 1) -> Dict[str, str]:
    """Return the path to the object files for the given uids.

    If the object is not already downloaded, it will be downloaded.

    Args:
        uids: A list of uids. A trailing ``.glb`` is stripped from each entry.
            uids that are not in the object-path index are skipped with a
            warning, and repeated uids are only processed once.
        download_processes: The number of processes to use to download the
            objects. With 1 the downloads run sequentially in this process;
            any other value is handed to ``multiprocessing.Pool``.

    Returns:
        A dictionary mapping the object uid to the local path of where the object
        downloaded.
    """
    object_paths = _load_object_paths()
    out: Dict[str, str] = {}
    # uid -> repo path for everything that still has to be fetched. Keying by
    # uid prevents the same object from being downloaded twice, which in the
    # multi-process case would make two workers write the same temporary file.
    pending: Dict[str, str] = {}
    for uid in uids:
        if uid.endswith(".glb"):
            uid = uid[:-4]
        if uid in out or uid in pending:
            continue
        if uid not in object_paths:
            warnings.warn(f"Could not find object with uid {uid}. Skipping it.")
            continue
        object_path = object_paths[uid]
        local_path = os.path.join(_VERSIONED_PATH, object_path)
        if os.path.exists(local_path):
            out[uid] = local_path
        else:
            pending[uid] = object_path

    if not pending:
        return out

    if download_processes != 1:
        print(
            f"starting download of {len(pending)} objects with {download_processes} processes"
        )
    # Baseline for the progress messages printed by _download_object.
    start_file_count = len(
        glob.glob(os.path.join(_VERSIONED_PATH, "glbs", "*", "*.glb"))
    )
    args_list = [
        (uid, object_path, len(pending), start_file_count)
        for uid, object_path in pending.items()
    ]
    if download_processes == 1:
        results = [_download_object(*args) for args in args_list]
    else:
        with multiprocessing.Pool(download_processes) as pool:
            results = pool.starmap(_download_object, args_list)
    for uid, local_path in results:
        out[uid] = local_path
    return out


def load_lvis_annotations() -> Dict[str, List[str]]:
    """Return the LVIS annotations, fetching them on first use.

    The annotations are cached as a gzipped JSON file directly under
    ``_VERSIONED_PATH``.

    Returns:
        A dictionary mapping the LVIS category to the list of uids in that category.
    """
    cache_file = os.path.join(_VERSIONED_PATH, "lvis-annotations.json.gz")
    # The cache directory is created whether or not a download is needed.
    os.makedirs(os.path.dirname(cache_file), exist_ok=True)
    if not os.path.exists(cache_file):
        urllib.request.urlretrieve(
            "https://huggingface.co/datasets/allenai/objaverse/resolve/main/lvis-annotations.json.gz",
            cache_file,
        )
    with gzip.open(cache_file, "rb") as fh:
        return json.load(fh)

In [None]:
# Load the UIDs of all objects in the dataset
uids = load_uids()
len(uids), type(uids)

In [None]:
# Show the first 5 UIDs
uids[:5]

In [None]:
# Load the metadata for the first 5 UIDs only
annotations = load_annotations(uids[:5])
annotations

In [None]:
# Inspect the metadata of the first object
annotations[uids[0]]

In [None]:
# Load the annotations of every object (calling without uids selects the whole dataset)
annotations = load_annotations()

In [None]:
# List the metadata fields available for one object
annotations[uids[1]].keys()

In [None]:
# Keep only the objects whose face count lies within [MIN_FACES, MAX_FACES].
MIN_FACES = 1
MAX_FACES = 700

# Parallel lists: index i of each list describes the same object.
objs = []
name = []
description = []
face = []
for uid, annotation in annotations.items():
    face_count = annotation.get("faceCount")
    # .get() returns None when the key is absent, and comparing None with an
    # int raises TypeError, so such entries are skipped explicitly.
    if face_count is None or not MIN_FACES <= face_count <= MAX_FACES:
        continue
    objs.append(uid)
    name.append(annotation.get("name"))
    description.append(annotation.get("description"))
    face.append(face_count)
# Only the last expression of a cell is displayed: show how many objects passed.
len(objs)

In [None]:
# Number of objects that passed the face-count filter
len(objs)

In [None]:
# Check how many CPU cores are available; the value is passed to load_objects
# as the number of download processes
import multiprocessing
processes = multiprocessing.cpu_count()
processes

In [None]:
# Optional: uncomment the lines below to work on a reproducible random sample
# of 100 filtered objects instead of the full filtered list.
# import random

# random.seed(42)

# object_uids = random.sample(objs, 100)

# object_uids

# Select a copy of the complete filtered uid list.
object_uids = objs[:]
len(object_uids)

In [None]:
# Fetch every selected object, one download process per CPU core; files that
# already exist locally are reused. The result maps uid -> local file path.
objects = load_objects(
    uids=object_uids,
    download_processes=processes
)
objects

遍历统计文件夹中glb文件数量

In [None]:
import os

# Walk a folder tree and count the .glb files it contains
def count_glb_files_in_subdirectories(root_folder):
    """Count the ``.glb`` files below ``root_folder``, including subdirectories.

    Args:
        root_folder: Path of the directory tree to scan. A path that
            ``os.walk`` cannot list contributes no files.

    Returns:
        The total number of files whose name ends in ``.glb``
        (case-insensitive). The same total is also printed.
    """
    total_count = 0
    for _root, _dirs, files in os.walk(root_folder):
        # True counts as 1, so the sum is the number of matches in this directory.
        total_count += sum(
            filename.lower().endswith('.glb') for filename in files
        )
    print(f"所有子目录中 .glb 文件的总数: {total_count}")
    return total_count

if __name__ == "__main__":
    # Path of the root folder to scan.
    # NOTE(review): this is an empty string, for which os.walk yields nothing,
    # so the reported total is always 0 — set it to the download directory.
    root_folder = ''
    count_glb_files_in_subdirectories(root_folder)

In [None]:
# Alias for the uids selected for download; its length bounds the export loop
# in the final cell.
num = object_uids
len(num)

查看Cap3D

In [None]:
import pandas as pd
# Cap3D captions CSV, read without a header row: column 0 is matched against
# object uids and column 1 holds the value displayed below.
captions = pd.read_csv('./Cap3D_automated_Objaverse_full.csv', header=None)
# Caption of the second filtered object (first matching row); raises
# IndexError when that uid has no row in the CSV.
text = captions[captions[0] == objs[1]][1].values[0]
text

创建exported_data文件

In [None]:
import os
import json
import pandas as pd

# Folder that receives exported_data.json. An empty string means the current
# working directory.
export_folder = ""

# os.makedirs("") raises FileNotFoundError, so a folder is only created when
# one is actually configured.
if export_folder:
    os.makedirs(export_folder, exist_ok=True)

export_file_path = os.path.join(export_folder, "exported_data.json")

captions = pd.read_csv('./Cap3D_automated_Objaverse_full.csv', header=None)

# Index the captions once instead of scanning the whole frame for every
# object. drop_duplicates keeps the first row per uid, which is the row the
# per-object boolean-mask lookup (`.values[0]`) used to select.
first_rows = captions.drop_duplicates(subset=[0])
caption_by_uid = dict(zip(first_rows[0], first_rows[1]))

# name and description are parallel to objs; look them up by uid so that each
# exported row describes the uid taken from num even if num is a subset or a
# reordering of objs.
info_by_uid = dict(zip(objs, zip(name, description)))

exported_data = []
error_count = 1  # running ordinal of skipped objects, shown in the message
for i, uid in enumerate(num):
    if uid not in caption_by_uid:
        # Message text kept unchanged so existing logs stay comparable.
        print(f"[{error_count}] +++00000 IndexError occurred at index {i}, skipping this data.")
        error_count += 1
        continue

    text = caption_by_uid[uid]
    obj_name, obj_description = info_by_uid[uid]
    print(f"{i} --- {text}")
    exported_data.append({
        "Object ID": uid,
        "Name": obj_name,
        "Description": obj_description,
        "text": text
    })

with open(export_file_path, 'w') as jsonfile:
    json.dump(exported_data, jsonfile, indent=4)