# **Load Data**

This notebook walks through the following workflow:
1) Downloading ShapeNet data from Hugging Face
2) Processing data (generating SDF examples)
3) Uploading data to a Hugging Face repo

In [None]:
# general utilities
import os
import zipfile

# basic imports
import numpy as np
from tqdm import tqdm

# working with pointclouds
import point_cloud_utils as pcu

# ml
import torch

In [None]:
# ShapeNetCore category (synset) IDs; each one names a "<ID>.zip" archive in the
# ShapeNet/ShapeNetCore Hugging Face dataset.
CANS = "02946921"
AIRPLANES = "02691156"
# NOTE: "02691156" (previously assigned to CARS) is the airplane synset; cars are "02958343".
CARS = "02958343"
#...

**Download Dataset**

In [None]:
from huggingface_hub import hf_hub_download
from huggingface_hub import login


# Read the access token from the HF_TOKEN environment variable so the secret is
# not stored in the notebook; the placeholder is only a fallback to edit by hand.
TOKEN = os.environ.get("HF_TOKEN", "YOUR_HUGGING_FACE_ACCESS_TOKEN")
CATEGORY_ID = CARS
folder_name = "shapenet_cars"  # local folder that receives the archive and its contents

# Create the target folder up front so no manual setup is needed before running
os.makedirs(folder_name, exist_ok=True)

login(token=TOKEN)

# Download the category archive; the archive is named after the category ID.
# hf_hub_download returns the local path of the downloaded file, so use that
# instead of rebuilding the path by hand.
zip_path = hf_hub_download(
    repo_id="ShapeNet/ShapeNetCore",
    filename="{}.zip".format(CATEGORY_ID),
    repo_type="dataset",
    token=TOKEN,
    local_dir="./{}".format(folder_name),
)

# Extract the archive next to the zip file
extract_dir = folder_name

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

**Processing**

In [None]:
CATEGORY_ID = CARS
N_MODELS = 70  # set to None to process every model in the category
SEED = 0  # seeds the NumPy sampling below so repeated runs draw the same points

relative_data_path = "shapenet_cars"
category_path = os.path.join(os.getcwd(), relative_data_path, CATEGORY_ID)
out_path = os.path.join(os.getcwd(), relative_data_path, "out")
os.makedirs(out_path, exist_ok=True)

# Resolution used to convert shapes to watertight manifolds
manifold_resolution = 20_000 # Higher value means better quality and slower

# Number of points in the volume to sample around the shape
num_vol_pts = 30_000

# Number of points on the surface to sample
num_surf_pts = 10_000

# Number of points near the surface to sample
num_near_surf_pts = 10_000
# Standard deviation of the Gaussian noise added to surface samples
sigma = 0.05

# Dedicated generator for the NumPy draws (volume points and near-surface noise).
# NOTE(review): pcu.sample_mesh_random does its own sampling and is not
# controlled by this generator - confirm its seeding if full determinism matters.
rng = np.random.default_rng(SEED)

# os.listdir returns entries in arbitrary order; sort so that the first
# N_MODELS entries are the same subset on every run and every machine.
model_ids = sorted(os.listdir(category_path))
if N_MODELS is not None:
    model_ids = model_ids[:N_MODELS]

for model_id in tqdm(model_ids, desc="Processing models"):

    mesh_path = os.path.normpath(os.path.join(category_path, model_id, "models", "model_normalized.obj"))

    # Skip entries without a mesh file (stray files, incomplete models) instead
    # of aborting the whole run part-way through.
    if not os.path.isfile(mesh_path):
        tqdm.write("Skipping {}: no mesh found at {}".format(model_id, mesh_path))
        continue

    # Load mesh vertices and faces
    v, f = pcu.load_mesh_vf(mesh_path)
    # Convert mesh to watertight manifold
    vm, fm = pcu.make_mesh_watertight(v, f, manifold_resolution)

    # --- Volume sampling ---
    # Uniform random points in [-0.6, 0.6)^3 around the shape
    # NOTE: ShapeNet shapes are normalized within [-0.5, 0.5]^3
    p_vol = (rng.random((num_vol_pts, 3)) - 0.5) * 1.2

    # Compute the SDF of the random points
    sdf_vol, _, _ = pcu.signed_distance_to_mesh(p_vol, vm, fm)

    # Data matrix with coordinates and sdf values in format xyzs
    data_vol = np.concatenate((p_vol, sdf_vol.reshape(-1, 1)), axis=1)

    # --- Surface sampling ---
    # Sample points on the surface as face ids and barycentric coordinates
    fid_surf, bc_surf = pcu.sample_mesh_random(vm, fm, num_surf_pts)

    # Compute 3D coordinates of surface samples
    p_surf = pcu.interpolate_barycentric_coords(fm, fid_surf, bc_surf, vm)

    # Data matrix in format xyzs (sdf value = 0 on the surface)
    data_surf = np.concatenate((p_surf, np.zeros((num_surf_pts, 1))), axis=1)

    # --- Near-surface sampling ---
    # Sample points on the surface as face ids and barycentric coordinates
    fid_near_surf, bc_near_surf = pcu.sample_mesh_random(vm, fm, num_near_surf_pts)

    # Compute 3D coordinates of the surface samples, then jitter them off the
    # surface with Gaussian noise of standard deviation sigma
    p_near_surf = pcu.interpolate_barycentric_coords(fm, fid_near_surf, bc_near_surf, vm)
    noise = rng.normal(loc=0.0, scale=sigma, size=p_near_surf.shape)
    p_near_surf = p_near_surf + noise

    # Compute the SDF of the near surface points
    sdf_near_surf, _, _ = pcu.signed_distance_to_mesh(p_near_surf, vm, fm)

    # Data matrix with coordinates and sdf values in format xyzs
    data_near_surf = np.concatenate((p_near_surf, sdf_near_surf.reshape(-1, 1)), axis=1)

    # --- Saving ---
    # Stack volume, surface and near-surface rows into one float32 tensor
    data = np.concatenate((data_vol, data_surf, data_near_surf), axis=0)
    data = torch.from_numpy(data).float()

    # Save one "<model id>.pt" file per model
    torch.save(data, os.path.normpath(os.path.join(out_path, "{}.pt".format(model_id))))

Processing models:: 100%|██████████| 70/70 [03:32<00:00,  3.03s/it]


In [17]:
# Bundle every file under out_path into a single compressed archive
zip_filename = os.path.join(os.getcwd(), relative_data_path, "out.zip")

with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for current_dir, _subdirs, filenames in os.walk(out_path):
        for filename in filenames:
            source_path = os.path.join(current_dir, filename)
            # Store each entry under its path relative to out_path
            archive.write(source_path, os.path.relpath(source_path, start=out_path))

print(f"Zipped folder saved to: {zip_filename}")


Zipped folder saved to: /vast/palmer/home.mccleary/cpsc452_lrk42/cpsc452/shapenet_cars/out.zip


In [None]:
from huggingface_hub import create_repo
from huggingface_hub import HfApi, HfFolder, upload_folder, upload_file

# set dataset parameters
reponame = "carssdf"
username = "YOUR_HUGGING_FACE_USERNAME"
repo_id = "{}/{}".format(username, reponame)

# exist_ok=True keeps this cell re-runnable: without it, create_repo raises
# once the repo already exists.
create_repo(repo_id, repo_type="dataset", private=True, exist_ok=True)

# Upload all contents of the out folder to the root of the repo
upload_folder(
    repo_id=repo_id,
    folder_path=out_path,
    path_in_repo="",  # Optional: subfolder inside the repo
    repo_type="dataset"
)

# Upload the zipped out folder as a single file
upload_file(
    path_or_fileobj=zip_filename,
    path_in_repo="out.zip",
    repo_id=repo_id,  # reuse the id built above instead of formatting it again
    repo_type="dataset"
)

out.zip:   0%|          | 0.00/50.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lukasskellijs/carssdf/commit/0f680a13418ef565f4d240e8efd2ea794d3ea7d8', commit_message='Upload cars70.zip with huggingface_hub', commit_description='', oid='0f680a13418ef565f4d240e8efd2ea794d3ea7d8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/lukasskellijs/carssdf', endpoint='https://huggingface.co', repo_type='dataset', repo_id='lukasskellijs/carssdf'), pr_revision=None, pr_num=None)