In [None]:
import os
import json
import numpy as np

In [2]:
# Directory that holds the feature array, the label archive and the class-mapper JSON;
# every file below is joined onto this path before being opened.
DATA_DIR = "./Data/Pollinator-Data-CNN-Extracted"
# .npy file loaded with np.load as the feature matrix X.
# NOTE(review): the name suggests EfficientNet-B3 features — confirm against the extraction script.
FEATURES_FILE = "trainValTest_N=2197469_prod2_features_model=efficientnetB3_tsfrm=dataset_labeledOnly.npy"
# .npz archive loaded with np.load; its "y" entry is used as the label vector.
LABELS_FILE = "trainValTest_N=2197469_prod2_nObs=4242_labels_labeledOnly.npz"
# JSON file read by check_classes as a {class name: class id} mapping.
CLASS_MAPPER_FILE = "class_type_mapper.json"

In [30]:
def load_npy(file_dir, file_name):
    """Load a NumPy file stored at ``file_dir/file_name``.

    Args:
        file_dir: Directory containing the file.
        file_name: Name of the file inside ``file_dir``.

    Returns:
        Whatever ``np.load`` returns for that path (an array for ``.npy``
        files, an archive object for ``.npz`` files).
    """
    full_path = os.path.join(file_dir, file_name)
    loaded = np.load(full_path)
    return loaded

def check_classes(y, mapper_path=None):
    """Print a table of the classes present in ``y`` with their sample counts.

    The class-mapper JSON is read as a ``{class name: class id}`` dict and
    inverted so that each distinct value found in ``y`` can be shown with its
    name. A value of ``y`` that has no entry in the mapper is still listed,
    with the placeholder name ``<unknown>``, so ids, names and counts always
    stay aligned row by row.

    Args:
        y: Array-like of class ids (anything ``np.unique`` accepts).
        mapper_path: Optional path to the class-mapper JSON file. Defaults to
            ``CLASS_MAPPER_FILE`` inside ``DATA_DIR``.

    Returns:
        None. The table is written to stdout.
    """
    if mapper_path is None:
        mapper_path = os.path.join(DATA_DIR, CLASS_MAPPER_FILE)

    # Load class type mapper file (JSON text is UTF-8; do not rely on the
    # platform default encoding).
    with open(mapper_path, "r", encoding="utf-8") as f:
        name2id = json.load(f)

    # Reverse the mapper dict
    id2name = {v: k for k, v in name2id.items()}

    # Check which classes are present
    present_ids, counts = np.unique(y, return_counts=True)

    print("-" * 50)
    print(f"{'Class ID':<10}{'Class Name':<25}{'Num Samples':>12}")
    print("-" * 50)

    # Resolve the name per id inside the loop: building a separately filtered
    # name list and zipping it with the unfiltered ids/counts would pair names
    # with the wrong rows whenever an id is missing from the mapper.
    for class_id, count in zip(present_ids.tolist(), counts.tolist()):
        name = id2name.get(class_id, "<unknown>")
        print(f"{class_id:<10}{name:<25}{count:>12}")

    # Show sum of samples
    print("-" * 50)
    total = int(counts.sum())
    print(f"{'Total count of samples:':<35}{total:>12}")
    
        

In [16]:
# Load data
X = load_npy(DATA_DIR, FEATURES_FILE)
# The labels file is an .npz archive: read "y" inside a `with` block so the
# archive's file handle is closed as soon as the array has been extracted.
with load_npy(DATA_DIR, LABELS_FILE) as labels_archive:
    y = labels_archive["y"]
# Each feature row needs exactly one label; fail here rather than downstream.
assert X.shape[0] == y.shape[0], f"row mismatch: X has {X.shape[0]}, y has {y.shape[0]}"
X.shape, y.shape


((704361, 1536), (704361,))

In [31]:
# Print a per-class table (class id, class name, number of samples) for the labels in y
check_classes(y)

--------------------------------------------------
Class ID  Class Name                Num Samples
--------------------------------------------------
0         neg                            700990
1         Fourmis                           119
2         Coleoptere                       1110
3         Petite abeille sauvage            406
4         Syrphe                            680
5         Chenille                          321
6         Autre                             239
7         Diptere                           185
8         Lepidoptere                       147
9         Apis_mellifera                    116
10        Bourdon                            48
--------------------------------------------------
Total count of samples:                  704361
