In [73]:
import numpy as np

In [93]:
# Open both .npz archives: the full dataset and the fine-tuning subset that
# holds the per-fold train/val arrays. The handles are kept open because
# later cells index into them by key.
data_original = np.load("data/original/tissuemnist_224.npz")
data_sample = np.load("data/medmnist_data_subsets/tissuemnist_fine-tune_224.npz")

# List the array names stored in each archive, original first.
for archive in (data_original, data_sample):
    print(archive.files)

['train_images', 'train_labels', 'val_images', 'val_labels', 'test_images', 'test_labels']
['train_imgs_fold1', 'train_labels_fold1', 'val_imgs_fold1', 'val_labels_fold1', 'train_imgs_fold2', 'train_labels_fold2', 'val_imgs_fold2', 'val_labels_fold2', 'train_imgs_fold3', 'train_labels_fold3', 'val_imgs_fold3', 'val_labels_fold3', 'train_imgs_fold4', 'train_labels_fold4', 'val_imgs_fold4', 'val_labels_fold4', 'train_imgs_fold5', 'train_labels_fold5', 'val_imgs_fold5', 'val_labels_fold5']


In [94]:
# --- Original data source -------------------------------------------------
# Per-class [count, proportion] for the training labels of the original data.
values_original, counts_original = np.unique(
    data_original["train_labels"], return_counts=True
)
total_original = counts_original.sum()
dist_original = {
    label: [n, n / total_original]
    for label, n in zip(values_original, counts_original)
}

print("Original distribution:")
for label, stats in dist_original.items():
    print(f"Class {label}: Count = {stats[0]}, Proportion = {stats[1]}")

print("---------------------")
print("Train")

# --- Sample ---------------------------------------------------------------
avg_dist_train = {}    # class -> count summed over all folds
dist_folds_train = {}  # fold number -> {class: [count, proportion]}
total_count_train = 0  # number of training labels summed over all folds

for fold in range(1, 6):
    labels, label_counts = np.unique(
        data_sample[f"train_labels_fold{fold}"], return_counts=True
    )
    fold_total = label_counts.sum()

    per_class = {}
    for label, n in zip(labels, label_counts):
        per_class[label] = [n, n / fold_total]
        # Accumulate the pooled (all-folds) tallies alongside the per-fold ones.
        avg_dist_train[label] = avg_dist_train[label] + n if label in avg_dist_train else n
        total_count_train += n
    dist_folds_train[fold] = per_class

for fold in range(1, 6):
    print(f"\nFold {fold} distribution:")
    per_class = dist_folds_train[fold]
    for label in sorted(per_class):
        n, share = per_class[label]
        print(f"Class {label}: Count = {n}, Proportion = {share}")

print("--------------------")
print("Average Train")
# The count shown here is the sum over the folds (not a per-fold mean); the
# proportion is that sum divided by the pooled number of training labels.
for label in sorted(avg_dist_train):
    pooled = avg_dist_train[label]
    print(f"Class {label}: Count: {pooled}, Proportion: {pooled/total_count_train}")

Original distribution:
Class 0: Count = 53075, Proportion = 0.3207607605187773
Class 1: Count = 7814, Proportion = 0.04722420315956148
Class 2: Count = 5866, Proportion = 0.03545139182671969
Class 3: Count = 15406, Proportion = 0.09310674096188945
Class 4: Count = 11789, Proportion = 0.07124726529921555
Class 5: Count = 7705, Proportion = 0.04656545755623512
Class 6: Count = 39203, Proportion = 0.23692480630461848
Class 7: Count = 24608, Proportion = 0.14871937437298297
---------------------
Train

Fold 1 distribution:
Class 0: Count = 264, Proportion = 0.33
Class 1: Count = 35, Proportion = 0.04375
Class 2: Count = 32, Proportion = 0.04
Class 3: Count = 74, Proportion = 0.0925
Class 4: Count = 63, Proportion = 0.07875
Class 5: Count = 26, Proportion = 0.0325
Class 6: Count = 184, Proportion = 0.23
Class 7: Count = 122, Proportion = 0.1525

Fold 2 distribution:
Class 0: Count = 248, Proportion = 0.31
Class 1: Count = 34, Proportion = 0.0425
Class 2: Count = 29, Proportion = 0.03625
Cla

In [95]:
# Class balance of the validation labels: first for the original data, then
# for each of the five sampled folds, then pooled over the folds.

# Original data source
dist_original_val = {}  # key: class, value: [count, proportion]
# The counts are bound to `counts_original_val` (matching the `_val` suffix
# used by every other name in this cell) so that running this cell does not
# overwrite the training-split `counts_original` computed in the train cell.
values_original_val, counts_original_val = np.unique(data_original["val_labels"], return_counts=True)
total_original_val = counts_original_val.sum()

for val, count in zip(values_original_val, counts_original_val):
    dist_original_val[val] = [count, count / total_original_val]

print("Original distribution Val:")
for k, v in dist_original_val.items():
    print(f"Class {k}: Count = {v[0]}, Proportion = {v[1]}")

print("---------------------")
print("Validation")


# Sample
avg_dist_val = {} # key: class, value: count summed over all folds
dist_folds_val = {}  # key: fold number, value: {class: [count, proportion]}
total_count_val = 0  # number of validation labels summed over all folds

for i in range(1, 6):
    key = f"val_labels_fold{i}"
    values, counts = np.unique(data_sample[key], return_counts=True)
    total = counts.sum()

    fold_dist = {}
    for val, count in zip(values, counts):
        # Proportion is relative to this fold only.
        fold_dist[val] = [count, count / total]
        # Pooled tallies across folds, used for the summary printed last.
        if val not in avg_dist_val:
            avg_dist_val[val] = count
        else:
            avg_dist_val[val] += count
        total_count_val += count
    dist_folds_val[i] = fold_dist

for i in range(1, 6):
    print(f"\nFold {i} distribution:")
    for k in sorted(dist_folds_val[i]):
        count, proportion = dist_folds_val[i][k]
        print(f"Class {k}: Count = {count}, Proportion = {proportion}")


print("--------------------")
print("Average val")
# The count shown is the sum over the folds (not a per-fold mean); the
# proportion is that sum divided by the pooled number of validation labels.
for i in sorted(avg_dist_val):
    print(f"Class {i}: Count: {avg_dist_val[i]}, Proportion: {avg_dist_val[i]/total_count_val}")


Original distribution Val:
Class 0: Count = 7582, Proportion = 0.3207275803722504
Class 1: Count = 1117, Proportion = 0.04725042301184433
Class 2: Count = 838, Proportion = 0.03544839255499154
Class 3: Count = 2201, Proportion = 0.09310490693739425
Class 4: Count = 1684, Proportion = 0.07123519458544839
Class 5: Count = 1101, Proportion = 0.0465736040609137
Class 6: Count = 5601, Proportion = 0.23692893401015228
Class 7: Count = 3516, Proportion = 0.14873096446700507
---------------------
Validation

Fold 1 distribution:
Class 0: Count = 55, Proportion = 0.275
Class 1: Count = 6, Proportion = 0.03
Class 2: Count = 5, Proportion = 0.025
Class 3: Count = 16, Proportion = 0.08
Class 4: Count = 14, Proportion = 0.07
Class 5: Count = 18, Proportion = 0.09
Class 6: Count = 63, Proportion = 0.315
Class 7: Count = 23, Proportion = 0.115

Fold 2 distribution:
Class 0: Count = 60, Proportion = 0.3
Class 1: Count = 9, Proportion = 0.045
Class 2: Count = 7, Proportion = 0.035
Class 3: Count = 15, 

In [96]:
# Absolute gap between each training fold's class proportions and the
# proportions of the original training labels.
original_props = {cls: stats[1] for cls, stats in dist_original.items()}
all_diffs = []  # one entry per (fold, class) pair

for i in range(1, 6):
    fold_props = dist_folds_train[i]
    print(f"\nFold {i} differences:")

    # Only classes present in the original data are compared.
    for cls in sorted(original_props):
        orig = original_props[cls]
        # A class absent from this fold counts as proportion 0.0.
        fold_entry = fold_props.get(cls, [0, 0.0])
        train = fold_entry[1]
        diff = abs(train - orig)
        all_diffs.append(diff)
        print(f"Class {cls}: Original = {orig:.6f}, Train = {train:.6f}, Diff = {diff:.6f}")

# Summary over every fold/class pair collected above.
print("\n====== Summary Across All Train Folds ======")
print(f"Max Difference: {max(all_diffs):.6f}")
print(f"Avg Difference: {sum(all_diffs)/len(all_diffs):.6f}")



Fold 1 differences:
Class 0: Original = 0.320761, Train = 0.330000, Diff = 0.009239
Class 1: Original = 0.047224, Train = 0.043750, Diff = 0.003474
Class 2: Original = 0.035451, Train = 0.040000, Diff = 0.004549
Class 3: Original = 0.093107, Train = 0.092500, Diff = 0.000607
Class 4: Original = 0.071247, Train = 0.078750, Diff = 0.007503
Class 5: Original = 0.046565, Train = 0.032500, Diff = 0.014065
Class 6: Original = 0.236925, Train = 0.230000, Diff = 0.006925
Class 7: Original = 0.148719, Train = 0.152500, Diff = 0.003781

Fold 2 differences:
Class 0: Original = 0.320761, Train = 0.310000, Diff = 0.010761
Class 1: Original = 0.047224, Train = 0.042500, Diff = 0.004724
Class 2: Original = 0.035451, Train = 0.036250, Diff = 0.000799
Class 3: Original = 0.093107, Train = 0.092500, Diff = 0.000607
Class 4: Original = 0.071247, Train = 0.068750, Diff = 0.002497
Class 5: Original = 0.046565, Train = 0.042500, Diff = 0.004065
Class 6: Original = 0.236925, Train = 0.255000, Diff = 0.01807

In [97]:
# Absolute gap between each validation fold's class proportions and the
# proportions of the original validation labels.
original_props = {cls: stats[1] for cls, stats in dist_original_val.items()}
all_diffs = []  # one entry per (fold, class) pair

for i in range(1, 6):
    fold_props = dist_folds_val[i]
    print(f"\nFold {i} differences:")

    # Only classes present in the original data are compared.
    for cls in sorted(original_props):
        orig = original_props[cls]
        # A class absent from this fold counts as proportion 0.0.
        fold_entry = fold_props.get(cls, [0, 0.0])
        val = fold_entry[1]
        diff = abs(val - orig)
        all_diffs.append(diff)
        print(f"Class {cls}: Original = {orig:.6f}, Val = {val:.6f}, Diff = {diff:.6f}")

# Summary over every fold/class pair collected above.
print("\n====== Summary Across All Val Folds ======")
print(f"Max Difference: {max(all_diffs):.6f}")
print(f"Avg Difference: {sum(all_diffs)/len(all_diffs):.6f}")



Fold 1 differences:
Class 0: Original = 0.320728, Val = 0.275000, Diff = 0.045728
Class 1: Original = 0.047250, Val = 0.030000, Diff = 0.017250
Class 2: Original = 0.035448, Val = 0.025000, Diff = 0.010448
Class 3: Original = 0.093105, Val = 0.080000, Diff = 0.013105
Class 4: Original = 0.071235, Val = 0.070000, Diff = 0.001235
Class 5: Original = 0.046574, Val = 0.090000, Diff = 0.043426
Class 6: Original = 0.236929, Val = 0.315000, Diff = 0.078071
Class 7: Original = 0.148731, Val = 0.115000, Diff = 0.033731

Fold 2 differences:
Class 0: Original = 0.320728, Val = 0.300000, Diff = 0.020728
Class 1: Original = 0.047250, Val = 0.045000, Diff = 0.002250
Class 2: Original = 0.035448, Val = 0.035000, Diff = 0.000448
Class 3: Original = 0.093105, Val = 0.075000, Diff = 0.018105
Class 4: Original = 0.071235, Val = 0.105000, Diff = 0.033765
Class 5: Original = 0.046574, Val = 0.050000, Diff = 0.003426
Class 6: Original = 0.236929, Val = 0.240000, Diff = 0.003071
Class 7: Original = 0.148731