# Essai normalisation 
Le but est d'avoir 1/3 de chaque classe

In [1]:
import numpy as np
from collections import Counter
from lblearn.datasets import load_galaxies

In [3]:
# Load the galaxy data set through the project loader.
# NOTE(review): with `balance=True, n_samples=100` the class counts displayed
# further down are already equal (100 each) — presumably the loader balances
# the classes itself; confirm against `load_galaxies`.
galaxies = load_galaxies(balance=True, n_samples=100)

In [4]:
# `y` holds the class label of every sample; `count` maps each label to the
# number of samples carrying it.
y = galaxies.target
count = Counter(y)

In [5]:
# `most_common()` returns a fresh list of (label, count) pairs sorted by
# descending count, so its last entry is the least represented class.
# Among classes with equal counts, the one encountered last in `y` wins.
min_class, min_val = count.most_common().pop()

In [6]:
# Report which class limits the size of a balanced subset.
print(f"The least represented class is {min_class} with {min_val} datapoints")

The least represented class is 2 with 100 datapoints


In [7]:
# Display the full per-class tally.
count

Counter({0: 100, 1: 100, 2: 100})

The least represented class is the spiral galaxies, while the most represented one is the 'uncertain' type. We want to equalize the counts, so the final data set should contain three times the size of the smallest class (e.g. $3 \times 62190 = 186570$ when the smallest class has 62190 samples).

We then need to sample $N$ indices from each class, where $N$ is the size of the least represented class.

In [28]:
# Draw `min_val` positional indices from each of the three classes.
# - `replace=False`: the default of `choice` samples WITH replacement, which
#   would put duplicated rows into the "balanced" subset while silently
#   dropping others. `min_val` is the size of the smallest class, so every
#   class has at least `min_val` indices to draw from.
# - A seeded generator makes the subset reproducible on Restart & Run All.
rng = np.random.default_rng(0)
samples0, samples1, samples2 = (
    rng.choice(np.flatnonzero(y == label), size=min_val, replace=False)
    for label in (0, 1, 2)
)

In [29]:
# Sanity checks on the per-class index samples.
# 1) Every class must contribute the same number of indices.
assert len(samples1) == len(samples2) == len(samples0), f"Sample length are not equal, {len(samples1)} different from {len(samples2)} and {len(samples0)}."
# 2) An index belongs to exactly one class, so no index may appear in two
#    samples. `a != b` only compares the arrays position by position, so a
#    shared index sitting at different positions would go unnoticed;
#    `np.intersect1d` looks for any shared value. All three pairs are checked.
assert np.intersect1d(samples1, samples2).size == 0, "Some indices overlap but it should not be possible"
assert np.intersect1d(samples1, samples0).size == 0, "Some indices overlap but it should not be possible"
assert np.intersect1d(samples0, samples2).size == 0, "Some indices overlap but it should not be possible"

In [30]:
# Join the three per-class index arrays end to end (class 0, then 1, then 2)
# into a single 1-D index array, and show its shape.
samples = np.concatenate([samples0, samples1, samples2])
samples.shape

(300,)

We now want to resample all the data given the indices in `samples`

In [26]:
# Select the labels at the sampled indices and tally them per class;
# the counts are expected to be equal across classes.
y_balanced = y[samples]
Counter(y_balanced)

Counter({0: 62190, 1: 62190, 2: 62190})

In [27]:
# Select the rows of the data at the same sampled indices, so `data` stays
# aligned with `y_balanced`, then print how many rows were kept.
# NOTE(review): assumes `galaxies.data` is indexed as (sample, feature) — confirm.
data = galaxies.data[samples, :]
print(len(data))

186570
