In [9]:
import pandas as pd
import numpy as np

# Step 1: Build a synthetic, deliberately imbalanced dataset
num_classes = 130
# One random class size per class, drawn from [1, 2000) so no class is empty
samples_per_class = [np.random.randint(1, 2000) for _ in range(num_classes)]

# Column name -> growing list of values (one fresh list per column)
data = {column: [] for column in ('class_column', 'feature_1', 'feature_2')}

# Append each class's rows: a repeated label plus two uniform features in [0, 100)
for class_id, num_samples in enumerate(samples_per_class):
    data['class_column'].extend(f'class_{class_id}' for _ in range(num_samples))
    # feature_1 is drawn before feature_2 for every class
    for feature in ('feature_1', 'feature_2'):
        data[feature].extend(np.random.rand(num_samples) * 100)

# Assemble the DataFrame and show it
df = pd.DataFrame(data)
print(df)
# Step 2: Filtering parameters (min_samples / max_samples) are defined inside filter_and_sample below


# Step 3: Function to filter and sample each class
def filter_and_sample(group, min_samples=50, max_samples=1000, random_state=42):
    """Drop under-represented classes and cap over-represented ones.

    Args:
        group: DataFrame holding the rows of a single class.
        min_samples: Groups with fewer rows than this are rejected.
        max_samples: Groups with more rows than this are randomly
            down-sampled (without replacement) to exactly this many rows.
        random_state: Seed passed to ``DataFrame.sample`` so the
            down-sampling is reproducible.

    Returns:
        A zero-row frame with the same columns and dtypes as ``group`` when
        the group is too small, a ``max_samples``-row random subset when it
        is too large, and ``group`` itself otherwise.
    """
    size = len(group)
    if size < min_samples:
        # Keep the schema (columns/dtypes) on the empty result: a bare
        # pd.DataFrame() would make the combined output lose its columns
        # whenever every class is rejected.
        return group.iloc[:0]
    if size > max_samples:
        return group.sample(max_samples, random_state=random_state)
    return group

# Step 4: Filter/sample every class and stitch the surviving rows back together.
# The groups are iterated explicitly instead of going through GroupBy.apply:
# iteration always yields frames that still contain 'class_column', whereas
# apply's handling of the grouping column is deprecated in recent pandas.
class_frames = [
    filter_and_sample(class_rows)
    for _, class_rows in df.groupby('class_column')
]
# Rejected classes come back empty; leave them out of the concatenation so
# they cannot influence the resulting columns or dtypes.
kept_frames = [frame for frame in class_frames if not frame.empty]
if kept_frames:
    filtered_data = pd.concat(kept_frames, ignore_index=True)
else:
    # Nothing survived: keep the original schema with zero rows so the
    # column lookups below still work.
    filtered_data = df.iloc[:0].reset_index(drop=True)


# Display the results
print("Original dataset size:", df.shape)
print(df['class_column'].value_counts())  # Check the distribution
print("Filtered dataset size:", filtered_data.shape)
print(filtered_data['class_column'].value_counts())  # Check the distribution


       class_column  feature_1  feature_2
0           class_0  54.435609  99.697240
1           class_0  14.672457  90.604787
2           class_0  20.819897  82.111381
3           class_0   5.781143  71.274732
4           class_0  98.085144  65.497024
...             ...        ...        ...
140210    class_129  21.554358  54.233376
140211    class_129  50.188460  68.974345
140212    class_129  66.586865  73.887394
140213    class_129  69.015632  65.688458
140214    class_129  56.762662  96.015781

[140215 rows x 3 columns]
Original dataset size: (140215, 3)
class_95     1992
class_42     1984
class_25     1981
class_116    1968
class_8      1968
             ... 
class_34       86
class_128      60
class_90       33
class_43       16
class_114       6
Name: class_column, Length: 130, dtype: int64
Filtered dataset size: (102314, 3)
class_0      1000
class_5      1000
class_62     1000
class_61     1000
class_59     1000
             ... 
class_70      130
class_78      102
class_58   