# Handling Imbalanced Datasets

1. UpSampling

It refers to the technique where we increase the number of minority-class data points, for example by resampling them with replacement.

2. DownSampling

It refers to the technique where we try to decrease the majority data points.

In [1]:
import numpy as np
import pandas as pd

# Dataset size and class split: 90% of the rows go to the majority class (0),
# the remainder to the minority class (1).
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

# Fix NumPy's global seed so the random draws below are reproducible.
np.random.seed(123)

In [2]:

# Show the majority / minority sample counts computed above.
n_class_0, n_class_1

(900, 100)

In [3]:
# Build the imbalanced dataset as one frame per class, each with two
# Gaussian features. Both classes are drawn from the same N(0, 1)
# distribution here, so only the `target` label and the row count differ.
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
}).assign(target=0)

class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_1),
}).assign(target=1)

In [4]:
# Inspect the two per-class frames (shown together as a tuple).
class_0, class_1


(     feature_1  feature_2  target
 0    -1.085631   0.551302       0
 1     0.997345   0.419589       0
 2     0.282978   1.815652       0
 3    -1.506295  -0.252750       0
 4    -0.578600  -0.292004       0
 ..         ...        ...     ...
 895   0.238761  -0.003155       0
 896  -1.106386  -0.430660       0
 897   0.366732  -0.146416       0
 898   1.023906   1.160176       0
 899  -0.210056  -0.641512       0
 
 [900 rows x 3 columns],
     feature_1  feature_2  target
 0   -0.300232   0.139033       1
 1   -0.632261   0.025577       1
 2   -0.204317  -0.196443       1
 3    0.213696   1.312255       1
 4    1.033878   1.187417       1
 ..        ...        ...     ...
 95  -0.623629   0.845701       1
 96   0.239810  -1.119923       1
 97  -0.868240  -0.359297       1
 98   0.902006  -1.609695       1
 99   0.697490   0.013570       1
 
 [100 rows x 3 columns])

In [5]:
# Stack the majority and minority rows into a single frame with a fresh
# 0..n-1 index.
df = pd.concat([class_0, class_1], ignore_index=True)

In [6]:
# Print the combined frame as plain text (pandas elides the middle rows).
print(df)

     feature_1  feature_2  target
0    -1.085631   0.551302       0
1     0.997345   0.419589       0
2     0.282978   1.815652       0
3    -1.506295  -0.252750       0
4    -0.578600  -0.292004       0
..         ...        ...     ...
995  -0.623629   0.845701       1
996   0.239810  -1.119923       1
997  -0.868240  -0.359297       1
998   0.902006  -1.609695       1
999   0.697490   0.013570       1

[1000 rows x 3 columns]


In [7]:
# Last five rows of the dataframe; these all belong to the minority class (target == 1).
df.tail()

Unnamed: 0,feature_1,feature_2,target
995,-0.623629,0.845701,1
996,0.23981,-1.119923,1
997,-0.86824,-0.359297,1
998,0.902006,-1.609695,1
999,0.69749,0.01357,1


In [8]:
# Count rows per class to confirm the 900 / 100 imbalance.
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [9]:
# Upsampling, step 1: split the frame by class so the minority rows can be
# resampled independently of the majority rows.
df_majority = df.loc[df['target'].eq(0)]
df_minority = df.loc[df['target'].eq(1)]

In [10]:
from sklearn.utils import resample
# Upsampling, step 2: draw minority rows *with replacement* until there are
# as many of them as majority rows. random_state pins the draw.
df_minority_resample = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)

In [11]:
# Inspect the upsampled minority rows; the original row labels are kept, so repeats are possible.
df_minority_resample

Unnamed: 0,feature_1,feature_2,target
951,-0.874146,-0.156083,1
992,0.196570,-0.602575,1
914,-0.067830,0.998053,1
971,0.272825,1.034197,1
960,0.870056,-0.449515,1
...,...,...,...
952,-0.811098,0.189189,1
965,1.919526,-0.019459,1
976,0.810326,1.604614,1
942,1.621531,0.168229,1


In [12]:
# Upsampling, step 3: combine the untouched majority rows with the upsampled
# minority rows and renumber the index from 0.
# NOTE: this rebinds `df`, so re-running the class-split cell afterwards
# would split the already balanced frame rather than the original one.
df = pd.concat([df_majority, df_minority_resample], ignore_index=True)

In [13]:
# Inspect the combined frame after upsampling.
df

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
1795,-0.811098,0.189189,1
1796,1.919526,-0.019459,1
1797,0.810326,1.604614,1
1798,1.621531,0.168229,1


In [14]:
# Confirm that both classes now have the same number of rows.
df['target'].value_counts()

1    900
0    900
Name: target, dtype: int64

In [15]:
import pandas as pd

# Rebuild the synthetic dataset from scratch for the down-sampling demo.
# Re-seeding makes the draws independent of whatever ran before this cell.
np.random.seed(123)

# 1000 rows in total, 90% of them in the majority class (0).
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

# Majority class: both features centred on 0.
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
}).assign(target=0)

# Minority class: both features centred on 2, so the two classes are
# drawn from different distributions in this version of the data.
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),
}).assign(target=1)

# Stack both classes and renumber the rows 0..n_samples-1.
df = pd.concat([class_0, class_1], ignore_index=True)

# Check the class distribution
print(df['target'].value_counts())

0    900
1    100
Name: target, dtype: int64


In [16]:
# Downsampling, step 1: split the frame by class so the majority rows can be
# sub-sampled independently of the minority rows.
df_majority = df.loc[df['target'].eq(0)]
df_minority = df.loc[df['target'].eq(1)]

In [17]:
from sklearn.utils import resample

In [18]:
# Downsampling, step 2: draw majority rows *without replacement* until only
# as many remain as there are minority rows.
# NOTE(review): despite its name, `df_minority_resample` holds down-sampled
# MAJORITY rows here; the name is kept because later cells refer to it.
df_minority_resample = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)

In [19]:
# Shape of the down-sampled majority frame; its row count should equal len(df_minority).
df_minority_resample.shape

(100, 3)

In [20]:
# Downsampling, step 3: combine all minority rows with the down-sampled
# majority rows (`df_minority_resample` holds majority rows at this point,
# despite its name).
# drop=True discards the old row labels, matching every other concat in this
# notebook. With drop=False they would be inserted as an extra `index` column
# and end up in the data as a spurious feature.
df = pd.concat([df_minority, df_minority_resample]).reset_index(drop=True)

In [21]:
# Confirm that both classes are balanced after down-sampling.
df['target'].value_counts()

1    100
0    100
Name: target, dtype: int64