# Handling Imbalanced Datasets

## 1. UpSampling 

It refers to the technique where we increase the number of minority-class data points (here, by sampling them with replacement) until they match the majority class.


## 2. DownSampling

It refers to the technique where we decrease the number of majority-class data points (here, by sampling a subset without replacement) until they match the minority class.

In [5]:
import numpy as np
import pandas as pd

# Seed NumPy's global random generator so the synthetic data is reproducible
np.random.seed(123)

# Dataset size and class split: 90% of the rows go to class 0 (the majority),
# the remainder to class 1 (the minority)
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0  # int minus int, so no cast is needed

In [6]:
# Show the majority / minority class sizes derived from n_samples and class_0_ratio
n_class_0, n_class_1

(900, 100)

In [9]:
# Build the imbalanced dataset as one frame per class.
# Each frame has two features drawn from N(0, 1) plus a constant integer label.

# Majority class (label 0): n_class_0 rows
class_0 = pd.DataFrame(
    {col: np.random.normal(loc=0, scale=1, size=n_class_0) for col in ('feature_1', 'feature_2')}
).assign(target=0)

# Minority class (label 1): n_class_1 rows, drawn from the same N(0, 1)
# distribution as class 0, so only the label separates the two classes
class_1 = pd.DataFrame(
    {col: np.random.normal(loc=0, scale=1, size=n_class_1) for col in ('feature_1', 'feature_2')}
).assign(target=1)

In [11]:
# Inspect both per-class frames; a tuple of DataFrames is shown via their plain-text reprs
class_0, class_1

(     feature_1  feature_2  target
 0    -0.300232   0.667532       0
 1    -0.632261   0.100458       0
 2    -0.204317  -0.012610       0
 3     0.213696   0.219907       0
 4     1.033878   0.813623       0
 ..         ...        ...     ...
 895   0.356445  -0.486628       0
 896  -0.266376  -0.818430       0
 897   0.804753  -1.138029       0
 898  -1.750640   1.062592       0
 899   1.539891  -0.831040       0
 
 [900 rows x 3 columns],
     feature_1  feature_2  target
 0   -0.367419   1.300921       1
 1    0.575273  -0.930652       1
 2    0.439351  -0.851313       1
 3   -0.728152   0.136584       1
 4   -0.888530  -0.424254       1
 ..        ...        ...     ...
 95  -1.561866   2.540514       1
 96  -0.767819  -0.082706       1
 97   0.387223   0.444621       1
 98  -1.212918   1.896404       1
 99   2.018714   0.237581       1
 
 [100 rows x 3 columns])

In [19]:
# Stack the majority rows on top of the minority rows and renumber the rows 0..n-1
df = pd.concat([class_0, class_1], ignore_index=True)

In [21]:
# Display the combined frame with the notebook's rich renderer rather than print();
# long frames are truncated automatically to their first and last rows
df

     feature_1  feature_2  target
0    -0.300232   0.667532       0
1    -0.632261   0.100458       0
2    -0.204317  -0.012610       0
3     0.213696   0.219907       0
4     1.033878   0.813623       0
..         ...        ...     ...
995  -1.561866   2.540514       1
996  -0.767819  -0.082706       1
997   0.387223   0.444621       1
998  -1.212918   1.896404       1
999   2.018714   0.237581       1

[1000 rows x 3 columns]


In [22]:
# Last five rows of the combined frame (all minority-class rows, since class 1 was appended last)
df.tail(5)

Unnamed: 0,feature_1,feature_2,target
995,-1.561866,2.540514,1
996,-0.767819,-0.082706,1
997,0.387223,0.444621,1
998,-1.212918,1.896404,1
999,2.018714,0.237581,1


In [25]:
# Class distribution of the original data: number of rows per target label
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [27]:
# Upsampling, step 1: split the rows by class label
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [29]:
from sklearn.utils import resample
# Upsampling, step 2: draw minority rows WITH replacement until there are as
# many of them as there are majority rows; random_state pins the draw
df_minority_resample = resample(
    df_minority,
    replace=True,
    n_samples=df_majority.shape[0],
    random_state=42,
)

In [32]:
# Preview the upsampled minority rows; repeated index labels are expected because
# the rows were drawn with replace=True
df_minority_resample

Unnamed: 0,feature_1,feature_2,target
951,-0.831334,0.305451,1
992,-0.114295,0.954339,1
914,2.417832,1.790856,1
971,1.666319,-0.268754,1
960,-1.009337,-0.694636,1
...,...,...,...
952,-0.216024,-1.372905,1
965,0.165885,-1.094016,1
976,-0.078676,-1.519595,1
942,-0.481843,-0.687439,1


In [33]:
# Number of rows after upsampling (should equal the size of the majority class)
df_minority_resample.shape[0]

900

In [34]:
# First five upsampled rows; the index keeps the row labels from the original frame
df_minority_resample.head(5)

Unnamed: 0,feature_1,feature_2,target
951,-0.831334,0.305451,1
992,-0.114295,0.954339,1
914,2.417832,1.790856,1
971,1.666319,-0.268754,1
960,-1.009337,-0.694636,1


In [36]:
# (rows, columns) of the upsampled minority frame; the row count should equal len(df_majority)
df_minority_resample.shape

(900, 3)

In [38]:
# Upsampling, step 3: combine the untouched majority rows with the upsampled
# minority rows and renumber the rows 0..n-1 (note: this rebinds `df`)
df = pd.concat([df_majority, df_minority_resample], ignore_index=True)

In [39]:
# Preview the balanced frame: majority rows first, followed by the upsampled minority rows
df

Unnamed: 0,feature_1,feature_2,target
0,-0.300232,0.667532,0
1,-0.632261,0.100458,0
2,-0.204317,-0.012610,0
3,0.213696,0.219907,0
4,1.033878,0.813623,0
...,...,...,...
1795,-0.216024,-1.372905,1
1796,0.165885,-1.094016,1
1797,-0.078676,-1.519595,1
1798,-0.481843,-0.687439,1


In [40]:
# Verify that both classes have the same number of rows after upsampling
df['target'].value_counts()

0    900
1    900
Name: target, dtype: int64

In [41]:
import pandas as pd

# Re-seed NumPy's global generator so this second dataset is reproducible
np.random.seed(123)

# Class sizes: 1000 rows in total, 90% of them in the majority class (label 0)
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

# Majority class: both features drawn from N(0, 1)
class_0 = pd.DataFrame(
    {col: np.random.normal(loc=0, scale=1, size=n_class_0) for col in ('feature_1', 'feature_2')}
).assign(target=0)

# Minority class: both features drawn from N(2, 1), i.e. shifted away from class 0
class_1 = pd.DataFrame(
    {col: np.random.normal(loc=2, scale=1, size=n_class_1) for col in ('feature_1', 'feature_2')}
).assign(target=1)

# Majority rows first, then minority rows, with a fresh 0..n-1 index
df = pd.concat([class_0, class_1], ignore_index=True)

# Confirm the 900 / 100 class imbalance
print(df['target'].value_counts())

0    900
1    100
Name: target, dtype: int64


In [43]:
# Downsampling, step 1: split the rows by class label
df_minority = df.loc[df['target'].eq(1)]
df_majority = df.loc[df['target'].eq(0)]

In [44]:
from sklearn.utils import resample

In [45]:
# Downsampling, step 2: draw len(df_minority) rows from the MAJORITY class
# WITHOUT replacement so both classes end up the same size; random_state pins the draw
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)
# Backward-compatible alias: the following cells refer to the downsampled
# majority rows under this (misleading) name
df_minority_resample = df_majority_downsampled

In [46]:
# NOTE: despite its name, df_minority_resample holds rows drawn from df_majority here
# (the downsampled majority class); its row count should equal len(df_minority)
df_minority_resample.shape

(100, 3)

In [47]:
# Downsampling, step 3: combine the minority rows with the downsampled majority
# rows (held in df_minority_resample). drop=True discards the old row labels
# instead of inserting them as an extra 'index' column, matching the upsampling cell.
df = pd.concat([df_minority, df_minority_resample]).reset_index(drop=True)

In [48]:
# Verify that both classes have the same number of rows after downsampling
df['target'].value_counts()

1    100
0    100
Name: target, dtype: int64