## Handling Imbalanced Dataset

1. Up Sampling

2. Down Sampling

### 1. UpSampling

In [216]:
import numpy as np
import pandas as pd

# Set the random seed for reproducibility
np.random.seed(123)

# Create a dataframe with two classes
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

In [217]:
n_class_0, n_class_1

(900, 100)

In [218]:
## CREATE MY DATAFRAME WITH IMBALANCED DATASET
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0] * n_class_0
})

class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),
    'target': [1] * n_class_1
})

In [219]:
df = pd.concat([class_0,class_1]).reset_index(drop=True)

In [220]:
df

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
995,1.376371,2.845701,1
996,2.239810,0.880077,1
997,1.131760,1.640703,1
998,2.902006,0.390305,1


In [221]:
df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

## OBSERVATIONS:

1. The above 'Target' dataset is an imbalanced dataset as it has 900 values of 0 category and 100 values of 1 category.

In [222]:
### Performing the Upsampling in the minority class

df_target_0 = df[df['target']==0]

df_target_1 = df[df['target']==1]

In [223]:
df_target_0

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
895,0.238761,-0.003155,0
896,-1.106386,-0.430660,0
897,0.366732,-0.146416,0
898,1.023906,1.160176,0


In [224]:
df_target_1

Unnamed: 0,feature_1,feature_2,target
900,1.699768,2.139033,1
901,1.367739,2.025577,1
902,1.795683,1.803557,1
903,2.213696,3.312255,1
904,3.033878,3.187417,1
...,...,...,...
995,1.376371,2.845701,1
996,2.239810,0.880077,1
997,1.131760,1.640703,1
998,2.902006,0.390305,1


In [225]:
### Perform the Upsampling on the minority class to make it equal to the majority class

from sklearn.utils import resample

df_target_1_new = resample(
    df_target_1                     ,
    replace = True                  ,
    n_samples =  len(df_target_0)   ,
    random_state=42
)

In [226]:
df_target_0

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
895,0.238761,-0.003155,0
896,-1.106386,-0.430660,0
897,0.366732,-0.146416,0
898,1.023906,1.160176,0


In [227]:
df_target_1

Unnamed: 0,feature_1,feature_2,target
900,1.699768,2.139033,1
901,1.367739,2.025577,1
902,1.795683,1.803557,1
903,2.213696,3.312255,1
904,3.033878,3.187417,1
...,...,...,...
995,1.376371,2.845701,1
996,2.239810,0.880077,1
997,1.131760,1.640703,1
998,2.902006,0.390305,1


In [228]:
df_target_1_new

Unnamed: 0,feature_1,feature_2,target
951,1.125854,1.843917,1
992,2.196570,1.397425,1
914,1.932170,2.998053,1
971,2.272825,3.034197,1
960,2.870056,1.550485,1
...,...,...,...
952,1.188902,2.189189,1
965,3.919526,1.980541,1
976,2.810326,3.604614,1
942,3.621531,2.168229,1


In [229]:
df_new = pd.concat([df_target_0,df_target_1_new])

In [230]:
df_new

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
952,1.188902,2.189189,1
965,3.919526,1.980541,1
976,2.810326,3.604614,1
942,3.621531,2.168229,1


In [231]:
### To get the total counts in the target columns

df['target'].value_counts()

target
0    900
1    100
Name: count, dtype: int64

In [232]:
### To get the total counts in the target columns

df_new['target'].value_counts()

target
0    900
1    900
Name: count, dtype: int64

### OBSERVATIONS:

1.  After performing Upsampling technique on the imbalanced dataset, we can see that the target 1 earlier which has 100 samples has been converted into 900 samples that is equal to the majority class.

2. So the total number of samples in the majority and in the minority class is both 900 .