In [1]:
import numpy as np
import pandas as pd

In [2]:
# Set the random seed for reproducibility
np.random.seed(123)

# Create a dataframe with two classes
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio) # 900 data points of class_0
n_class_1 = n_samples - n_class_0 # 100 data points

In [3]:
n_class_0 , n_class_1

(900, 100)

In [8]:
## CREATE MY DATAFRAME WITH IMBALANCED DATASET
# loc is mean and scale is basically std deviation here
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0] * n_class_0
})
# 0 o/p feature 900 banege

class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),
    'target': [1] * n_class_1
})
# 1 o/p feature 100 banege

In [11]:
class_0

Unnamed: 0,feature_1,feature_2,target
0,-0.367419,-0.016102,0
1,0.575273,0.747010,0
2,0.439351,-0.257842,0
3,-0.728152,0.561622,0
4,-0.888530,0.295353,0
...,...,...,...
895,0.719991,0.787335,0
896,-1.159531,0.732800,0
897,0.609591,-1.995079,0
898,-1.919582,-0.254572,0


In [12]:
class_1

Unnamed: 0,feature_1,feature_2,target
0,0.723566,2.301225,1
1,2.054508,1.935051,1
2,1.364961,1.393597,1
3,1.509824,1.771339,1
4,1.566104,3.776112,1
...,...,...,...
95,2.300311,3.140850,1
96,2.777139,1.428622,1
97,2.589859,2.675585,1
98,0.013286,2.244925,1


In [13]:
pd.concat([class_0 , class_1]).reset_index(drop=True)

Unnamed: 0,feature_1,feature_2,target
0,-0.367419,-0.016102,0
1,0.575273,0.747010,0
2,0.439351,-0.257842,0
3,-0.728152,0.561622,0
4,-0.888530,0.295353,0
...,...,...,...
995,2.300311,3.140850,1
996,2.777139,1.428622,1
997,2.589859,2.675585,1
998,0.013286,2.244925,1


In [14]:
df = pd.concat([class_0 , class_1]).reset_index(drop=True)

In [15]:
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-0.367419,-0.016102,0
1,0.575273,0.74701,0
2,0.439351,-0.257842,0
3,-0.728152,0.561622,0
4,-0.88853,0.295353,0


In [16]:
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [17]:
# 1. Upsample converting 100pts -> 900 pts

In [21]:
df_minority = df[df['target']==1]
df_majority = df[df['target']==0]

In [22]:
df_minority

Unnamed: 0,feature_1,feature_2,target
900,0.723566,2.301225,1
901,2.054508,1.935051,1
902,1.364961,1.393597,1
903,1.509824,1.771339,1
904,1.566104,3.776112,1
...,...,...,...
995,2.300311,3.140850,1
996,2.777139,1.428622,1
997,2.589859,2.675585,1
998,0.013286,2.244925,1


In [24]:
from sklearn.utils import resample 

In [26]:
df_minority_upsample = resample(df_minority , n_samples=len(df_majority) , replace=True , random_state=42)

In [27]:
df_minority_upsample

Unnamed: 0,feature_1,feature_2,target
951,2.998884,1.803756,1
992,2.361990,0.739604,1
914,0.932880,0.904999,1
971,2.409614,3.507181,1
960,1.874774,3.425993,1
...,...,...,...
952,2.142778,4.026777,1
965,1.431038,2.383004,1
976,2.015005,2.370883,1
942,2.444941,1.297736,1


In [29]:
df_minority_upsample.shape

(900, 3)

In [31]:
df_upsampled = pd.concat([df_majority , df_minority_upsample])

In [32]:
df_upsampled

Unnamed: 0,feature_1,feature_2,target
0,-0.367419,-0.016102,0
1,0.575273,0.747010,0
2,0.439351,-0.257842,0
3,-0.728152,0.561622,0
4,-0.888530,0.295353,0
...,...,...,...
952,2.142778,4.026777,1
965,1.431038,2.383004,1
976,2.015005,2.370883,1
942,2.444941,1.297736,1


In [33]:
df_upsampled['target'].value_counts()

0    900
1    900
Name: target, dtype: int64

In [34]:
# 2. Downsample majority 900->100

In [35]:
df_minority = df[df['target']==1]
df_majority = df[df['target']==0]

In [36]:
df_majority

Unnamed: 0,feature_1,feature_2,target
0,-0.367419,-0.016102,0
1,0.575273,0.747010,0
2,0.439351,-0.257842,0
3,-0.728152,0.561622,0
4,-0.888530,0.295353,0
...,...,...,...
895,0.719991,0.787335,0
896,-1.159531,0.732800,0
897,0.609591,-1.995079,0
898,-1.919582,-0.254572,0


In [40]:
df_majority_downsample = resample(df_majority , n_samples = len(df_minority) , random_state = 42 , replace = True)

In [41]:
df_majority_downsample

Unnamed: 0,feature_1,feature_2,target
102,-0.851313,-0.322697,0
435,-0.931511,-0.102178,0
860,-0.179744,0.076603,0
270,0.267059,0.925318,0
106,0.680642,0.188076,0
...,...,...,...
201,-0.448991,1.650653,0
269,1.154889,0.794182,0
862,1.630844,0.341856,0
815,2.202473,0.123941,0


In [42]:
df_downsampled = pd.concat([df_minority , df_majority_downsample])

In [43]:
df_downsampled

Unnamed: 0,feature_1,feature_2,target
900,0.723566,2.301225,1
901,2.054508,1.935051,1
902,1.364961,1.393597,1
903,1.509824,1.771339,1
904,1.566104,3.776112,1
...,...,...,...
201,-0.448991,1.650653,0
269,1.154889,0.794182,0
862,1.630844,0.341856,0
815,2.202473,0.123941,0


In [44]:
df_downsampled['target'].value_counts()

1    100
0    100
Name: target, dtype: int64