# Upsampling




Upsampling involves increasing the number of instances in the minority class.
In an imbalanced dataset, the minority class is the target value that appears least often, so you add more rows with that target value until its count is closer to (or equal to) that of the majority class.

For example, because we have fewer instances where the target is 1, we will create additional samples by randomly selecting existing samples from the minority class (with replacement) and duplicating them.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Fix NumPy's global random seed so every run draws the same values.
np.random.seed(123)

# Sizes for a two-class dataset: class 0 takes `class_0_ratio` of the rows
# and class 1 takes whatever is left.
n_samples, class_0_ratio = 1000, 0.9
n_class_0 = int(class_0_ratio * n_samples)
n_class_1 = n_samples - n_class_0

In [None]:
# Display the two class sizes as a (class 0, class 1) tuple.
n_class_0,n_class_1

(900, 100)

In [None]:
## CREATE MY DATAFRAME WITH IMBALANCED DATASET
def _make_class_frame(center, label, count):
    """Return `count` rows whose two features are drawn from N(center, 1).

    Every row gets the constant `label` in the 'target' column.
    """
    # feature_1 is drawn before feature_2 so the seeded random stream is
    # consumed in the same order as the columns are listed.
    first = np.random.normal(loc=center, scale=1, size=count)
    second = np.random.normal(loc=center, scale=1, size=count)
    return pd.DataFrame({
        'feature_1': first,
        'feature_2': second,
        'target': [label] * count,
    })


# Class 0 is centred at 0 and class 1 at 2; class 0 is built first.
class_0 = _make_class_frame(center=0, label=0, count=n_class_0)
class_1 = _make_class_frame(center=2, label=1, count=n_class_1)

In [None]:
# Stack class 0 on top of class 1 and give the result a fresh 0..n-1 index.
df = pd.concat([class_0, class_1], ignore_index=True)

In [None]:
# Preview the first rows of the combined dataframe (all class 0 here).
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [None]:
# Preview the last rows of the combined dataframe (all class 1 here).
df.tail()

Unnamed: 0,feature_1,feature_2,target
995,1.376371,2.845701,1
996,2.23981,0.880077,1
997,1.13176,1.640703,1
998,2.902006,0.390305,1
999,2.69749,2.01357,1


In [None]:
# Count rows per target value to confirm the 900/100 imbalance.
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [None]:
## upsampling
# Split the rows by label: target 1 is the minority class, target 0 the majority.
df_minority = df[df['target'].eq(1)]
df_majority = df[df['target'].eq(0)]

In [None]:
# Preview the first rows of the majority-class (target == 0) subset.
df_majority.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [None]:
# Preview the first rows of the minority-class (target == 1) subset.
df_minority.head()

Unnamed: 0,feature_1,feature_2,target
900,1.699768,2.139033,1
901,1.367739,2.025577,1
902,1.795683,1.803557,1
903,2.213696,3.312255,1
904,3.033878,3.187417,1


In [None]:
from sklearn.utils import resample

In [None]:
# Upsample the minority class: draw rows with replacement until it has as
# many rows as the majority class.
df_minority_upsample = resample(
    df_minority,
    n_samples=len(df_majority),  # match the majority class size
    replace=True,                # sample with replacement
    random_state=42,             # fixed seed for a reproducible draw
)

In [None]:
# Check the (rows, columns) shape of the upsampled minority class.
df_minority_upsample.shape

(900, 3)

In [None]:
# Confirm that every upsampled row still belongs to the minority class.
df_minority_upsample['target'].value_counts()

1    900
Name: target, dtype: int64

In [None]:
# Stack the majority rows with the upsampled minority rows. The minority rows
# were drawn with replacement, so their original index labels repeat;
# ignore_index=True assigns a fresh 0..n-1 index so that label-based lookups
# on df_upsample never hit duplicate labels.
df_upsample=pd.concat([df_majority,df_minority_upsample],ignore_index=True)

In [None]:
# Count rows per target value in the combined frame to verify the classes are balanced.
df_upsample['target'].value_counts()

0    900
1    900
Name: target, dtype: int64

# Downsampling



Downsampling involves reducing the number of instances in the majority class.
In an imbalanced dataset, the majority class is the target value that appears most often, so you keep only a subset of the rows with that target value until its count is closer to (or equal to) that of the minority class.

For example, because we have more instances where the target is 0, we will randomly select a subset of samples from the majority class (without replacement) to match the number of samples in the minority class, effectively reducing the dataset size.

In [None]:
# numpy is imported here as well so this section can be run on its own;
# the code below calls np.random.* directly.
import numpy as np
import pandas as pd

# Set the random seed for reproducibility
np.random.seed(123)

# Create a dataframe with two classes: class 0 gets `class_0_ratio` of the
# rows and class 1 gets the remainder.
n_samples = 1000
class_0_ratio = 0.9
n_class_0 = int(n_samples * class_0_ratio)
n_class_1 = n_samples - n_class_0

# Majority class (target 0): both features drawn from N(0, 1).
class_0 = pd.DataFrame({
    'feature_1': np.random.normal(loc=0, scale=1, size=n_class_0),
    'feature_2': np.random.normal(loc=0, scale=1, size=n_class_0),
    'target': [0] * n_class_0
})

# Minority class (target 1): both features drawn from N(2, 1).
class_1 = pd.DataFrame({
    'feature_1': np.random.normal(loc=2, scale=1, size=n_class_1),
    'feature_2': np.random.normal(loc=2, scale=1, size=n_class_1),
    'target': [1] * n_class_1
})

# Stack the two classes and replace the per-class indexes with one 0..n-1 index.
df = pd.concat([class_0, class_1]).reset_index(drop=True)

# Check the class distribution
print(df['target'].value_counts())

0    900
1    100
Name: target, dtype: int64


In [None]:
## downsampling
# Split the rows by label: target 1 is the minority class, target 0 the majority.
df_minority = df.loc[df['target'] == 1]
df_majority = df.loc[df['target'] == 0]

In [None]:
from sklearn.utils import resample
# Downsample the majority class: randomly keep only as many majority rows
# as there are minority rows.
df_majority_downsampled=resample(df_majority,replace=False, #Sample without replacement
         n_samples=len(df_minority), #to match the minority class
         random_state=42 #fixed seed for a reproducible subset
        )

In [None]:
# Check the (rows, columns) shape of the downsampled majority class.
df_majority_downsampled.shape

(100, 3)

In [None]:
# Stack the minority rows on top of the downsampled majority rows.
df_downsampled = pd.concat([df_minority, df_majority_downsampled], axis=0)

In [None]:
# Count rows per target value in the combined frame to verify the classes are balanced.
df_downsampled['target'].value_counts()

1    100
0    100
Name: target, dtype: int64