# Balancing a Dataset with Downsampling

The dataset is for a binary classification task whose classes are imbalanced. To balance it, we downsample the majority class.

In [6]:
import pandas as pd
from sklearn.utils import resample

# Toy dataset: 13 rows of age and income, each labelled 'High' or 'Low'.
df = pd.DataFrame(
    data=dict(
        Age=[22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
        Income=[2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
        Class=[
            'High', 'Low', 'Low', 'High', 'High', 'Low', 'High',
            'High', 'Low', 'Low', 'High', 'High', 'Low',
        ],
    )
)

The dataset contains 7 `High` instances and 6 `Low` instances.

In [14]:
# Partition the rows by label so each class can be resampled independently.
df_high = df.loc[df['Class'].eq('High')]
df_low = df.loc[df['Class'].eq('Low')]

# Show both subsets, one after the other.
print(df_high, "\n", df_low)

    Age  Income Class
0    22    2000  High
3    28    3200  High
4    30    3500  High
6    40    4000  High
7    45    4200  High
10   60    5000  High
11   65    5500  High 
     Age  Income Class
1    25    2500   Low
2    27    2700   Low
5    35    3800   Low
8    50    4300   Low
9    55    4500   Low
12   70    6000   Low


In [8]:
# The aim of resampling is to balance the dataset.
# Downsampling acts on the majority class; upsampling acts on the minority class.
# Downsampling: reduce the number of majority-class instances.
# Upsampling: increase the number of minority-class instances.
# Upsampling duplicates existing minority rows until both classes have equal numbers of instances.


In [26]:
# Downsample: draw as many 'High' rows as there are 'Low' rows, without replacement.
df_high_downsampled = resample(
    df_high,
    n_samples=len(df_low),
    replace=False,
    random_state=42,
)

# Stack the reduced 'High' subset on top of the untouched 'Low' subset.
df_balanced = pd.concat([df_high_downsampled, df_low])
print(df_balanced['Class'].value_counts())

Class
High    6
Low     6
Name: count, dtype: int64


In [30]:
# Upsample: draw 'Low' rows with replacement until the count matches the 'High' subset.
df_low_upsampled = resample(
    df_low,
    n_samples=len(df_high),
    replace=True,
    random_state=42,
)

# Stack the enlarged 'Low' subset on top of the untouched 'High' subset.
df_balanced = pd.concat([df_low_upsampled, df_high])
print(df_balanced['Class'].value_counts())

Class
Low     7
High    7
Name: count, dtype: int64


In [31]:
import pandas as pd
from sklearn.utils import resample

# Same ages and incomes as before, relabelled as 'Majority' / 'Minority'.
df = pd.DataFrame.from_dict({
    'Age': [22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
    'Income': [2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
    'Class': [
        'Minority', 'Majority', 'Majority', 'Majority', 'Majority',
        'Minority', 'Minority', 'Minority',
        'Majority', 'Majority', 'Majority', 'Majority', 'Majority',
    ],
})

In [33]:
# Split the frame into one subset per class label.
df_majority=df[df['Class']=='Majority']

df_minority=df[df['Class']=='Minority']
# Print the subsets created in this cell (not df_high/df_low, which belong to the earlier High/Low example).
print(df_majority,"\n",df_minority)

    Age  Income     Class
1    25    2500  Majority
2    27    2700  Majority
3    28    3200  Majority
4    30    3500  Majority
8    50    4300  Majority
9    55    4500  Majority
10   60    5000  Majority
11   65    5500  Majority
12   70    6000  Majority 
    Age  Income     Class
0   22    2000  Minority
5   35    3800  Minority
6   40    4000  Minority
7   45    4200  Minority


In [34]:
# Downsample: keep only as many majority rows as there are minority rows (no replacement).
df_majority_downsampled = resample(
    df_majority,
    n_samples=len(df_minority),
    replace=False,
    random_state=42,
)

# Stack the reduced majority subset on top of the untouched minority subset.
df_balanced = pd.concat([df_majority_downsampled, df_minority])
print(df_balanced['Class'].value_counts())

Class
Majority    4
Minority    4
Name: count, dtype: int64


In [35]:
# Upsample: draw minority rows with replacement until the count matches the majority subset.
df_minority_upsampled = resample(
    df_minority,
    n_samples=len(df_majority),
    replace=True,
    random_state=42,
)

# Stack the enlarged minority subset on top of the untouched majority subset.
df_balanced = pd.concat([df_minority_upsampled, df_majority])
print(df_balanced['Class'].value_counts())

Class
Minority    9
Majority    9
Name: count, dtype: int64


# SMOTE 

**(synthetic minority oversampling technique)**

In [3]:
pip install imbalanced-learn==0.7.0

Note: you may need to restart the kernel to use updated packages.


1. Use SMOTE to generate synthetic samples instead of duplicating existing ones.

2. Convert the categorical class labels into numeric form so that SMOTE can work with them.

3. Apply SMOTE to balance the dataset.

4. Convert the numeric labels back to the original categorical labels.

5. Combine the resampled data into a final balanced dataset.


In [13]:
import pandas as pd
from imblearn.over_sampling import SMOTE
# Imbalanced toy dataset for the SMOTE example: 9 'Majority' rows and 4 'Minority' rows.
df = pd.DataFrame(
    data=dict(
        Age=[22, 25, 27, 28, 30, 35, 40, 45, 50, 55, 60, 65, 70],
        Income=[2000, 2500, 2700, 3200, 3500, 3800, 4000, 4200, 4300, 4500, 5000, 5500, 6000],
        Class=[
            'Minority', 'Majority', 'Majority', 'Majority', 'Majority',
            'Minority', 'Minority', 'Minority',
            'Majority', 'Majority', 'Majority', 'Majority', 'Majority',
        ],
    )
)

In [14]:

# Label <-> integer code mappings; the inverse is derived so the two can never drift apart.
label_to_code = {'Majority': 0, 'Minority': 1}
code_to_label = {code: label for label, code in label_to_code.items()}

# Encode the labels into a separate Series instead of overwriting df['Class'].
# Mutating df in place would make this cell non-idempotent: on a second run the
# already-encoded 0/1 values would map to NaN.
X = df[['Age', 'Income']]
y = df['Class'].map(label_to_code)

# Generate synthetic minority samples from each minority point's 3 nearest minority neighbours.
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=3)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Decode the integer codes back to the original categorical labels.
y_resampled = y_resampled.map(code_to_label)

# Re-assemble features and labels into one balanced frame.
df_balanced = pd.concat(
    [pd.DataFrame(X_resampled, columns=['Age', 'Income']), y_resampled.rename('Class')],
    axis=1,
)
print(df_balanced['Class'].value_counts())
print(df_balanced)

Class
Minority    9
Majority    9
Name: count, dtype: int64
    Age  Income     Class
0    22    2000  Minority
1    25    2500  Majority
2    27    2700  Majority
3    28    3200  Majority
4    30    3500  Majority
5    35    3800  Minority
6    40    4000  Minority
7    45    4200  Minority
8    50    4300  Majority
9    55    4500  Majority
10   60    5000  Majority
11   65    5500  Majority
12   70    6000  Majority
13   40    4031  Minority
14   35    3831  Minority
15   44    4176  Minority
16   35    3826  Minority
17   41    4040  Minority
