In [1]:
from sklearn.utils import resample
from sklearn.datasets import make_classification
import pandas as pd

# Synthetic binary classification problem: 100 samples, 4 features, with
# class weights chosen so that class 0 dominates class 1 (80% / 20%).
X, y = make_classification(
    n_samples=100,
    n_features=4,
    n_classes=2,
    weights=[0.8, 0.2],
    random_state=42,
)

# Wrap the feature matrix in a DataFrame and attach the class labels as the
# 'balance' column, which the following cells use to split the classes.
df = pd.DataFrame(
    X,
    columns=['feature_1', 'feature_2', 'feature_3', 'feature_4'],
).assign(balance=y)
print(df)

    feature_1  feature_2  feature_3  feature_4  balance
0   -1.053839  -1.027544  -0.329294   0.826007        1
1    1.569317   1.306542  -0.239385  -0.331376        0
2   -0.658926  -0.357633   0.723682  -0.628277        0
3   -0.136856   0.460938   1.896911  -2.281386        0
4   -0.048629   0.502301   1.778730  -2.171053        0
..        ...        ...        ...        ...      ...
95  -2.241820  -1.248690   2.357902  -2.009185        0
96   0.573042   0.362054  -0.462814   0.341294        1
97  -0.375121  -0.149518   0.588465  -0.575002        0
98   1.042518   1.058239   0.461945  -0.984846        0
99  -0.121203  -0.043997   0.204211  -0.203119        0

[100 rows x 5 columns]


In [2]:
# Split the frame by label: class 0 is the majority, class 1 the minority.
df_major = df[df.balance == 0]
df_minor = df[df.balance == 1]

# Upsample the minority class (sampling with replacement) until it has as many
# rows as the majority class. The target size is derived from the data rather
# than hard-coded, so the classes stay balanced even if the dataset size or
# class weights above are changed.
df_minor_sample = resample(
    df_minor,
    replace=True,
    n_samples=len(df_major),
    random_state=42,
)

# Recombine both classes. The resampled rows keep their original index
# labels, so the combined index contains repeated labels.
df_sample = pd.concat([df_major, df_minor_sample])
print(df_sample.balance.value_counts())



0    80
1    80
Name: balance, dtype: int64


In [3]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification
# Rebuild the same imbalanced toy dataset (identical parameters and seed) so
# this cell does not depend on the DataFrame manipulations above.
X, y = make_classification(
    n_samples=100,
    n_features=4,
    n_classes=2,
    weights=[0.8, 0.2],
    random_state=42,
)

# Gather the labels belonging to each class; the lengths of these lists are
# the per-class sample counts before any resampling is applied.
t = list(y[y == 0])
s = list(y[y == 1])
print('Before Over-Sampling: ')
print('Samples in class 0: ', len(t))
print('Samples in class 1: ', len(s))

Before Over-Sampling: 
Samples in class 0:  80
Samples in class 1:  20


In [4]:
# Random over-sampler with a fixed seed so the drawn duplicates are repeatable.
OverS = RandomOverSampler(random_state=42)

# fit_resample() takes the features and labels and returns resampled copies
# of both.
X_Over, Y_Over = OverS.fit_resample(X, y)

# Gather the labels of each class after resampling; the list lengths are the
# per-class sample counts.
t = list(Y_Over[Y_Over == 0])
s = list(Y_Over[Y_Over == 1])
print('After Over-Sampling: ')
print('Samples in class 0: ', len(t))
print('Samples in class 1: ', len(s))

After Over-Sampling: 
Samples in class 0:  80
Samples in class 1:  80


In [6]:
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification

# Rebuild the same imbalanced toy dataset (identical parameters and seed) so
# the under-sampling demo starts from the original 100 samples.
X, y = make_classification(
    n_samples=100,
    n_features=4,
    n_classes=2,
    weights=[0.8, 0.2],
    random_state=42,
)

# Per-class labels before resampling; list lengths are the class counts.
t = list(y[y == 0])
s = list(y[y == 1])
print('Before Under-Sampling: ')
print('Samples in class 0: ', len(t))
print('Samples in class 1: ', len(s))

# Random under-sampler with a fixed seed. replacement=True means the retained
# samples are drawn with replacement, so the same original row may be picked
# more than once.
UnderS = RandomUnderSampler(replacement=True, random_state=42)

# fit_resample() returns the reduced feature matrix and label vector.
X_Under, Y_Under = UnderS.fit_resample(X, y)

# Per-class labels after resampling; list lengths are the class counts.
t = list(Y_Under[Y_Under == 0])
s = list(Y_Under[Y_Under == 1])
print('After Under-Sampling: ')
print('Samples in class 0: ', len(t))
print('Samples in class 1: ', len(s))

Before Under-Sampling: 
Samples in class 0:  80
Samples in class 1:  20
After Under-Sampling: 
Samples in class 0:  20
Samples in class 1:  20
