## HANDLING IMBALANCED DATASET
### 1. Up Sampling
### 2. Down Sampling

In [33]:
import pandas as pd
import numpy as np

In [34]:
# Pin NumPy's global random state so every run of the notebook draws the same
# synthetic samples.
RANDOM_SEED = 123
np.random.seed(RANDOM_SEED)

In [35]:
# Total number of samples (rows) in the synthetic dataset.
n_samples = 1000
# Fraction of the samples assigned to class 0 (the majority class).
class_0_ratio = 0.9
# Round before converting to int: a float product can land just below the
# intended whole number (e.g. 100 * 0.29 -> 28.999...), and a bare int() would
# truncate it and silently move one sample from class 0 to class 1.
n_class_0 = int(round(n_samples * class_0_ratio))
# Class 1 takes whatever is left, so the two counts always sum to n_samples.
n_class_1 = n_samples - n_class_0

In [36]:
# Show the two class sizes as a (class 0, class 1) tuple to confirm the split.
n_class_0, n_class_1

(900, 100)

In [37]:
# Build one frame per class so the class sizes (n_class_0 vs n_class_1) set the
# imbalance. The draws are made in the order: class 0 feature 1, class 0
# feature 2, class 1 feature 1, class 1 feature 2 -- with a seeded global RNG
# this order determines the generated values, so it must not be shuffled.

# Class 0: both features are normal with mean 0 and standard deviation 1.
class_0_feature_1 = np.random.normal(loc=0, scale=1, size=n_class_0)
class_0_feature_2 = np.random.normal(loc=0, scale=1, size=n_class_0)
class_0_df = pd.DataFrame(
    {
        'feature 1': class_0_feature_1,
        'feature 2': class_0_feature_2,
        'target': [0] * n_class_0,
    }
)

# Class 1: same standard deviation, but both feature means are shifted to 1.
class_1_feature_1 = np.random.normal(loc=1, scale=1, size=n_class_1)
class_1_feature_2 = np.random.normal(loc=1, scale=1, size=n_class_1)
class_1_df = pd.DataFrame(
    {
        'feature 1': class_1_feature_1,
        'feature 2': class_1_feature_2,
        'target': [1] * n_class_1,
    }
)

In [38]:
# Stack the class 0 rows on top of the class 1 rows and renumber the index
# from 0 so the combined frame has no duplicate labels.
df = pd.concat([class_0_df, class_1_df], ignore_index=True)

In [39]:
# Inspect the last five rows; class_1_df was concatenated last, so these are
# class 1 samples.
df.tail()

Unnamed: 0,feature 1,feature 2,target
995,0.376371,1.845701,1
996,1.23981,-0.119923,1
997,0.13176,0.640703,1
998,1.902006,-0.609695,1
999,1.69749,1.01357,1


In [40]:
# Count the rows per class label to confirm the dataset is imbalanced.
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [41]:
# Split the rows by label: target == 1 is the minority class and
# target == 0 is the majority class.
target_labels = df['target']
df_minority = df.loc[target_labels.eq(1)]
df_majority = df.loc[target_labels.eq(0)]

In [42]:
# Display the minority-class rows (target == 1).
df_minority

Unnamed: 0,feature 1,feature 2,target
900,0.699768,1.139033,1
901,0.367739,1.025577,1
902,0.795683,0.803557,1
903,1.213696,2.312255,1
904,2.033878,2.187417,1
...,...,...,...
995,0.376371,1.845701,1
996,1.239810,-0.119923,1
997,0.131760,0.640703,1
998,1.902006,-0.609695,1


In [43]:
# Display the majority-class rows (target == 0).
df_majority

Unnamed: 0,feature 1,feature 2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.252750,0
4,-0.578600,-0.292004,0
...,...,...,...
895,0.238761,-0.003155,0
896,-1.106386,-0.430660,0
897,0.366732,-0.146416,0
898,1.023906,1.160176,0


### Up Sampling

Resample the minority class with replacement until it has as many rows as the majority class.


In [45]:
from sklearn.utils import resample
# Draw minority rows with replacement until there are as many of them as there
# are majority rows; random_state fixes which rows get repeated.
n_majority_rows = df_majority.shape[0]
df_minority_upsampled = resample(
    df_minority,
    replace=True,
    n_samples=n_majority_rows,
    random_state=42,
)



In [46]:
# Check the size of the upsampled minority frame; its row count should now
# equal len(df_majority).
df_minority_upsampled.shape

(900, 3)

In [51]:
# Combine the untouched majority rows with the upsampled minority rows and
# renumber the index from 0 (resampling with replacement repeats index labels).
df_upsampled = pd.concat([df_majority, df_minority_upsampled], ignore_index=True)

In [52]:
# The balanced frame should hold twice len(df_majority) rows: the majority
# rows plus an equally sized upsampled minority set.
df_upsampled.shape

(1800, 3)

In [53]:
# Peek at the last five rows; the upsampled minority rows were concatenated
# last, so these are class 1 samples.
df_upsampled.tail()

Unnamed: 0,feature 1,feature 2,target
1795,0.188902,1.189189,1
1796,2.919526,0.980541,1
1797,1.810326,2.604614,1
1798,2.621531,1.168229,1
1799,0.160887,1.489203,1


### Down Sampling

Resample the majority class without replacement down to the number of rows in the minority class.

In [54]:
# Keep a random subset of the majority rows, drawn without replacement, that is
# the same size as the minority class; random_state fixes which rows are kept.
n_minority_rows = df_minority.shape[0]
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=n_minority_rows,
    random_state=42,
)

In [55]:
# Check the size of the downsampled majority frame; its row count should now
# equal len(df_minority).
df_majority_downsampled.shape

(100, 3)

In [56]:
# Combine the downsampled majority rows with all minority rows and renumber
# the index from 0.
df_downsampled = pd.concat([df_majority_downsampled, df_minority], ignore_index=True)

In [57]:
# The balanced frame should hold twice len(df_minority) rows: the minority
# rows plus an equally sized majority subset.
df_downsampled.shape

(200, 3)

In [58]:
# Peek at the last five rows; df_minority was concatenated last, so these are
# the original class 1 samples.
df_downsampled.tail()

Unnamed: 0,feature 1,feature 2,target
195,0.376371,1.845701,1
196,1.23981,-0.119923,1
197,0.13176,0.640703,1
198,1.902006,-0.609695,1
199,1.69749,1.01357,1
