In [14]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global random generator so the np.random draws in later cells are repeatable.
np.random.seed(123)

In [11]:
# Total number of rows to generate, and the share assigned to class 0 (the majority).
n_samples = 1000
class_0_ratio = 0.9

# round() rather than int(): int() truncates, so a product such as 100 * 0.29
# (28.999999999999996 in floating point) would silently lose a row.
n_class_0 = round(n_samples * class_0_ratio)
# The remainder goes to class 1, so the two sizes always sum to n_samples.
n_class_1 = n_samples - n_class_0

In [12]:
# Display both class sizes as a tuple to check the split.
n_class_0,n_class_1

(900, 100)

In [15]:
## CREATE MY DATAFRAME WITH IMBALANCED DATASET
def _make_class_frame(center, n_rows, label):
    """Build n_rows samples with two unit-scale normal features centred on `center`, all labelled `label`."""
    return pd.DataFrame({
        # feature_1 is drawn before feature_2, matching dict-literal evaluation order
        'feature_1': np.random.normal(loc=center, scale=1, size=n_rows),
        'feature_2': np.random.normal(loc=center, scale=1, size=n_rows),
        'target': [label] * n_rows
    })


# Class 0: both features centred at 0, n_class_0 rows
class_0 = _make_class_frame(center=0, n_rows=n_class_0, label=0)

# Class 1: both features centred at 2, n_class_1 rows
class_1 = _make_class_frame(center=2, n_rows=n_class_1, label=1)

In [16]:
# Stack class 0 on top of class 1 and renumber the rows 0..n-1.
df = pd.concat([class_0, class_1], ignore_index=True)

In [19]:
# Preview the first rows of the combined frame.
df.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [22]:
# Count rows per class label to confirm the imbalance.
df['target'].value_counts()

0    900
1    100
Name: target, dtype: int64

In [36]:
##Upsampling
# Partition the rows by label: target 0 is the majority class, target 1 the minority class.
df_majority = df.loc[df['target'].eq(0)]
df_minority = df.loc[df['target'].eq(1)]

In [37]:
# Preview the first rows of the majority-class subset.
df_majority.head()

Unnamed: 0,feature_1,feature_2,target
0,-1.085631,0.551302,0
1,0.997345,0.419589,0
2,0.282978,1.815652,0
3,-1.506295,-0.25275,0
4,-0.5786,-0.292004,0


In [38]:
# Preview the first rows of the minority-class subset.
df_minority.head()

Unnamed: 0,feature_1,feature_2,target
900,1.699768,2.139033,1
901,1.367739,2.025577,1
902,1.795683,1.803557,1
903,2.213696,3.312255,1
904,3.033878,3.187417,1


In [2]:
from sklearn.utils import resample

In [39]:
# Upsample the minority class with replacement until it has as many rows as the majority class.
# `replace` takes the boolean True; the string 'True' only worked because a non-empty string is truthy.
df_upsample_minority = resample(
    df_minority,
    replace=True,
    n_samples=len(df_majority),
    random_state=42,
)

In [40]:
# Check the (rows, columns) of the upsampled minority frame.
df_upsample_minority.shape

(900, 3)

In [42]:
# Stack the majority rows on top of the upsampled minority rows; original index labels are kept.
df_upsample = pd.concat(objs=[df_majority, df_upsample_minority], axis=0)

In [50]:
# Count rows per class label after upsampling.
df_upsample['target'].value_counts()

0    900
1    900
Name: target, dtype: int64

In [52]:
## Generating another dataframe

import numpy as np
import pandas as pd

# Fix NumPy's global seed so every draw below is repeatable
np.random.seed(42)

# Number of rows in the synthetic dataset
n_samples = 1000

# Three normally distributed features, drawn in column order
features = dict(
    Feature1=np.random.normal(0, 1, n_samples),
    Feature2=np.random.normal(5, 2, n_samples),
    Feature3=np.random.normal(-3, 1.5, n_samples),
)

# Imbalanced label vector: 50 ones (5%) followed by zeros, then shuffled in place
target = np.concatenate([np.ones(50), np.zeros(n_samples - 50)])
np.random.shuffle(target)

# Assemble the features and the label into a single frame
data = pd.DataFrame(features).assign(Target=target)

# Write the frame to CSV without the index column
imbalanced_file_path = 'imbalanced_dataset.csv'
data.to_csv(imbalanced_file_path, index=False)

imbalanced_file_path


'imbalanced_dataset.csv'

In [53]:
# Load the CSV from the working directory into a frame.
df_main = pd.read_csv(filepath_or_buffer='imbalanced_dataset.csv')

In [54]:
# Preview the first rows of the loaded frame.
df_main.head()

Unnamed: 0,Feature1,Feature2,Feature3,Target
0,0.496714,7.798711,-4.012767,0.0
1,-0.138264,6.849267,-3.216778,0.0
2,0.647689,5.119261,-4.18863,0.0
3,1.52303,3.706126,-3.461942,0.0
4,-0.234153,6.396447,-5.840422,0.0


In [55]:
# Count rows per class label in the loaded frame.
df_main['Target'].value_counts()

0.0    950
1.0     50
Name: Target, dtype: int64

In [68]:
df_majority=df[df_main['Target']==0.0]

In [69]:
df_minority=df[df_main['Target']==1.0]

In [70]:
from sklearn.utils import resample

In [71]:
df_upsample=resample(df_minority,replace='True',n_samples=len(df_majority),random_state=43)

In [82]:
df_upsample['target'].value_counts()

0    820
1    130
Name: target, dtype: int64

In [74]:
df_new_sample=pd.concat([df_upsample,df_majority])

In [75]:
df_new_sample

Unnamed: 0,feature_1,feature_2,target
83,0.807308,0.609791,0
42,-0.390900,-1.112364,0
966,0.671933,1.370988,1
408,0.035941,-0.813571,0
320,-0.817668,-1.066613,0
...,...,...,...
995,1.376371,2.845701,1
996,2.239810,0.880077,1
997,1.131760,1.640703,1
998,2.902006,0.390305,1


In [77]:
df_new_sample['target'].value_counts()

0    1677
1     223
Name: target, dtype: int64

950

In [1]:
##Practice Imbalanced Dataset

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Seed NumPy's global generator (make_classification below is seeded through random_state)
np.random.seed(42)

# Synthetic two-class problem: 1000 rows, 10 features, class weights 0.95 / 0.05
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=3,
    n_redundant=2,
    n_classes=2,
    weights=[0.95, 0.05],
    flip_y=0,
    random_state=42,
)

# Wrap the features in a frame with columns Feature1..Feature10 and attach the label
feature_names = ['Feature' + str(i) for i in range(1, 11)]
df = pd.DataFrame(X, columns=feature_names)
df['Target'] = y

# Relative frequency of each class
class_distribution = df['Target'].value_counts(normalize=True)
print("Class distribution:")
print(class_distribution)

# Write the frame to CSV without the index column
df.to_csv('imbalanced_dataset.csv', index=False)

# Preview the first rows
df.head()


Class distribution:
0    0.95
1    0.05
Name: Target, dtype: float64


Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,Target
0,-1.67081,-0.555017,0.222569,-1.254161,-1.154315,-2.344112,-0.276889,-0.671721,-1.402369,-0.749472,0
1,-2.272506,1.10671,0.644907,-1.739205,-0.322992,-1.570785,1.555199,-1.114532,-2.274621,-0.118879,0
2,-1.315657,0.283053,1.657337,1.119688,0.013667,0.752199,-0.368619,0.807046,-0.339311,0.392204,0
3,-1.325351,1.502734,0.269946,-1.780626,-0.880073,-1.71793,0.091168,0.788204,0.776229,-0.547307,0
4,-1.267889,-0.790771,0.952897,-0.073107,-0.67078,-0.682904,0.176484,0.451226,1.015937,-0.233383,0


In [2]:
# Preview the first rows of the practice frame.
df.head()

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Feature5,Feature6,Feature7,Feature8,Feature9,Feature10,Target
0,-1.67081,-0.555017,0.222569,-1.254161,-1.154315,-2.344112,-0.276889,-0.671721,-1.402369,-0.749472,0
1,-2.272506,1.10671,0.644907,-1.739205,-0.322992,-1.570785,1.555199,-1.114532,-2.274621,-0.118879,0
2,-1.315657,0.283053,1.657337,1.119688,0.013667,0.752199,-0.368619,0.807046,-0.339311,0.392204,0
3,-1.325351,1.502734,0.269946,-1.780626,-0.880073,-1.71793,0.091168,0.788204,0.776229,-0.547307,0
4,-1.267889,-0.790771,0.952897,-0.073107,-0.67078,-0.682904,0.176484,0.451226,1.015937,-0.233383,0


In [7]:
# Absolute row count per class label.
df['Target'].value_counts()

0    950
1     50
Name: Target, dtype: int64

In [8]:
# Rows whose Target is 0 (the majority class).
df_majority = df.loc[df['Target'].eq(0)]

In [9]:
# Rows whose Target is 1 (the minority class).
df_minority = df.loc[df['Target'].eq(1)]

In [10]:
from sklearn.utils import resample

In [11]:
# Upsample the minority class with replacement to the majority-class row count.
# `replace` takes the boolean True (the string 'True' only worked by being truthy),
# and a fixed random_state makes the drawn rows repeatable across re-runs.
df_upsample=resample(df_minority,replace=True,n_samples=len(df_majority),random_state=42)

In [16]:
# Stack the upsampled minority rows on top of the majority rows; original index labels are kept.
df_new_sample = pd.concat(objs=[df_upsample, df_majority], axis=0)

In [17]:
# Count rows per class label in the combined frame to check the balance.
df_new_sample['Target'].value_counts()

1    950
0    950
Name: Target, dtype: int64