In [1]:
import exploratory_data_analysis.eda_functions as eda
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
## metrics
from sklearn.metrics import accuracy_score,\
recall_score, precision_score, f1_score,\
confusion_matrix, classification_report, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn.utils import resample

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import scipy
from scipy import stats
from scipy.stats.mstats import winsorize

import warnings as cuidado
cuidado.filterwarnings('ignore')

In [4]:
# Create a random seed
seed = 12345

# Generate 1000 samples of two normally distributed variables
# with mean 1 and standard deviation 1
x1 = np.random.normal(1, 1, 1000)
x2 = np.random.normal(1, 1, 1000)

# Create a binary class label with 10% of the samples being 1
y = np.random.binomial(1, 0.1, 1000)

# Create a Pandas DataFrame
df = pd.DataFrame({'x1': x1, 'x2': x2, 'y': y})

df.head()

Unnamed: 0,x1,x2,y
0,1.269803,1.159649,0
1,0.071632,1.483931,0
2,0.5944,0.45661,0
3,1.900292,-2.606048,0
4,0.469326,-0.066553,0


In [12]:
def upsample_minority_class(data, feature):
    
    '''
    Take a pandas df and one binary feature.
    identify the minority class,
    upsamples the minority in a binary class in a DataFrame 
    to match the size of the majority class.

      Args:
        data: The DataFrame to be upsampled, pandas Data Frame.
        feature: the columns name, string.
        
      Returns:
        A DataFrame with the minority class upsampled.
    '''
    
    ## Identify data points from majority and minority classes
    
    class_1 = data[feature].value_counts().index[0]
    class_2 = data[feature].value_counts().index[1]
    
    majority_class = None
    minority_class = None

    if class_1 > class_2:
        majority_class = class_1
        minority_class = class_2
    else: 
        majority_class = class_2
        minority_class = class_1

    
    data_majority = data[data[feature] == majority_class]
    data_minority = data[data[feature] == minority_class]
    
    n_samples = len(data_majority)
                              
    data_minority_upsampled = resample(
          data_minority,
          replace=True,
          n_samples=n_samples,
          random_state=None)

    data_upsampled = pd.concat([data_majority, \
                                data_minority_upsampled]).reset_index(drop=True)

    return data_upsampled

In [13]:
upsample_minority_class(df, 'y')

Unnamed: 0,x1,x2,y
0,1.198778,2.324036,1
1,1.922204,2.232644,1
2,0.533180,0.712336,1
3,1.989396,1.073900,1
4,1.616655,0.908080,1
...,...,...,...
173,-0.990982,0.627896,0
174,0.900598,1.025413,0
175,1.632645,-0.874339,0
176,1.287369,0.405476,0
