In [41]:
import joblib
import math
import multiprocessing as mp
import os
from typing import List
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold

In [2]:
# Move the working directory one level up, but only the first time this cell
# runs in a kernel session: the `first_run` flag keeps a re-run of the cell
# from climbing yet another directory.
if "first_run" not in globals():
    first_run = True
    # os.path.dirname understands the platform's path separator, whereas
    # splitting on a literal "/" leaves a Windows path untouched.
    os.chdir(os.path.dirname(os.getcwd()))

# Load Data

In [3]:
# Each archive unpacks into a (features, labels) pair.
TRAIN_SET_PATH = "../data/train/preprocessed/train_features_labels.joblib.gz"
VALIDATION_SET_PATH = "../data/train/preprocessed/validation_features_labels.joblib.gz"

X_train, y_train = joblib.load(TRAIN_SET_PATH)
X_validation, y_validation = joblib.load(VALIDATION_SET_PATH)

# Hold your SMOTE for a moment

SMOTE has become a ubiquitous way to handle imbalanced classes by oversampling the minority class. However, because many of our features have low variance due to a large number of zero values, generating artificial samples from them can actually be counterproductive. We will therefore experiment with both SMOTE and a custom undersampler that tries to capture most of the variance of the majority class. Whichever strategy yields better results for our baseline model is the one we will move forward with.

### 1. Custom undersampler

In [32]:
class BinaryUndersampler:
    """Undersample the majority class of a binary dataset, keeping its most varied subset.

    The majority-class rows are shuffled and cut into ``n_splits`` disjoint
    samples, ``n_iterations`` times over. The sample whose features show the
    highest mean variance across all rounds is stored in
    ``highest_variance_sample``. No significance test (e.g. bootstrapping) is
    applied to the variance differences.

    Parameters
    ----------
    n_iterations : int, default 5
        Number of independent shuffle-and-split rounds.
    n_jobs : int, default -1
        Worker processes used to run the rounds. Values below 1 mean "use
        every core"; larger values are capped at ``mp.cpu_count()``.
    random_state : int or None, default None
        Seed for the shuffles. ``None`` draws fresh entropy on every ``fit``.

    Attributes
    ----------
    minority_class
        Label of the less frequent class, set by ``fit``.
    n_splits : int
        Number of disjoint samples the majority class is cut into, set by ``fit``.
    highest_variance_sample : tuple
        ``(X_sample, y_sample)`` of the selected majority-class rows;
        ``(None, None)`` until ``fit`` has run.
    """

    def __init__(self, n_iterations: int = 5, n_jobs: int = -1, random_state=None):
        self.n_iterations = n_iterations
        self.random_state = random_state
        self.highest_variance_sample = (None, None)
        # `>= 1` (not `> 1`) so that an explicit n_jobs=1 really means a single worker.
        self.n_jobs = min(mp.cpu_count(), n_jobs) if n_jobs >= 1 else mp.cpu_count()

    def fit(self, X: np.ndarray, y: np.ndarray) -> "BinaryUndersampler":
        """Select the majority-class sample with the highest mean feature variance.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y : array-like of shape (n_samples,)
            Binary labels. They must be numeric, because they travel through
            the split as an extra column of ``X``.

        Returns
        -------
        BinaryUndersampler
            ``self``, with ``minority_class``, ``n_splits`` and
            ``highest_variance_sample`` populated.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``X`` and ``y`` disagree on the number of
            rows, ``y`` does not hold exactly two classes, or
            ``n_iterations`` is below 1.
        """
        X = np.asarray(X)
        y = np.asarray(y).ravel()
        if X.ndim != 2 or X.shape[0] != y.shape[0]:
            raise ValueError("X must be 2-D with exactly one label in y per row")
        if self.n_iterations < 1:
            raise ValueError("n_iterations must be at least 1")

        # value_counts sorts by descending frequency: majority first, minority last.
        class_counts = pd.Series(y).value_counts()
        if len(class_counts) != 2:
            raise ValueError("y must contain exactly two classes")
        majority_class = class_counts.index[0]
        self.minority_class = class_counts.index[-1]

        is_majority = y == majority_class
        n_majority = int(is_majority.sum())
        # Never ask for more splits than there are majority rows (would yield empty samples).
        self.n_splits = min(math.floor(X.shape[0] / class_counts.iloc[-1]), n_majority)

        # One independent seed per round: forked workers inherit the same global
        # RNG state and would otherwise all produce the identical shuffle.
        seeds = np.random.SeedSequence(self.random_state).spawn(self.n_iterations)
        tasks = [(X[is_majority], y[is_majority], self.n_splits, seed) for seed in seeds]

        if self.n_jobs == 1 or len(tasks) == 1:
            best_of_iteration = [self._best_sample_of_iteration(task) for task in tasks]
        else:
            # NOTE: with the "spawn" start method (default on macOS/Windows) workers
            # must be able to import this class, which fails when it is defined in
            # a notebook cell; use n_jobs=1 in that situation.
            with mp.Pool(min(self.n_jobs, len(tasks))) as pool:
                best_of_iteration = pool.map(BinaryUndersampler._best_sample_of_iteration, tasks)

        X_best, y_best, _ = max(best_of_iteration, key=lambda result: result[2])
        # The label column was cast to X's dtype while stacked; restore the original dtype.
        self.highest_variance_sample = (X_best, y_best.astype(y.dtype))
        return self

    @staticmethod
    def _best_sample_of_iteration(task: tuple) -> tuple:
        """Run one shuffle-and-split round and return its highest-variance sample.

        ``task`` is ``(X, y_label, n_splits, seed)``; the result is the
        ``(X, y_label, mean_variance)`` tuple of the winning sample.
        """
        X, y_label, n_splits, seed = task
        samples = BinaryUndersampler._shuffle_split_observations(
            X, y_label, n_splits, np.random.default_rng(seed)
        )
        scored = (BinaryUndersampler._get_mean_feature_variance(sample) for sample in samples)
        return max(scored, key=lambda result: result[2])

    @staticmethod
    def _shuffle_split_observations(X: np.ndarray, y_label: np.ndarray, n_splits: int, rng=None) -> List:
        """Return a list of disjoint samples with the `y_label` concatenated at axis=1 (column).

        Rows are shuffled with ``rng`` (a ``numpy.random.Generator``) when one
        is given, otherwise with NumPy's global random state.
        """
        permute = np.random.permutation if rng is None else rng.permutation
        stacked = np.concatenate((X, np.reshape(y_label, (-1, 1))), axis=1)
        return np.array_split(permute(stacked), n_splits)

    @staticmethod
    def _get_mean_feature_variance(sample: np.ndarray) -> tuple:
        """Split `sample` back into features and label column and score it.

        Returns ``(X, y_label, mean_variance)`` where ``mean_variance`` is the
        mean over features of each feature's variance across the sample's rows.
        """
        X, y_label = sample[:, :-1], sample[:, -1]
        mean_variance = float(np.mean(np.ma.var(X, axis=0)))
        return (X, y_label, mean_variance)

In [44]:
# Scratch check of the shuffle-and-split expression on the training data:
# append the labels as a last column, shuffle the rows, cut into two parts.
np.array_split(
    np.random.permutation(
        np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)
    ),
    2,
)

[array([[ 2.23806427e-04,  0.00000000e+00,  1.00000000e+00, ...,
         -3.32960355e-01,  0.00000000e+00,  0.00000000e+00],
        [-1.23304598e+00,  0.00000000e+00,  1.00000000e+00, ...,
         -1.92218244e-02,  0.00000000e+00,  0.00000000e+00],
        [-1.23304598e+00,  1.00000000e+00,  0.00000000e+00, ...,
         -4.58455768e-01,  0.00000000e+00,  0.00000000e+00],
        ...,
        [-1.31012535e+00,  1.00000000e+00,  0.00000000e+00, ...,
         -4.58455768e-01,  0.00000000e+00,  0.00000000e+00],
        [ 3.08541254e-01,  0.00000000e+00,  1.00000000e+00, ...,
          3.57264413e-01,  0.00000000e+00,  0.00000000e+00],
        [ 7.73031684e-02,  0.00000000e+00,  1.00000000e+00, ...,
         -3.32960355e-01,  0.00000000e+00,  0.00000000e+00]]),
 array([[ 0.53977934,  0.        ,  1.        , ..., -0.39570806,
          0.        ,  0.        ],
        [ 2.8521602 ,  0.        ,  1.        , ..., -0.39570806,
          0.        ,  0.        ],
        [ 0.30854125,  1.

In [59]:
# Mean of the per-column variances of X.
np.ma.var(X, axis=0).mean()

0.2392738513537641

In [49]:
# NOTE(review): `y_label` is not assigned at notebook scope in the visible cells —
# presumably left over from interactive debugging; confirm its origin or remove this cell.
y_label

(28792,)

(57584, 17)