# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Outlier removal](#Outlier-removal)
* [Results](#Results)

# Introduction

To speed up development, 5-fold train-test splits were preprocessed and saved as separate .feather files. This notebook uses those splits to create copies of the training data with outliers removed by LocalOutlierFactor using the Mahalanobis distance.

However, the notebook ran for over 800 minutes without a single fold completing, so I canceled it.

In [1]:
# Record the wall-clock start time of the notebook; the last cell subtracts it
# from the current time to report the total elapsed time.
from time import time

notebook_begin_time = time()

# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Seed every random number generator used by this notebook.

    Stores ``seed`` in the ``PYTHONHASHSEED`` environment variable and then
    seeds the ``random``, NumPy and TensorFlow generators with the same value.

    NOTE: per the Python docs, ``PYTHONHASHSEED`` is read at interpreter
    start-up, so assigning it here only affects processes launched afterwards.
    """
    environ["PYTHONHASHSEED"] = str(seed)
    # same order as before: stdlib random, NumPy, TensorFlow
    for seed_generator in (random_seed, np_seed, set_seed):
        seed_generator(seed)


# Seed everything once; the value stays available as SEED.
SEED = 2021
reset_seeds(SEED)

# The seeding helpers are single-use, so drop them from the notebook namespace.
del environ, random_seed, np_seed, set_seed, reset_seeds

In [2]:
# extensions (IPython magics: these lines only work inside IPython/Jupyter)
# autotime: presumably the source of the "time: ..." line printed after each cell
%load_ext autotime
# NOTE(review): lab_black and nb_black both look like code-formatting extensions
# aimed at different front ends; presumably only one is needed - confirm.
%load_ext lab_black
%load_ext nb_black

In [3]:
# core
import pandas as pd

# utility
from gc import collect as gc_collect

# faster sklearn
from sklearnex import patch_sklearn

# patch_sklearn() is called before the sklearn import below; keep this order
patch_sklearn()
del patch_sklearn

# local outlier factor
from sklearn.neighbors import LocalOutlierFactor

# display outputs w/o print calls: with "all", every expression statement in a
# cell is displayed instead of only the last one
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
del InteractiveShell

# hide warnings
# NOTE: the filter call below is commented out, so warnings are currently NOT
# hidden; the import/del pair has no effect until the call is re-enabled.
import warnings

# warnings.filterwarnings("ignore")
del warnings

time: 1.75 s


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Location of CV .feather files: the folder holding the cv_<fold>/ subfolders
# that the outlier-removal loop reads from
CV_FEATHERS_FOLDER = "."

# Location where this notebook will output: results are written into
# cv_<fold>/ subfolders of this folder
DATA_OUT_FOLDER = "."

# Run a garbage-collection pass; the return value is bound to `_`, presumably
# to keep it out of the cell output.
_ = gc_collect()

time: 113 ms


# Outlier removal

In [5]:
# Remove outliers from the training data of every CV fold.
#
# For each fold folder cv_<k> under CV_FEATHERS_FOLDER (k = 0, 1, ...):
#   1. load X_train_untransformed.feather,
#   2. flag inliers with LocalOutlierFactor using the Mahalanobis distance,
#   3. write the inlier rows of X and of y to
#      DATA_OUT_FOLDER/cv_<k>/{X,y}_train_LOFm.feather with a fresh 0..n-1 index.
# The loop stops at the first fold whose X file does not exist.

# Imported here so the cell is self-contained.
from os import makedirs

import numpy as np

fold_num = 0
while True:
    fold_begin = time()

    # load training data
    try:
        X_train: pd.DataFrame = pd.read_feather(
            f"{CV_FEATHERS_FOLDER}/cv_{fold_num}/X_train_untransformed.feather"
        )
    except FileNotFoundError:
        break

    n_before = len(X_train)
    print(f"before outlier removal, n = {n_before}")

    # The Mahalanobis metric needs an inverse covariance matrix ("VI"). Passing
    # it explicitly avoids relying on backend-specific defaults (tree-based
    # neighbor searches refuse to run without it). pinv tolerates a singular
    # covariance; atleast_2d covers the single-feature case where np.cov
    # returns a scalar.
    inverse_covariance = np.linalg.pinv(
        np.atleast_2d(np.cov(X_train.to_numpy(), rowvar=False))
    )
    # fit_predict labels inliers 1 and outliers -1
    inlier_mask = (
        LocalOutlierFactor(
            metric="mahalanobis",
            metric_params={"VI": inverse_covariance},
            n_jobs=-1,
        ).fit_predict(X_train)
        == 1
    )
    del inverse_covariance
    n_after = int(inlier_mask.sum())

    # make sure the per-fold output folder exists before writing into it
    fold_out_folder = f"{DATA_OUT_FOLDER}/cv_{fold_num}"
    makedirs(fold_out_folder, exist_ok=True)

    # to_feather needs a default index, hence reset_index(drop=True)
    X_train.loc[inlier_mask].reset_index(drop=True).to_feather(
        f"{fold_out_folder}/X_train_LOFm.feather"
    )
    # release X before y is loaded to keep peak memory down
    del X_train
    _ = gc_collect()
    print(f"{n_before - n_after} outliers removed, n = {n_after}")

    # keep exactly the same rows of y as were kept in X
    pd.read_feather(
        f"{CV_FEATHERS_FOLDER}/cv_{fold_num}/y_train_untransformed.feather"
    ).loc[inlier_mask].reset_index(drop=True).to_feather(
        f"{fold_out_folder}/y_train_LOFm.feather"
    )
    del inlier_mask

    print(f"fold {fold_num + 1} completed in {time() - fold_begin:.2f} s")
    del fold_begin
    fold_num += 1
    _ = gc_collect()

before outlier removal, n = 73258


# Results

Let's peek at the results.

In [None]:
# Preview one X file and one y file written by the outlier-removal loop.
# Paths are built from DATA_OUT_FOLDER so the preview follows the location the
# loop writes to instead of a hard-coded "." folder.
# With ast_node_interactivity = "all", head() is displayed and info() prints
# its summary without explicit print calls.
# NOTE(review): X is read from fold 0 but y from fold 1 - confirm this is intended.
_ = pd.read_feather(f"{DATA_OUT_FOLDER}/cv_0/X_train_LOFm.feather")
_.head(5)
_.info()

_ = pd.read_feather(f"{DATA_OUT_FOLDER}/cv_1/y_train_LOFm.feather")
_.head(5)
_.info()

In [None]:
# Report the wall-clock seconds since notebook_begin_time was recorded, then
# run a final garbage-collection pass (rebinding `_` also drops the last
# previewed DataFrame).
print(f"Time elapsed since notebook_begin_time: {time() - notebook_begin_time} s")
_ = gc_collect()

[^top](#Contents)