# Contents
* [Introduction](#Introduction)
* [Imports and configuration](#Imports-and-configuration)
* [Outlier removal](#Outlier-removal)
* [Results](#Results)

# Introduction

To speed development, 5-fold train-test splits were preprocessed and prepared as separate .feather files. This notebook uses those splits to create copies of training data with outliers removed by LocalOutlierFactor.

In [1]:
from time import time

notebook_begin_time = time()

# set random seeds

from os import environ
from random import seed as random_seed
from numpy.random import seed as np_seed
from tensorflow.random import set_seed


def reset_seeds(seed: int) -> None:
    """Utility function for resetting random seeds"""
    environ["PYTHONHASHSEED"] = str(seed)
    random_seed(seed)
    np_seed(seed)
    set_seed(seed)


reset_seeds(SEED := 2021)
del environ
del random_seed
del np_seed
del set_seed
del reset_seeds

In [2]:
# extensions
%load_ext autotime
%load_ext lab_black
%load_ext nb_black

In [3]:
# core
import pandas as pd

# utility
from gc import collect as gc_collect

# faster sklearn
from sklearnex import patch_sklearn

patch_sklearn()
del patch_sklearn

# local outlier factor
from sklearn.neighbors import LocalOutlierFactor

# display outputs w/o print calls
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"
del InteractiveShell

# hide warnings
import warnings

warnings.filterwarnings("ignore")
del warnings

time: 2.54 s


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Location of CV .feather files
CV_FEATHERS_FOLDER = "."

# Location where this notebook will output
DATA_OUT_FOLDER = "."

_ = gc_collect()

time: 118 ms


# Outlier removal

In [5]:
fold_num = 0
while True:
    fold_begin = time()

    # load training data
    try:
        X_train: pd.DataFrame = pd.read_feather(
            f"{CV_FEATHERS_FOLDER}/cv_{fold_num}/X_train_untransformed.feather"
        )
    except FileNotFoundError:
        break

    print(f"before outlier removal, n = {len(X_train)}")
    X_train = X_train.loc[LocalOutlierFactor(n_jobs=-1).fit_predict(X_train) == 1]
    save_data = lambda df, x_or_y: df.reset_index(drop=True).to_feather(
        f"{DATA_OUT_FOLDER}/cv_{fold_num}/{x_or_y}_train_LOF.feather"
    )
    save_data(X_train, "X")
    idx = X_train.index
    del X_train
    _ = gc_collect()
    print(f"after outlier removal, n = {len(idx)}")

    save_data(
        pd.read_feather(
            f"{CV_FEATHERS_FOLDER}/cv_{fold_num}/y_train_untransformed.feather"
        ).loc[idx],
        "y",
    )
    del idx

    fold_num += 1

    print(f"fold {fold_num + 1} completed in {time() - fold_begin:.2f} s")
    del fold_begin
    _ = gc_collect()

before outlier removal, n = 73258
after outlier removal, n = 72179
fold 2 completed in 515.17 s
before outlier removal, n = 75228
after outlier removal, n = 74109
fold 3 completed in 545.91 s
before outlier removal, n = 67433
after outlier removal, n = 66472
fold 4 completed in 431.36 s
before outlier removal, n = 60869
after outlier removal, n = 59972
fold 5 completed in 330.68 s
before outlier removal, n = 70220
after outlier removal, n = 69190
fold 6 completed in 474.75 s
time: 38min 18s


# Results

Let's peek at the results.

In [6]:
_ = pd.read_feather("./cv_0/X_train_LOF.feather")
_.head(5)
_.info()

_ = pd.read_feather("./cv_1/y_train_LOF.feather")
_.head(5)
_.info()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0.119926,-0.049218,0.039702,0.141101,0.044318,0.049631,0.016178,0.025797,-0.104466,-0.016044,...,-0.226529,-0.091773,0.028184,-0.074794,0.025474,0.046511,-0.002693,-0.017953,-0.146394,-0.06233
1,0.101377,-0.063714,-0.015971,-0.151561,-0.065328,-0.013746,0.008821,0.010461,-0.030857,-0.010367,...,-0.169826,0.051789,0.114774,-0.033263,-0.071615,-0.040535,-0.050813,0.021136,-0.189459,0.040756
2,-0.028015,-0.041617,0.012599,-0.03146,0.021788,0.039503,0.033654,0.114422,-0.080653,-0.03209,...,0.059661,0.020316,0.044898,0.008499,0.009812,-0.047551,0.085392,0.029538,0.05707,-0.029945
3,-0.008712,-0.003798,-0.05217,0.030298,0.049662,0.039806,0.056743,-0.186714,-0.15111,0.021191,...,0.107442,0.053569,-0.067552,-0.017051,-0.109469,-0.082999,-0.049325,0.031638,-0.027172,0.006479
4,-0.000265,-0.050629,-0.005011,-0.169305,-0.087526,-0.022268,-0.020403,-0.149949,-0.047235,0.03298,...,-0.128375,0.128055,0.047202,-0.024305,-0.168291,0.112876,0.017571,0.078376,0.099279,-0.04661


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72179 entries, 0 to 72178
Columns: 2048 entries, 0 to 2047
dtypes: float32(2048)
memory usage: 563.9 MB


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74109 entries, 0 to 74108
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       74109 non-null  int8 
dtypes: int8(1)
memory usage: 72.5 KB
time: 913 ms


In [7]:
print(f"Time elapsed since notebook_begin_time: {time() - notebook_begin_time} s")
_ = gc_collect()

Time elapsed since notebook_begin_time: 2309.190016746521 s
time: 119 ms


[^top](#Contents)