### [UC16] Evaluate dataset preprocessing influence

1. Load datasets

In [5]:
import pandas as pd

# Read the raw CSV once; the last column is used as the target, all others as features.
DATASET = pd.read_csv('../data/NATICUSdroid.csv')
label_column = DATASET.iloc[:, -1]
X = DATASET.iloc[:, :-1]

# factorize returns (integer codes, unique values); only the codes are kept as y.
y = pd.factorize(label_column)[0]

# Filled by the preprocessing cell with {"name": ..., "data": ...} entries.
DATASET_VARIANTS = list()

2. Provide preprocessed dataset variants

In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

# Build every preprocessed variant of the feature matrix X.
#
# The variant list is rebuilt from scratch at the end of this cell instead of
# being appended to, so re-running the cell cannot leave duplicate entries
# behind (duplicates would make the training loop train each variant again).
#
# NOTE(review): every transformer below is fitted on the full X, i.e. before the
# train/test split done in the training cell, so test-set statistics influence
# the transformed features. Consider fitting on the training split only.

# Standardisation: per-feature zero mean and unit variance.
scaler_standard = StandardScaler()
DATASET_scaled_standard = pd.DataFrame(scaler_standard.fit_transform(X), columns=X.columns)

# Min-max scaling with the scaler's default feature range.
scaler_min_max = MinMaxScaler()
DATASET_scaled_min_max = pd.DataFrame(scaler_min_max.fit_transform(X), columns=X.columns)

# Robust scaling with the scaler's default (median / quantile-range) settings.
scaler_robust = RobustScaler()
DATASET_scaled_robust = pd.DataFrame(scaler_robust.fit_transform(X), columns=X.columns)

# A float n_components in (0, 1) asks PCA to keep as many components as needed
# to explain that fraction of the variance; the columns are unnamed components.
pca = PCA(n_components=0.95)
DATASET_pca = pd.DataFrame(pca.fit_transform(X))

# Original features plus pairwise interaction terms (no squares, no bias column).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
DATASET_poly = pd.DataFrame(poly.fit_transform(X), columns=poly.get_feature_names_out(X.columns))

# Drop low-variance features; get_support() recovers the surviving column names.
selector = VarianceThreshold(threshold=0.01)
DATASET_feature_selected = pd.DataFrame(selector.fit_transform(X), columns=X.columns[selector.get_support()])

# Same names and order as before; each entry holds a plain NumPy feature matrix.
DATASET_VARIANTS = [
    {"name": "standard_scaled", "data": DATASET_scaled_standard.to_numpy()},
    {"name": "min_max_scaled", "data": DATASET_scaled_min_max.to_numpy()},
    {"name": "robust_scaled", "data": DATASET_scaled_robust.to_numpy()},
    {"name": "pca", "data": DATASET_pca.to_numpy()},
    {"name": "poly", "data": DATASET_poly.to_numpy()},
    {"name": "feature_selected", "data": DATASET_feature_selected.to_numpy()},
]

# Every variant must still line up row-for-row with the labels built from DATASET.
assert all(variant["data"].shape[0] == X.shape[0] for variant in DATASET_VARIANTS), \
    "a preprocessing step changed the number of rows"

3. Define experiment parameters

In [7]:
# Experiment name passed to every training run of this notebook.
EXPERIMENT_NAME = 'UC16_evaluate_dataset_preprocessing_influence'

# One fixed hyperparameter set shared by all dataset variants, so that the
# preprocessing is the only thing that differs between runs.
HYPERPARAMETERS = dict(
    n_layers=5,
    filters=32,
    kernel_size=3,
    activation='sigmoid',
    use_batch_normalization=True,
    dropout_rate=0.2,
    optimizer='Adam',
    learning_rate=1e-4,
    batch_size=32,
)

4. Run testing

In [None]:
from mlcb.models.tabular.pytorch import PytorchDenseNetTunable
from sklearn.model_selection import train_test_split

# Train one model per preprocessed variant; each run is logged under the
# variant's name within the shared experiment.
for dataset in DATASET_VARIANTS:
    # Each variant was built from X, which already excludes the label column,
    # so the whole array is features. Slicing off the last column here would
    # silently discard a real feature. A loop-local name is used so the global
    # X DataFrame stays intact for re-running the preprocessing cell.
    features = dataset['data']
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=42, shuffle=True
    )
    # `cls` remains bound to the last model after the loop; the final cell
    # uses it to close the logger.
    cls = PytorchDenseNetTunable(X_train, y_train, X_test, y_test)
    cls.train(experiment_name=EXPERIMENT_NAME, hyperparameters=HYPERPARAMETERS, run_name=dataset['name'])

5. Analyze training process

Tuning statistics are available at http://127.0.0.1:5000/.

6. Close MLFlow for this experiment

In [None]:
# `cls` is the model instance left bound by the last iteration of the training loop,
# so this cell only works after that loop has run at least once.
# NOTE(review): `_close_mlflow` is underscore-prefixed (private by convention) — confirm
# it is the intended way to shut MLflow down, and that closing only the last model's
# logger is sufficient for all runs.
cls.logger._close_mlflow()