### Playground for testing feature preprocessing

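This notebook is a playground for testing different feature preprocessing techniques. It generates a synthetic regression dataset, splits it into train, test and cross-validation sets, and fits models behind a preprocessing pipeline. The goal is to find the preprocessing techniques that work best for this dataset.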

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import numpy as np

# Synthetic regression problem: 50000 samples with 32 features, noise=1.
# The same seed is reused for data generation and for both splits.
random_state = 42
X, y = make_regression(
    n_samples=50000,
    n_features=32,
    noise=1,
    random_state=random_state,
)

# First split: 60% of the rows for training, 40% held out ("blind").
X_train, X_blind, y_train, y_blind = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=random_state,
)

# Second split: halve the held-out rows into a test set and a cv set
# (each is 20% of the full data).
X_test, X_cv, y_test, y_cv = train_test_split(
    X_blind,
    y_blind,
    test_size=0.5,
    random_state=random_state,
)


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Baseline model: standardise the features, then fit a linear regression.
# The scaler is part of the pipeline, so it is fitted on the training split only.
model = make_pipeline(StandardScaler(), LinearRegression())
model.fit(X_train, y_train)

# Predictions for each of the three splits.
P_train = model.predict(X_train)
P_test = model.predict(X_test)
P_cv = model.predict(X_cv)

# One error variable per split. The test error gets its own name (err_test);
# storing it in err_train would overwrite the training MSE and make the
# "train" line below report the test score.
err_train = mean_squared_error(y_train, P_train)
err_test = mean_squared_error(y_test, P_test)
err_cv = mean_squared_error(y_cv, P_cv)

print('MSE train:', err_train)
print('MSE test:', err_test)
print('MSE cv:', err_cv)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Hyper-parameters for the MLP: two hidden layers of 70 units, ReLU activation,
# Adam solver, no L2 penalty (alpha=0), mini-batches of 10 samples.
params = {
    'hidden_layer_sizes': [70, 70],
    'activation': 'relu',
    'solver': 'adam',
    'alpha': 0.0,
    'batch_size': 10,
    'random_state': 0,
    'tol': 0.0001,
    'nesterovs_momentum': False,
    'learning_rate': 'constant',
    'learning_rate_init': 0.01,
    'max_iter': 1000,
    'shuffle': True,
    'n_iter_no_change': 50,
    'verbose': False,
}

# Same preprocessing as the linear baseline (StandardScaler), different estimator.
model = make_pipeline(StandardScaler(), MLPRegressor(**params))
model.fit(X_train, y_train)

# Predictions for each of the three splits.
P_train = model.predict(X_train)
P_test = model.predict(X_test)
P_cv = model.predict(X_cv)

# One error variable per split. The test error gets its own name (err_test);
# storing it in err_train would overwrite the training MSE and make the
# "train" line below report the test score.
err_train = mean_squared_error(y_train, P_train)
err_test = mean_squared_error(y_test, P_test)
err_cv = mean_squared_error(y_cv, P_cv)

print('MSE train:', err_train)
print('MSE test:', err_test)
print('MSE cv:', err_cv)

Produce Charts

In [None]:
# Chart the data (and, for 1 or 2 features, the fitted `model`).
#   > 2 features : grid of 3D scatters, one per pair of consecutive features
#   2 features   : single 3D scatter with the model surface
#   otherwise    : 2D scatter with the model curve
n_features = X_train.shape[1]

if n_features <= 2:
    # Only the 1- and 2-feature charts show scores in their title. The MSEs are
    # computed here from the current `model`, so each slot (train/test/cv)
    # reports the score of its own split instead of repeating the train value.
    mse_train = mean_squared_error(y_train, model.predict(X_train))
    mse_test = mean_squared_error(y_test, model.predict(X_test))
    mse_cv = mean_squared_error(y_cv, model.predict(X_cv))
    title = f'Regression with MSE train: {mse_train:.2f}, test: {mse_test:.2f}, cv: {mse_cv:.2f}'

if n_features > 2:
    # Pairs of consecutive features: (0, 1), (2, 3), ...
    # With an odd feature count the trailing unpaired feature is not plotted.
    n_pairs = n_features // 2

    # Four columns; as many rows as needed so no pair is silently dropped
    # (32 features -> 16 pairs -> the 4x4 grid on a 15x15 figure).
    ncols = 4
    nrows = int(np.ceil(n_pairs / ncols))
    fig = plt.figure(figsize=(15, 3.75 * nrows))

    for index in range(n_pairs):
        skip = index * 2

        # Add a 3D subplot showing the cv split for this feature pair
        ax = fig.add_subplot(nrows, ncols, index + 1, projection='3d')
        ax.title.set_text(f'Feature {skip} vs Feature {skip + 1}')
        ax.scatter(X_cv[:, skip], X_cv[:, skip + 1], y_cv, alpha=0.5, color="green", label=f"cv ({X_cv.shape[0]})")
        ax.set_zlabel('Y')

elif n_features == 2:
    ax = plt.figure().add_subplot(projection='3d')

    # Evaluate the model on a 50x50 grid spanning the range of both features
    x1 = np.linspace(X[:, 0].min(), X[:, 0].max(), 50)
    x2 = np.linspace(X[:, 1].min(), X[:, 1].max(), 50)
    xv, yv = np.meshgrid(x1, x2)

    xf = np.concatenate([xv.reshape(-1, 1), yv.reshape(-1, 1)], axis=1)
    yf = model.predict(xf)
    ax.scatter(xf[:, 0], xf[:, 1], yf, color="grey", alpha=0.2, s=2, label="model")

    ax.scatter(X_train[:, 0], X_train[:, 1], y_train, alpha=0.5, color="blue", marker='^', label=f"train ({X_train.shape[0]})")
    ax.scatter(X_test[:, 0], X_test[:, 1], y_test, alpha=0.5, color="red", label=f"test ({X_test.shape[0]})")
    ax.scatter(X_cv[:, 0], X_cv[:, 1], y_cv, alpha=0.5, color="green", label=f"cv ({X_cv.shape[0]})")

    ax.set_xlabel('Feature 1')
    ax.set_ylabel('Feature 2')
    ax.set_zlabel('Y')

    plt.title(title)
    plt.legend()

else:
    plt.scatter(X_train, y_train, alpha=0.5, color="blue", marker='^', label=f"train ({X_train.shape[0]})")
    plt.scatter(X_test, y_test, alpha=0.5, color="red", label=f"test ({X_test.shape[0]})")
    plt.scatter(X_cv, y_cv, alpha=0.5, color="green", label=f"cv ({X_cv.shape[0]})")

    # Model curve sampled at 20 evenly spaced points over the feature range
    xl = np.linspace(X.min(), X.max(), 20)
    yl = model.predict(xl.reshape(-1, 1))

    plt.plot(xl, yl, color="black", label="model")
    plt.xlabel('Feature')
    plt.ylabel('Y')

    plt.title(title)
    plt.legend()

plt.show()