In [19]:
from pprint import pprint

import numpy as np
from datasets import id_map, load_dataset, split_data
from linear_models import players, scale
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import ARDRegression, BayesianRidge, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer, RobustScaler, StandardScaler
from wrappers import BFS, RFS, RobustRegressor


In [20]:
pprint(id_map, sort_dicts=False)
pprint(list(players.keys()))


{'student_performance': 320,
 'concrete': 165,
 'computer_hardware': 29,
 'kidney_disease': 857,
 'fertility': 244,
 'algerian_forest_fires': 547,
 'airfoil_self_noise': 291,
 'istanbul_stock_exchange': 247}
['Ridge(1/(2n))',
 'ARDRegression',
 'BFS(inner)',
 'BFS(inner, use_positive=False)',
 'BFS(inner, use_scaling=False)',
 'RFS(inner)',
 'RFS(inner, use_positive=False)']


In [21]:
X, y = load_dataset("airfoil_self_noise")
X_train, y_train, X_test, y_test = split_data(X, y, test_size=0.5, bad_features=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(751, 5) (752, 5) (751,) (752,)


In [22]:
inner = Ridge(1 / (2 * len(X_train)))
models = [
    scale(Ridge(1 / (2 * len(X_train)))),
    scale(ARDRegression()),
    scale(BFS(inner, use_intercept=False)),
    scale(BFS(inner, use_intercept=True)),
    # scale(RFS(inner)),
    # scale(RFS(inner, use_positive=False)),
]
model_names = [
    "Ridge(1/(2n))",
    "ARDRegression",
    "BFS(inner, intercept=False)",
    "BFS(inner, intercept=True)",
]
for i, (name, model) in enumerate(zip(model_names, models)):
    model.fit(X_train, y_train)
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    train_mse = np.square(train_pred - y_train).mean()
    test_mse = np.square(test_pred - y_test).mean()
    print(f"{i} {name:<40} {train_mse:<8.4f} (train),   {test_mse:<8.4f} (test)")


0 Ridge(1/(2n))                            0.8257   (train),   0.8594   (test)
1 ARDRegression                            0.8266   (train),   0.8297   (test)
2 BFS(inner, intercept=False)              0.8281   (train),   0.8314   (test)
3 BFS(inner, intercept=True)               0.8267   (train),   0.8366   (test)


In [14]:
A = models[1]
B = models[2]


In [15]:
A[1].coef_


array([ 0.        , -0.33267927, -0.41888946,  0.08423909,  0.        ],
      dtype=float32)

In [16]:
B[1].feature_indices_


[array([2]),
 array([2, 1]),
 array([2, 1, 3]),
 array([2, 1, 3, 4]),
 array([2, 1, 3, 4, 0])]

In [17]:
# A[1].coef_
np.set_printoptions(suppress=True, precision=6, floatmode="fixed")

meta_coef = B[1].meta_estimator_[1].coef_
total_coef = np.zeros(X_train.shape[1])
for i, meta_w in enumerate(meta_coef):
    feats = B[1].feature_indices_[i]
    weights = B[1].estimators_[i].coef_
    total_coef[feats] += weights * meta_w
total_coef


array([ 0.000000, -0.130719, -0.164257,  0.026614,  0.004112])

In [7]:
def EDA(name):
    X, y = load_dataset(name)
    assert isinstance(X, np.ndarray) and isinstance(y, np.ndarray)
    n_samples, n_features = X.shape
    assert y.shape == (n_samples,)
    print(f"{n_samples} samples, {n_features} features")
    k = 10
    subset = np.random.choice(n_samples, k)
    for i in range(n_features):
        n_unique = len(np.unique(X[:, i]))
        mn, mx = np.min(X[:, i]), np.max(X[:, i])
        mean = np.mean(X[:, i])
        std = np.std(X[:, i])
        C = 8
        sample = [int(C * x) / C for x in sorted(X[subset, i].tolist())]
        correlation = np.corrcoef(X[:, i], y)[0, 1]
        mutual_info = mutual_info_regression(X[:, i].reshape(-1, 1), y)[0]
        print(
            f"#{i}: {mn:.2f}..({mean:.2f} ± {std:.2f})..{mx:.2f}, {n_unique} unique, {sample} | C={correlation:.2f}, MI={mutual_info:.2f}"
        )


In [8]:
EDA("istanbul_stock_exchange")


536 samples, 8 features
#0: 0.00..(267.50 ± 154.73)..535.00, 536 unique, [6.0, 75.0, 108.0, 199.0, 366.0, 433.0, 444.0, 506.0, 506.0, 525.0] | C=-0.03, MI=0.05
#1: -0.05..(0.00 ± 0.01)..0.07, 519 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.44, MI=0.09
#2: -0.05..(0.00 ± 0.01)..0.06, 527 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.60, MI=0.21
#3: -0.05..(0.00 ± 0.01)..0.05, 523 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.62, MI=0.26
#4: -0.05..(0.00 ± 0.01)..0.06, 503 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.26, MI=0.00
#5: -0.05..(0.00 ± 0.02)..0.06, 508 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.43, MI=0.06
#6: -0.05..(0.00 ± 0.01)..0.07, 532 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.66, MI=0.24
#7: -0.04..(0.00 ± 0.01)..0.05, 536 unique, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] | C=0.60, MI=0.23
