In [11]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import numpy as np
from kondo_ml.instance_selection import RegEnnSelector
from kondo_ml.utils import transform_selector_output_into_mask

In [2]:
# Generate random instances whose target is a random linear combination of the features plus noise
nr_samples = 1000
nr_features = 10
rs = np.random.RandomState(42)
X = rs.normal(0, 1, size=(nr_samples, nr_features))
# One coefficient per feature, drawn uniformly from [-10, 10) and rounded to 2 decimals
coefs = np.round(rs.uniform(-10, 10, nr_features), 2)
# A single matrix-vector product replaces the per-feature Python loop. The order of the draws
# from `rs` (X, coefs, noise) is unchanged, so the data equals the loop version up to float rounding.
y = X @ coefs + rs.normal(0, 1, size=nr_samples)

In [3]:
# Split data into train, val, test set: 40% of the samples are held out, the first 200 of
# them become the validation set and the remainder the test set.
# random_state is fixed so that "Restart Kernel -> Run All" reproduces the same split and scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test = X_test[:200], X_test[200:]
y_val, y_test = y_test[:200], y_test[200:]

In [4]:
def add_random_noise_arnaiz(
    y: np.ndarray, noise_frac: float = 0.1, random_state=None):
    """
    Add noise to a target array by swapping the target values of randomly chosen sample pairs.

    As described in paper "Instance Selection for regression" by Arnaiz-Gonzalez under 4.5, to add random noise
    for some % of the samples, we simply exchange the target values. Thus neither the feature nor target distribution
    is changed

    Parameters
    ----------
    y
        array containing target values
    noise_frac
        fraction (between 0 and 1) of samples that are affected. Thus int(len(y) * noise_frac / 2) is the
        number of swapped sample pairs
    random_state
        source of randomness for picking the pairs. None (default) draws from numpy's global random state,
        so ``np.random.seed`` still controls the result. An int seeds a private ``np.random.RandomState``.
        Any other object is used as is and must provide a numpy-compatible ``choice`` method
        (e.g. ``np.random.RandomState`` or ``np.random.Generator``).

    Returns
    -------
    y_noisy
        copy of ``y`` with the swapped values; the input array is never modified
    noisy_indices
        1-d integer array with the positions of all samples whose target was swapped
        (empty if no pair was swapped)

    Raises
    ------
    ValueError
        if ``noise_frac`` is not within [0, 1]
    """
    if not 0 <= noise_frac <= 1:
        raise ValueError(f"noise_frac must be within [0, 1], got {noise_frac!r}")

    y_noisy = y.copy()
    nr_swapping_pairs = int(len(y_noisy) * noise_frac / 2)
    if nr_swapping_pairs == 0:
        # Nothing to swap: still hand back a copy and an index *array* so that the
        # return types do not depend on the amount of noise requested.
        return y_noisy, np.array([], dtype=int)

    if random_state is None:
        choose = np.random.choice
    elif isinstance(random_state, (int, np.integer)):
        choose = np.random.RandomState(random_state).choice
    else:
        choose = random_state.choice

    # Sampling without replacement guarantees that every index occurs in at most one pair.
    possible_idx = np.arange(len(y_noisy))
    swapping_pairs = choose(possible_idx, (nr_swapping_pairs, 2), replace=False)
    first_half = swapping_pairs[:, 0]
    second_half = swapping_pairs[:, 1]

    # Read from the untouched input so the swap does not depend on assignment order.
    y_noisy[first_half] = y[second_half]
    y_noisy[second_half] = y[first_half]

    noisy_indices = swapping_pairs.flatten()
    return y_noisy, noisy_indices


In [5]:
# Add noise by randomly swapping the y values for 30% of all training instances.
# The helper draws from numpy's global random state, so seed it first to make the
# corrupted targets (and every score computed from them) reproducible.
np.random.seed(42)
y_train_noisy, noisy_idx = add_random_noise_arnaiz(y_train, noise_frac=0.3)

In [6]:
# Baseline: R2 on the test set of a linear model fitted on the clean (un-swapped) training targets
model_clean = LinearRegression()
model_clean.fit(X_train, y_train)
y_pred_clean = model_clean.predict(X_test)
r2_score(y_test, y_pred_clean)

0.9972212002457562

In [7]:
# R2 of the model trained on the noisy data set.
# Stored under its own name so the model fitted on the clean targets is not overwritten
# by a model that was trained on swapped targets.
model_noisy = LinearRegression().fit(X_train, y_train_noisy)
r2_score(y_test, model_noisy.predict(X_test))

0.9169246457903473

In [13]:
# R2 of the model trained on the noisy data set, restricted to the instances selected by the RegENN algorithm
reg_enn = RegEnnSelector(alpha=1, nr_of_neighbors=3)
labels = reg_enn.fit_predict(X_train, y_train_noisy)
# Convert the selector output into a mask usable for row indexing
# (presumably True marks instances to keep -- confirm against kondo_ml.utils)
boolean_labels = transform_selector_output_into_mask(labels)
X_train_selected = X_train[boolean_labels]
y_train_selected = y_train_noisy[boolean_labels]
model_selector = LinearRegression().fit(X_train_selected, y_train_selected)
r2_score(y_test, model_selector.predict(X_test))

0.9926253257413306