# CompIntel-Lab6: Active Learning: Simulated-user Active Regression with pool-based sampling for numerical features

Based on:

* https://scikit-learn.org/stable/_downloads/21b82d82985712b5de6347f382c77c86/plot_partial_dependence.ipynb
* https://modal-python.readthedocs.io/en/latest/content/examples/active_regression.html
* https://modal-python.readthedocs.io/en/latest/content/examples/ensemble_regression.html
* https://modal-python.readthedocs.io/en/latest/content/examples/query_by_committee.html
* https://modal-python.readthedocs.io/en/latest/content/examples/bootstrapping_and_bagging.html

In [None]:
%matplotlib inline

In [None]:
!pip install -U scikit-learn==1.0.2

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Load the California housing data: features as a DataFrame with named columns, target as an array.
cal_housing = fetch_california_housing()
X = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
y = cal_housing.target

#y -= y.mean()

# Hold out 10% of the rows for testing; the fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [None]:
X

In [None]:
X_train

In [None]:
y_train

In [None]:
from time import time
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.neural_network import MLPRegressor

# Baseline model: quantile-transformed features fed into a small two-layer MLP.
# The fit is timed and the R2 score on the held-out split is reported.
print("Training MLPRegressor...")
tic = time()
est = make_pipeline(
    QuantileTransformer(),
    MLPRegressor(
        hidden_layer_sizes=(30, 15),
        learning_rate_init=0.01,
        early_stopping=True,
        random_state=0,
    ),
).fit(X_train, y_train)  # Pipeline.fit returns the fitted pipeline itself
elapsed = time() - tic
print(f"done in {elapsed:.3f}s")
test_r2 = est.score(X_test, y_test)
print(f"Test R2 score: {test_r2:.2f}")

In [None]:
import numpy as np

In [None]:
# Print two batches of ten random digits; no seed is set in this cell.
print([np.random.randint(10) for _ in range(10)])
print([np.random.randint(10) for _ in range(10)])
# Szwabe: What is it for?
# 148141: Checking that the randomness works

# One-dimensional active regression toy example

In this example, we demonstrate how the ActiveLearner can be used for active regression with Gaussian processes. Because a Gaussian process quantifies the uncertainty of its predictions through the covariance function of the process, it can be used in an active learning setting.

In [None]:
#!pip install modAL
!pip install modAL-python

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF
from modAL.models import ActiveLearner

%matplotlib inline

In [None]:
# Fix the global NumPy seed so that the synthetic pool, and the np.random draws made by the
# cells that run after this one, are the same on every Restart & Run All.
np.random.seed(0)
# 200 distinct inputs in [0, 20] with noisy sine targets (Gaussian noise, standard deviation 0.3).
X = np.random.choice(np.linspace(0, 20, 10000), size=200, replace=False).reshape(-1, 1)
y = np.sin(X) + np.random.normal(scale=0.3, size=X.shape)

In [None]:
# Scatter plot of the whole synthetic pool.
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
with plt.style.context(white_style):
    plt.figure(figsize=(10, 5))
    plt.scatter(X, y, c='k', s=20)
    plt.title('sin(x) + noise')
    plt.show()

In [None]:
def GP_regression_std(regressor, X):
    """Query strategy: select the pool sample with the largest predictive std.

    Parameters
    ----------
    regressor : object whose ``predict(X, return_std=True)`` returns ``(mean, std)``.
    X : indexable pool of candidate samples.

    Returns
    -------
    tuple
        ``(index, X[index])`` for the sample at which the predicted std is maximal.
    """
    predictive_std = regressor.predict(X, return_std=True)[1]
    most_uncertain = np.argmax(predictive_std)
    return most_uncertain, X[most_uncertain]

In [None]:
# Seed the learner with a few randomly drawn labelled points from the pool.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# Sum of an RBF kernel and a white-noise kernel; both hyperparameters have tunable bounds.
kernel = (
    RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
)

regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Dense evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Initial model: predictive mean, a +/- one std band, and the full pool for reference.
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
with plt.style.context(white_style):
    plt.figure(figsize=(10, 5))
    plt.plot(X_grid, y_pred)
    plt.fill_between(X_grid, y_pred - y_std, y_pred + y_std, alpha=0.2)
    plt.scatter(X, y, c='k', s=20)
    plt.title('Initial prediction')
    plt.show()

In [None]:
# Fixed query budget: ask the learner which pool point to label, then teach it that label.
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(
        np.reshape(X[query_idx], (1, -1)),
        np.reshape(y[query_idx], (1, -1)),
    )

In [None]:
# Flattened predictive mean/std on the grid after the queries.
y_pred_final, y_std_final = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Model after the queries: predictive mean, a +/- one std band, and the full pool.
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
with plt.style.context(white_style):
    plt.figure(figsize=(10, 8))
    plt.plot(X_grid, y_pred_final)
    plt.fill_between(X_grid, y_pred_final - y_std_final, y_pred_final + y_std_final, alpha=0.2)
    plt.scatter(X, y, c='k', s=20)
    plt.title('Prediction after active learning')
    plt.show()

### Despite variance underestimation (light blue area), GPR with a 'standard' kernel parameter space enables both accurate expected-value prediction (dark blue curve) and sufficient exploration:

In [None]:
# Start again from a fresh random set of initial labelled points.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# Same kernel as in the first experiment: RBF plus white noise, both with tunable bounds.
kernel = (
    RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
)

regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the freshly initialised learner's flattened mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots. Each iteration queries the pool point chosen by
# the query strategy, teaches its label to the learner and, from the second iteration on,
# plots the predictions saved at the end of the previous iteration.
n_queries = 11
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

In [None]:
regressor.estimator.get_params()

# The impact of the Gaussian process regression kernel function parameters optimization on the variance prediction quality

## GPR kernel parameters are not fixed - their ranges are:

"kernel : kernel instance, default=None

The kernel specifying the covariance function of the GP. If None is passed, the kernel ConstantKernel(1.0, constant_value_bounds="fixed") * RBF(1.0, length_scale_bounds="fixed") is used as default. Note that the kernel hyperparameters are optimized during fitting unless the bounds are marked as “fixed”.
alpha : float or ndarray of shape (n_samples,), default=1e-10

Value added to the diagonal of the kernel matrix during fitting. This can prevent a potential numerical issue during fitting, by ensuring that the calculated values form a positive definite matrix. It can also be interpreted as the variance of additional Gaussian measurement noise on the training observations. Note that this is different from using a WhiteKernel. If an array is passed, it must have the same number of entries as the data used for fitting and is used as datapoint-dependent noise level. Allowing to specify the noise level directly as a parameter is mainly for convenience and for consistency with Ridge.
optimizer : “fmin_l_bfgs_b” or callable, default=”fmin_l_bfgs_b”

Can either be one of the internally supported optimizers for optimizing the kernel’s parameters, specified by a string, or an externally defined optimizer passed as a callable. If a callable is passed, it must have the signature:

    def optimizer(obj_func, initial_theta, bounds):
        # * 'obj_func': the objective function to be minimized, which
        #   takes the hyperparameters theta as a parameter and an
        #   optional flag eval_gradient, which determines if the
        #   gradient is returned additionally to the function value
        # * 'initial_theta': the initial value for theta, which can be
        #   used by local optimizers
        # * 'bounds': the bounds on the values of theta
        ....
        # Returned are the best found hyperparameters theta and
        # the corresponding value of the target function.
        return theta_opt, func_min

Per default, the L-BFGS-B algorithm from scipy.optimize.minimize is used. If None is passed, the kernel’s parameters are kept fixed. Available internal optimizers are: {'fmin_l_bfgs_b'}.
n_restarts_optimizer : int, default=0

The number of restarts of the optimizer for finding the kernel’s parameters which maximize the log-marginal likelihood. The first run of the optimizer is performed from the kernel’s initial parameters, the remaining ones (if any) from thetas sampled log-uniform randomly from the space of allowed theta-values. If greater than 0, all bounds must be finite. Note that n_restarts_optimizer == 0 implies that one run is performed."

[https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html]

### Szwabe: What may be the impact of setting the GPR kernel parameters to fixed values (i.e. turning off the default optimization function) on the AR performance? Does variance overestimation inevitably lead to poor exploration?

# 148141: Wariancja jest przeestymowana. Eksploracja działa jak random sampling, ale nie oznacza to że działa słabo.

In [None]:
# Both kernel hyperparameters are marked "fixed", so fitting does not optimize them:
# length_scale stays at 1.0 and noise_level stays at 1.
kernel = RBF(length_scale=1.0,length_scale_bounds="fixed") + WhiteKernel(noise_level=1, noise_level_bounds="fixed")

In [None]:
# Fresh random set of initial labelled points for this experiment.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# The learner uses the `kernel` defined in the preceding cell.
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots. Each iteration queries the pool point chosen by
# the query strategy, teaches its label to the learner and, from the second iteration on,
# plots the predictions saved at the end of the previous iteration.
n_queries = 11
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

###Szwabe: Can a radical (10 orders of magnitude) increase of the lower range of the homoskedastic ('white') factor of the GPR kernel make both mean and variance predictions more robust? Is the improvement possible for very few training samples?

### 148141: 

In [None]:
# Same RBF + WhiteKernel sum as in the first experiment, but the lower bound of the
# white-noise level is raised from 1e-10 to 1e-1.
kernel = RBF(length_scale=1.0,length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=1, noise_level_bounds=(1e-1, 1e+1))

In [None]:
# Fresh random set of initial labelled points for this experiment.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# The learner uses the `kernel` defined in the preceding cell.
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots. Each iteration queries the pool point chosen by
# the query strategy, teaches its label to the learner and, from the second iteration on,
# plots the predictions saved at the end of the previous iteration.
n_queries = 11
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

## Szwabe: What if we knew the optimal kernel parameters... Wait! In this particular toy example we actually know them! Let's fix just the most important one. Can it improve the robustness of the predictions for a very low number of training samples? Is the exploration capability compromised by fixing the parameter of the homoskedastic 'ingredient' of the GPR kernel?  

# 148141: Tak pomogło otrzymać dobre rezultaty znając niewiele próbek

In [None]:
# The synthetic targets were generated with Gaussian noise of standard deviation 0.3
# (np.random.normal(scale=0.3)), and WhiteKernel's noise_level is a variance, so the
# known noise level is 0.3**2 rather than 0.3. It is kept fixed; the RBF length scale is
# still optimized within its bounds.
kernel = RBF(length_scale=1.0,length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=0.3**2, noise_level_bounds="fixed")

In [None]:
# Fresh random set of initial labelled points for this experiment.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# The learner uses the `kernel` defined in the preceding cell.
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots (longer budget of 21 queries). Each iteration
# queries the pool point chosen by the query strategy, teaches its label to the learner and,
# from the second iteration on, plots the predictions saved at the end of the previous iteration.
n_queries = 21
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

In [None]:
regressor.estimator.get_params()

In [None]:
np.pi

## Szwabe: What if we fix the kernel length scale as well? To what value?
# 148141: To rozwiązanie również pomaga uzyskać dobre rezultaty po poznaniu niewielu punktów. Length scale = 1.0

In [None]:
# Noise-free sin(x) evaluated at the pool inputs.
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
with plt.style.context(white_style):
    plt.figure(figsize=(10, 5))
    plt.scatter(X, np.sin(X), c='k', s=20)
    plt.title('sin(x)')
    plt.show()

In [None]:
import scipy.stats as stats
import math

In [None]:
# Fit a GP with a pure RBF kernel on the single training point (pi/2, 1.0).
# NOTE(review): no length_scale_bounds are passed here, so the bounds are not marked "fixed"
# and fitting may adjust length_scale away from temp_length_scale — confirm this is intended.
#temp_length_scale = math.sqrt(np.pi/2)
temp_length_scale = 1.0
temp_kernel = RBF(length_scale=temp_length_scale)
temp_estimator = GaussianProcessRegressor(kernel=temp_kernel)
temp_estimator.fit([[np.pi/2]],[1.0])

In [None]:
# Compare one period of sin(x) with a rescaled Gaussian bump centred at pi/2 and with the
# rescaled prediction of the single-point GP fitted above.
X_aux = np.linspace(-np.pi/2, np.pi*1.5, 100)
mu = np.pi/2
variance = 1
sigma = math.sqrt(variance)
sigma2 = sigma
# Gaussian pdf divided by its value at the mean (peak becomes 1), then mapped to [-1, 1].
scaled_gaussian = 2*stats.norm.pdf(X_aux, mu, sigma2)/stats.norm.pdf(mu, mu, sigma2)-1.0
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
with plt.style.context(white_style):
    plt.figure(figsize=(10, 5))
    plt.plot(X_aux, np.sin(X_aux), label="sin(x)")
    # The label reports the variance actually used (sigma2**2); the previous expression
    # str(sigma2*2)+"1)" rendered it as "2.01".
    plt.plot(X_aux, scaled_gaussian, label="scaled N(pi/2,"+str(sigma2**2)+")(x)-1.0")
    # NOTE(review): the 0.01 offset is not part of the label; presumably it only keeps this
    # curve visually distinguishable from the Gaussian one — confirm.
    plt.plot(X_aux, 0.01 + 2*temp_estimator.predict(X_aux.reshape(-1, 1))-1.0, label="2*RBF(length_scale="+str(temp_length_scale)+")-1.0")
    plt.legend()
    plt.show()

In [None]:
temp_estimator.get_params()

In [None]:
# RBF length scale fixed at 1.0; the WhiteKernel is created with its default arguments.
#kernel = RBF(length_scale=np.pi,length_scale_bounds="fixed") + WhiteKernel(noise_level=0.3, noise_level_bounds="fixed")
kernel = RBF(length_scale=1.0,length_scale_bounds="fixed") + WhiteKernel()

In [None]:
# Fresh random set of initial labelled points for this experiment.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# The learner uses the `kernel` defined in the preceding cell.
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots. Each iteration queries the pool point chosen by
# the query strategy, teaches its label to the learner and, from the second iteration on,
# plots the predictions saved at the end of the previous iteration.
n_queries = 11
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

In [None]:
regressor.estimator.get_params()

In [None]:
from sklearn.gaussian_process.kernels import ExpSineSquared

# Product of a fixed-length-scale RBF and a periodic ExpSineSquared kernel.
# NOTE(review): the toy data follow sin(x), whose period is 2*pi, while periodicity is
# initialised to 1.0 and length_scale to pi — confirm these starting values are intended.
seasonal_kernel = (RBF(length_scale=1.0, length_scale_bounds="fixed") * ExpSineSquared(length_scale=np.pi, periodicity=1.0))

In [None]:
# Use the seasonal kernel from the preceding cell for the next experiment.
#kernel = RBF(length_scale=np.pi,length_scale_bounds="fixed") + WhiteKernel(noise_level=0.3, noise_level_bounds="fixed")
#kernel = ExpSineSquared(length_scale=1.0, periodicity=np.pi/2, periodicity_bounds="fixed")
kernel = seasonal_kernel

In [None]:
# Fresh random set of initial labelled points for this experiment.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# The learner uses the `kernel` selected in the preceding cell.
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots. Each iteration queries the pool point chosen by
# the query strategy, teaches its label to the learner and, from the second iteration on,
# plots the predictions saved at the end of the previous iteration.
n_queries = 11
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

In [None]:
regressor.estimator.get_params()

# Szwabe: Can using a tree-based variance-aware regressor (computationally much less complex) instead of GPR help? What is the difference between the default configurations of ET regressors for the generic use of Scikit-Learn and for the case of using ETR for BO? Why? Why are the exploration capabilities of AR based on ETR of BO-like parameters limited?

In [None]:
!pip install scikit-optimize

In [None]:
!pip install scikit-learn

In [None]:
import sklearn

In [None]:
# A bare `import sklearn` does not guarantee that the `sklearn.ensemble` submodule is loaded;
# import it explicitly so this cell does not depend on another package having imported it.
import sklearn.ensemble

# Despite the class-like name, this is an *instance* of scikit-learn's own ExtraTreesRegressor,
# created with default arguments so that its parameters can be inspected.
SklearnExtraTreesRegressor = sklearn.ensemble.ExtraTreesRegressor()
SklearnExtraTreesRegressor.get_params()

In [None]:
from skopt.learning import ExtraTreesRegressor

In [None]:
TreeBasedRegressor = ExtraTreesRegressor()

In [None]:
TreeBasedRegressor.get_params()

In [None]:
# Small ensemble: 10 trees, grown with min_samples_split=2 and min_samples_leaf=1.
TreeBasedRegressor = ExtraTreesRegressor(n_estimators=10, min_samples_split=2, min_samples_leaf=1)

In [None]:
#!pip install -U scikit-learn==1.0.2

In [None]:
# Fresh random set of initial labelled points for this experiment.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# GP_regression_std only needs predict(..., return_std=True), so the same query strategy
# is reused with the tree-based estimator.
regressor = ActiveLearner(
    estimator=TreeBasedRegressor,
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots. Each iteration queries the pool point chosen by
# the query strategy, teaches its label to the learner and, from the second iteration on,
# plots the predictions saved at the end of the previous iteration.
n_queries = 11
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

## Szwabe: The impact of the tree-based regressor... hyperparameters (!) (kind-of chicken and egg or infinite number of optimizers of other optimizers)
### What makes the AR exploration problem different from the analogous BO exploration problem?
### Poor exploration capabilities of AR based on ETR with a 'generic' hyperparameter configuration:


In [None]:
# Larger ensemble: 100 trees instead of 10; the split/leaf settings are unchanged.
TreeBasedRegressor = ExtraTreesRegressor(n_estimators=100, min_samples_split=2, min_samples_leaf=1)

In [None]:
# Fresh random set of initial labelled points for this experiment.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# The learner wraps the `TreeBasedRegressor` configured in the preceding cell.
regressor = ActiveLearner(
    estimator=TreeBasedRegressor,
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots. Each iteration queries the pool point chosen by
# the query strategy, teaches its label to the learner and, from the second iteration on,
# plots the predictions saved at the end of the previous iteration.
n_queries = 11
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

### Can changing values of ETR hyperparameters other than the most important one help?

In [None]:
# Back to 10 trees; min_samples_split is written as 2*2 (i.e. 4) and min_samples_leaf as 1*1 (i.e. 1).
TreeBasedRegressor = ExtraTreesRegressor(n_estimators=10, min_samples_split=2*2, min_samples_leaf=1*1)

In [None]:
# Fresh random set of initial labelled points for this experiment.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# The learner wraps the `TreeBasedRegressor` configured in the preceding cell.
regressor = ActiveLearner(
    estimator=TreeBasedRegressor,
    query_strategy=GP_regression_std,
    X_training=X_training.reshape(-1, 1),
    y_training=y_training.reshape(-1, 1),
)

In [None]:
# Evaluation grid over [0, 20] and the learner's flattened predictive mean/std on it.
X_grid = np.linspace(0, 20, 1000)
y_pred, y_std = (
    part.ravel() for part in regressor.predict(X_grid.reshape(-1, 1), return_std=True)
)

In [None]:
# Active-learning loop with per-step plots. Each iteration queries the pool point chosen by
# the query strategy, teaches its label to the learner and, from the second iteration on,
# plots the predictions saved at the end of the previous iteration.
n_queries = 11
# matplotlib 3.6 renamed 'seaborn-white' to 'seaborn-v0_8-white' and later dropped the old
# name, so use whichever of the two this installation provides.
white_style = 'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))
    y_pred_temp, y_std_temp = regressor.predict(X_grid.reshape(-1, 1), return_std=True)
    y_pred_temp, y_std_temp = y_pred_temp.ravel(), y_std_temp.ravel()
    # Only the mean is used at the pool points, so the std is not requested here.
    y_pred_temp_aux = regressor.predict(X.reshape(-1, 1)).ravel()
    temp_X_train = regressor.X_training
    temp_y_train = regressor.y_training
    if idx > 0:
        with plt.style.context(white_style):
            plt.figure(figsize=(10, 8))
            # Curve and band: the model as it was before this iteration's teach().
            plt.plot(X_grid_old, y_pred_temp_old)
            plt.fill_between(X_grid_old, y_pred_temp_old - y_std_temp_old, y_pred_temp_old + y_std_temp_old, alpha=0.2)
            plt.scatter(X, y, c='k', s=20)
            # Green crosses: current training set, including the point labelled in this iteration.
            plt.scatter(temp_X_train, temp_y_train, c='green', s=100, marker="X")
            # Orange star: the point just queried, drawn at the previous model's prediction for it.
            plt.scatter(X[query_idx], y_pred_temp_aux_old[query_idx], c='orange', s=200, marker="*")
            plt.title('Prediction after active learning' + " - step " + str(idx))
            plt.show()
    # Keep this iteration's results for the next iteration's plot.
    y_pred_temp_old = y_pred_temp
    y_std_temp_old = y_std_temp
    X_grid_old = X_grid
    query_idx_old = query_idx
    y_pred_temp_aux_old = y_pred_temp_aux

In [None]:
# From here on X and y no longer hold the 1-D toy data: they are rebound to the
# California housing features (as a plain array) and targets.
X = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names).values
y = cal_housing.target

In [None]:
# Random initial labelled points drawn from the housing pool.
n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
X_training = X[initial_idx]
y_training = y[initial_idx]

# RBF plus white-noise kernel with tunable bounds, as in the first toy experiment.
kernel = (
    RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
    + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e+1))
)

#The not-so-correct methodology of AR evaluation presented in ModAL

Based on https://modal-python.readthedocs.io/en/latest/content/examples/pool-based_sampling.html

In [None]:
learner = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training, y_training=y_training.reshape(-1, 1)
)

Mind the score method in https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html#sklearn.gaussian_process.GaussianProcessRegressor.score

In [None]:
X_raw = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
y_raw = cal_housing.target

In [None]:
X_raw

In [None]:
y_raw

In [None]:
X = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names).values
y = cal_housing.target

In [None]:
initial_idx

In [None]:
X_training.shape

In [None]:
X.shape

In [None]:
X_training.reshape(-1, 1).shape

In [None]:
regressor = ActiveLearner(
    estimator=GaussianProcessRegressor(kernel=kernel),
    query_strategy=GP_regression_std,
    X_training=X_training, y_training=y_training.reshape(-1, 1)
)

In [None]:
# Short active-learning session: on every pass the query strategy picks one
# sample from X and the learner is taught that sample with its target.
n_queries = 10
for idx in range(n_queries):
    query_idx, query_instance = regressor.query(X)
    # teach() is given a single-row batch, hence the reshape(1, -1).
    regressor.teach(X[query_idx].reshape(1, -1), y[query_idx].reshape(1, -1))

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# modAL-based query loop: on every pass a fresh ActiveLearner is built from
# the labelled samples collected so far and asked for one more sample.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
# NOTE(review): the random draw above is overwritten immediately by this
# fixed index list, so every run starts from the same five samples.
initial_idx = np.array([18234, 3571, 18557, 13201, 12983])

# Placeholders for per-iteration scores; nothing in this cell fills them.
MSE_scores_of_GaussianProcessRegressor = []
MAE_scores_of_GaussianProcessRegressor = []

# Indices of the samples whose targets are treated as known.
known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

X_train = X[known_idx]
y_train = y[known_idx]


# Every other row index. NOTE(review): this set, X_query and y_query are
# computed once here and never updated inside the loop below.
unknown_idx = set(range(len(X))).difference(set(known_idx))
print("unknown_idx: ", unknown_idx)

X_query = X[list(unknown_idx)]
y_query = y[list(unknown_idx)]

n_queries = 100
for idx in range(n_queries):
    # Rebuild the labelled set and a brand-new learner from known_idx.
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = ActiveLearner(estimator=GaussianProcessRegressor(kernel=kernel), query_strategy=GP_regression_std, X_training=X_train, y_training=y_train.reshape(-1, 1))
    # The query runs over the whole of X, so rows already in known_idx are
    # candidates too. NOTE(review): presumably deliberate for this section,
    # whose heading calls the methodology not-so-correct; confirm.
    query_idx, query_instance = regressor.query(X)
    # Whatever index object GP_regression_std returns is appended as-is.
    known_idx.append(query_idx)

    temp_shape = X_train.shape
    print("X_train_shape: ", temp_shape)
    # X_query is not modified in the loop, so this shape is the same on every pass.
    temp_shape = X_query.shape
    print("X_query_shape: ", temp_shape)
    print("known_idx: ", known_idx)

###The order of samples in the raw/original dataset should be random, so one should be suspicious about any order of samples selected by the query strategy:



In [None]:
plt.plot(known_idx[20:])

In [None]:
8510 in unknown_idx

In [None]:
X.shape

In [None]:
X_query.shape

In [None]:
#np.arange(len(X)) not in np.array(known_idx)

In [None]:
set(known_idx)

In [None]:
regressor.X_training.shape

In [None]:
known_idx

In [None]:
regressor.estimator

#Sanity check based on Scikit-Optimize - with additional randomization of the order of samples in the dataset ('just in case'). One should remain suspicious about any order of samples selected by the query strategy:

In [None]:
# Hand-written pool-based active regression (no modAL): on every pass a fresh
# GaussianProcessRegressor is fitted on the labelled samples, and the pool
# sample with the largest predicted standard deviation is labelled next.

n_initial = 5
# Fixed initial samples: the same five indices as the modAL-based cell above.
initial_idx = np.array([18234, 3571, 18557, 13201, 12983])

# Placeholders for per-iteration scores; this cell does not fill them.
MSE_scores_of_GaussianProcessRegressor = []
MAE_scores_of_GaussianProcessRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = GaussianProcessRegressor(kernel=kernel)
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate and could be selected again.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
cal_housing = fetch_california_housing()

In [None]:
cal_housing.data

In [None]:
pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)

In [None]:
# Single frame holding the feature columns followed by a "target" column.
features_df = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)
target_df = pd.DataFrame(cal_housing.target, columns=["target"])
cal_housing_df = pd.concat([features_df, target_df], axis=1)
cal_housing_df

In [None]:
print(cal_housing.DESCR)

### Quick exploratory data analysis (EDA) of the dataset:

In [None]:
import seaborn as sns

In [None]:
#sns.pairplot(cal_housing_df, hue='target', palette="tab10")
sns.pairplot(cal_housing_df)

In [None]:
#cal_housing_df = pd.DataFrame(cal_housing.data, columns=cal_housing.feature_names)

In [None]:
# Shuffle the rows and renumber them. DataFrame.sample returns a new frame,
# so the result has to be bound back to the name; otherwise the cells that
# read cal_housing_df afterwards still see the original row order.
cal_housing_df = cal_housing_df.sample(frac=1).reset_index(drop=True)
cal_housing_df

In [None]:
# Split the combined frame back into a feature matrix and a target vector.
X = cal_housing_df[cal_housing.feature_names].to_numpy()
y = cal_housing_df["target"].to_numpy()

In [None]:
#X, _X, y, _y = train_test_split(X, y, test_size=0.5)

In [None]:
# modAL-based query loop, started from randomly drawn samples: on every pass
# a fresh ActiveLearner is built from the labelled samples collected so far
# and asked for one more sample.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)
# Fixed alternative used by an earlier cell:
#initial_idx = np.array([18234, 3571, 18557, 13201, 12983])

# Placeholders for per-iteration scores; nothing in this cell fills them.
MSE_scores_of_GaussianProcessRegressor = []
MAE_scores_of_GaussianProcessRegressor = []

# Indices of the samples whose targets are treated as known.
known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

X_train = X[known_idx]
y_train = y[known_idx]


# Every other row index. NOTE(review): this set, X_query and y_query are
# computed once here and never updated inside the loop below.
unknown_idx = set(range(len(X))).difference(set(known_idx))
print("unknown_idx: ", unknown_idx)

X_query = X[list(unknown_idx)]
y_query = y[list(unknown_idx)]

n_queries = 100
for idx in range(n_queries):
    # Rebuild the labelled set and a brand-new learner from known_idx.
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = ActiveLearner(estimator=GaussianProcessRegressor(kernel=kernel), query_strategy=GP_regression_std, X_training=X_train, y_training=y_train.reshape(-1, 1))
    # The query runs over the whole of X, so rows already in known_idx are
    # candidates too. NOTE(review): confirm whether that is intended here.
    query_idx, query_instance = regressor.query(X)
    # Whatever index object GP_regression_std returns is appended as-is.
    known_idx.append(query_idx)

    temp_shape = X_train.shape
    print("X_train_shape: ", temp_shape)
    # X_query is not modified in the loop, so this shape is the same on every pass.
    temp_shape = X_query.shape
    print("X_query_shape: ", temp_shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
8510 in unknown_idx

In [None]:
X.shape

In [None]:
X_query.shape

In [None]:
#np.arange(len(X)) not in np.array(known_idx)

In [None]:
set(known_idx)

In [None]:
regressor.X_training.shape

In [None]:
known_idx

In [None]:
regressor.estimator

#Sanity check based on Scikit-Optimize

In [None]:
# Hand-written pool-based active regression (no modAL): on every pass a fresh
# GaussianProcessRegressor is fitted on the labelled samples, and the pool
# sample with the largest predicted standard deviation is labelled next.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# Placeholders for per-iteration scores; this cell does not fill them.
MSE_scores_of_GaussianProcessRegressor = []
MAE_scores_of_GaussianProcessRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = GaussianProcessRegressor(kernel=kernel)
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate and could be selected again.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
plt.plot(y_pred_STD)

In [None]:
kernel = RBF(length_scale=1.0,) + WhiteKernel(noise_level=1, noise_level_bounds=(1e-2, 1e+1))

In [None]:
# Same hand-written max-std query loop, re-run with the kernel defined in the
# preceding cell: a fresh GaussianProcessRegressor is fitted on every pass and
# the pool sample with the largest predicted standard deviation is labelled.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# Placeholders for per-iteration scores; this cell does not fill them.
MSE_scores_of_GaussianProcessRegressor = []
MAE_scores_of_GaussianProcessRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = GaussianProcessRegressor(kernel=kernel)
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate and could be selected again.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
plt.plot(y_pred_STD)

In [None]:
kernel = RBF(length_scale=1.0,) + WhiteKernel(noise_level=1, noise_level_bounds=(10, 100))

In [None]:
# Same hand-written max-std query loop, re-run with the kernel defined in the
# preceding cell: a fresh GaussianProcessRegressor is fitted on every pass and
# the pool sample with the largest predicted standard deviation is labelled.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# Placeholders for per-iteration scores; this cell does not fill them.
MSE_scores_of_GaussianProcessRegressor = []
MAE_scores_of_GaussianProcessRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = GaussianProcessRegressor(kernel=kernel)
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate and could be selected again.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
plt.plot(y_pred_STD)

#Szwabe: Are the GPR kernel parameters really well "optimized"?

"kernel : kernel instance, default=None

The kernel specifying the covariance function of the GP.
        
If None is passed, the kernel ConstantKernel(1.0, constant_value_bounds="fixed") * RBF(1.0, length_scale_bounds="fixed") is used as default.
        
**Note that the kernel hyperparameters are optimized during fitting unless the bounds are marked as “fixed”.**"

[https://scikit-learn.org/stable/modules/generated/sklearn.gaussian_process.GaussianProcessRegressor.html]

In [None]:
kernel = RBF(length_scale=1.0,length_scale_bounds="fixed") + WhiteKernel(noise_level=1, noise_level_bounds="fixed")

In [None]:
# Same hand-written max-std query loop, re-run with the kernel defined in the
# preceding cell (both hyperparameter bounds set to "fixed"): a fresh
# GaussianProcessRegressor is fitted on every pass and the pool sample with
# the largest predicted standard deviation is labelled next.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# Placeholders for per-iteration scores; this cell does not fill them.
MSE_scores_of_GaussianProcessRegressor = []
MAE_scores_of_GaussianProcessRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = GaussianProcessRegressor(kernel=kernel)
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate and could be selected again.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
plt.plot(y_pred_STD)

#Szwabe: What do the above-seen results indicate?

#Szwabe: Can using an extra-trees regressor (instead of GPR) help to diversify target variable predictions (i.e. to actually enable 'real AR')? To what extent?

In [None]:
!pip install scikit-optimize

In [None]:
from skopt.learning import ExtraTreesRegressor

In [None]:
TreeBasedRegressor = ExtraTreesRegressor

In [None]:
# Hand-written pool-based active regression with an extra-trees regressor in
# place of the GP: on every pass a new TreeBasedRegressor is fitted on the
# labelled samples, and the pool sample with the largest predicted standard
# deviation is labelled next.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# Placeholders for per-iteration scores; this cell does not fill them.
MSE_scores_of_TreeBasedRegressor = []
MAE_scores_of_TreeBasedRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    # TreeBasedRegressor is called here, i.e. it is expected to be a class
    # (the preceding cell binds it to ExtraTreesRegressor).
    regressor = TreeBasedRegressor()
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate and could be selected again.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
plt.plot(y_pred_STD)

#Szwabe: Can ETR hyperparameter tuning help to deal with the problem of some predictions being equal to zero?

In [None]:
TreeBasedRegressor = ExtraTreesRegressor(n_estimators=10, min_samples_split=2, min_samples_leaf=1)

In [None]:
# Hand-written pool-based active regression with the extra-trees regressor
# configured in the preceding cell: on every pass it is fitted on the labelled
# samples, and the pool sample with the largest predicted standard deviation
# is labelled next.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# Placeholders for per-iteration scores; this cell does not fill them.
MSE_scores_of_TreeBasedRegressor = []
MAE_scores_of_TreeBasedRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    # TreeBasedRegressor is used without being called, i.e. it is expected to
    # be an estimator instance (the preceding cell binds it to one). The same
    # object is therefore fitted again on every pass.
    regressor = TreeBasedRegressor
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate and could be selected again.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
plt.plot(y_pred_STD)

#Szwabe: Which ETR hyperparameter requires especially careful tuning? Why?

In [None]:
#TreeBasedRegressor = ExtraTreesRegressor(n_estimators=20, min_samples_split=2*2, min_samples_leaf=1)

In [None]:
# Hand-written pool-based active regression with an explicitly configured
# extra-trees regressor: on every pass a new one is fitted on the labelled
# samples, and the pool sample with the largest predicted standard deviation
# is labelled next.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# Placeholders for per-iteration scores; this cell does not fill them.
MSE_scores_of_TreeBasedRegressor = []
MAE_scores_of_TreeBasedRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = ExtraTreesRegressor(n_estimators=20, min_samples_split=2*2, min_samples_leaf=1)
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate and could be selected again.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)
    print("known_idx: ", known_idx)

In [None]:
plt.plot(known_idx[20:])

In [None]:
plt.plot(y_pred_STD)

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

#Szwabe: Can a piggybacked gradient boosting regressor achieve higher performance? Are these results deterministic?

In [None]:
# Pool-based active regression with a "piggybacked" second model.
# The extra-trees regressor drives the queries (largest predicted standard
# deviation in the pool). The gradient boosting regressor is trained on the
# same labelled samples and only scored. Both are scored on the current pool
# on every pass.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# One score per pass for the querying model ...
MSE_scores_of_TreeBasedRegressor = []
MAE_scores_of_TreeBasedRegressor = []

# ... and for the piggybacked model.
MSE_scores_of_piggybackedRegressor = []
MAE_scores_of_piggybackedRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = ExtraTreesRegressor(n_estimators=20, min_samples_split=2*2, min_samples_leaf=1)
    piggybackedRegressor = GradientBoostingRegressor(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_depth=4)
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate for later queries and, once
    # trained on, would still be part of the pool the scores are computed on.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)

    # Score the querying model on this pass's pool.
    MSE_scores_of_TreeBasedRegressor.append(mean_squared_error(y_query, y_pred))
    MAE_scores_of_TreeBasedRegressor.append(mean_absolute_error(y_query, y_pred))

    # Fit the piggybacked model on the same labelled samples and score it on
    # the same pool.
    piggybackedRegressor.fit(X_training, y_training)
    temp_predictions = piggybackedRegressor.predict(X_query)
    MSE_scores_of_piggybackedRegressor.append(mean_squared_error(y_query, temp_predictions))
    MAE_scores_of_piggybackedRegressor.append(mean_absolute_error(y_query, temp_predictions))

    print("known_idx: ", known_idx)

In [None]:
#plt.plot(known_idx[20:])
plt.plot(known_idx)

In [None]:
plt.plot(y_pred_STD)

In [None]:
# MSE recorded on each pass of the query loop, one curve per model.
fig, ax = plt.subplots()
ax.plot(MSE_scores_of_TreeBasedRegressor, label="ARRegressor")
ax.plot(MSE_scores_of_piggybackedRegressor, label="piggybackedRegressor")
ax.set_xlabel("query iteration")
ax.set_ylabel("mean squared error")
ax.legend()
plt.show()

In [None]:
# MAE recorded on each pass of the query loop, one curve per model.
fig, ax = plt.subplots()
ax.plot(MAE_scores_of_TreeBasedRegressor, label="ARRegressor")
ax.plot(MAE_scores_of_piggybackedRegressor, label="piggybackedRegressor")
ax.set_xlabel("query iteration")
ax.set_ylabel("mean absolute error")
ax.legend()
plt.show()

In [None]:
# Second run of the pool-based active regression with a "piggybacked" model.
# The extra-trees regressor drives the queries (largest predicted standard
# deviation in the pool). The gradient boosting regressor is trained on the
# same labelled samples and only scored. Both are scored on the current pool
# on every pass.

n_initial = 5
initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False)

# One score per pass for the querying model ...
MSE_scores_of_TreeBasedRegressor = []
MAE_scores_of_TreeBasedRegressor = []

# ... and for the piggybacked model.
MSE_scores_of_piggybackedRegressor = []
MAE_scores_of_piggybackedRegressor = []

known_idx = initial_idx.tolist()
print("known_idx: ", known_idx)

# The pool: every row index that has not been labelled yet.
unknown_idx = set(range(len(X))).difference(known_idx)
# Only the size is printed; the pool holds almost every row of X.
print("unknown_idx size: ", len(unknown_idx))

n_queries = 100
for idx in range(n_queries):
    X_train = X[known_idx]
    y_train = y[known_idx]
    regressor = ExtraTreesRegressor(n_estimators=20, min_samples_split=2*2, min_samples_leaf=1)
    piggybackedRegressor = GradientBoostingRegressor(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_depth=4)
    X_training = X_train
    y_training = y_train.reshape(-1, 1)
    regressor.fit(X_training, y_training)

    # Materialise the pool once per pass, in ascending index order, so that a
    # position in the prediction arrays maps back to a row index of X.
    pool_idx = sorted(unknown_idx)
    X_query = X[pool_idx]
    y_query = y[pool_idx]
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    highest_STD_unknown_idx = np.argmax(y_pred_STD)
    query_idx = pool_idx[highest_STD_unknown_idx]

    # Move the queried sample from the pool to the labelled set. Without the
    # removal it would remain a candidate for later queries and, once
    # trained on, would still be part of the pool the scores are computed on.
    known_idx.append(query_idx)
    unknown_idx.remove(query_idx)

    print("X_train_shape: ", X_train.shape)
    print("X_query_shape: ", X_query.shape)

    # Score the querying model on this pass's pool.
    MSE_scores_of_TreeBasedRegressor.append(mean_squared_error(y_query, y_pred))
    MAE_scores_of_TreeBasedRegressor.append(mean_absolute_error(y_query, y_pred))

    # Fit the piggybacked model on the same labelled samples and score it on
    # the same pool.
    piggybackedRegressor.fit(X_training, y_training)
    temp_predictions = piggybackedRegressor.predict(X_query)
    MSE_scores_of_piggybackedRegressor.append(mean_squared_error(y_query, temp_predictions))
    MAE_scores_of_piggybackedRegressor.append(mean_absolute_error(y_query, temp_predictions))

    print("known_idx: ", known_idx)

In [None]:
# Dataset indices in the order they were added to known_idx
# (one index is appended per active-regression cycle).
#plt.plot(known_idx[20:])
plt.plot(known_idx)
plt.xlabel("position in known_idx")
plt.ylabel("dataset index")
plt.show()

In [None]:
# Predictive STD per query-pool sample, as left in y_pred_STD by the most
# recently executed regressor.predict(..., return_std=True) call.
plt.plot(y_pred_STD)
plt.xlabel("position in query pool")
plt.ylabel("predicted STD")
plt.show()

In [None]:
# MSE on the query pool, one value per active-regression cycle.
plt.plot(MSE_scores_of_TreeBasedRegressor, label="ARRegressor")
plt.plot(MSE_scores_of_piggybackedRegressor, label="piggybackedRegressor")
plt.xlabel("AR cycle")
plt.ylabel("MSE on query pool")
plt.legend()
plt.show()

In [None]:
# MAE on the query pool, one value per active-regression cycle.
plt.plot(MAE_scores_of_TreeBasedRegressor, label="ARRegressor")
plt.plot(MAE_scores_of_piggybackedRegressor, label="piggybackedRegressor")
plt.xlabel("AR cycle")
plt.ylabel("MAE on query pool")
plt.legend()
plt.show()

In [None]:
from random import sample

In [None]:
# Draw one row position of X_query uniformly at random and display it.
query_idx = sample(range(X_query.shape[0]), 1)[0]
query_idx

In [None]:
def _run_sampling_experiment(X, y, initial_idx, number_of_cycles, annealing_factor, always_random=False):
  """Run one pool-based sampling experiment and record per-cycle errors.

  In every cycle an ExtraTreesRegressor and a "piggybacked"
  GradientBoostingRegressor are fitted on the currently known samples and
  scored (MSE and MAE) on all samples still in the pool. One pool sample is
  then moved into the known set.

  Args:
    X: feature array indexed by row position.
    y: target array aligned with X.
    initial_idx: dataset indices that are known before the first cycle.
    number_of_cycles: number of query cycles to run.
    annealing_factor: controls the exploration schedule. In cycle i the value
      log(U) / (annealing_factor * i + 1), with U drawn from np.random.rand(),
      is rounded to the nearest integer; a random pool sample is picked when
      that integer is odd, otherwise the sample with the highest predictive
      STD is picked. A growing divisor pulls the value towards 0 (even), so
      random picks become rarer in later cycles.
    always_random: when True every query is random and annealing_factor is
      ignored (the random baseline).

  Returns:
    Four lists with one entry per executed cycle:
    (MSE of the ExtraTrees regressor, MSE of the piggybacked regressor,
     MAE of the ExtraTrees regressor, MAE of the piggybacked regressor).
    The lists are shorter than number_of_cycles only if the pool runs empty.
  """
  known_idx = list(initial_idx)
  # Boolean membership mask: the pool is always exactly the complement of the
  # known set, so queried samples never stay in the evaluation pool.
  is_known = np.zeros(len(X), dtype=bool)
  is_known[known_idx] = True

  mse_regressor, mae_regressor = [], []
  mse_piggybacked, mae_piggybacked = [], []

  for cycle in range(number_of_cycles):
    pool_idx = np.flatnonzero(~is_known)
    if pool_idx.size == 0:
      # Nothing left to evaluate on or to query.
      break

    X_training = X[known_idx]
    y_training = y[known_idx].reshape(-1, 1)
    X_query = X[pool_idx]
    y_query = y[pool_idx]

    regressor = ExtraTreesRegressor(n_estimators=20, min_samples_split=2*2, min_samples_leaf=1)
    regressor.fit(X_training, y_training)
    # NOTE(review): return_std is not accepted by scikit-learn's own
    # ExtraTreesRegressor; this relies on the class imported in an earlier cell.
    y_pred, y_pred_STD = regressor.predict(X_query, return_std=True)
    mse_regressor.append(mean_squared_error(y_query, y_pred))
    mae_regressor.append(mean_absolute_error(y_query, y_pred))

    # Trained on the same samples, but has no influence on the query choice.
    piggybackedRegressor = GradientBoostingRegressor(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_depth=4)
    piggybackedRegressor.fit(X_training, y_training)
    piggybacked_pred = piggybackedRegressor.predict(X_query)
    mse_piggybacked.append(mean_squared_error(y_query, piggybacked_pred))
    mae_piggybacked.append(mean_absolute_error(y_query, piggybacked_pred))

    if always_random:
      pick_randomly = True
    else:
      exploration_draw = np.log(np.random.rand()) / (annealing_factor * cycle + 1)
      # Python's % keeps the result in {0, 1} even for negative integers.
      pick_randomly = int(round(exploration_draw)) % 2 == 1

    if pick_randomly:
      pool_position = sample(range(len(pool_idx)), 1)[0]
    else:
      pool_position = np.argmax(y_pred_STD)

    # Both strategies yield a position inside the pool; translate it to a
    # dataset index before recording it as known.
    query_idx = int(pool_idx[pool_position])
    known_idx.append(query_idx)
    is_known[query_idx] = True

  return mse_regressor, mse_piggybacked, mae_regressor, mae_piggybacked


def test_comparing_with_baseline(annealing_factor, number_of_cycles):
  """Compare annealed uncertainty sampling with an all-random baseline.

  Reads the notebook-level cal_housing_df / cal_housing objects (defined in
  earlier cells), shuffles the rows, draws 5 initial samples and runs two
  experiments that start from the same initial samples: one mixing
  highest-STD queries with random ones according to annealing_factor, and one
  querying purely at random.

  Args:
    annealing_factor: exploration schedule parameter of the first experiment
      (see _run_sampling_experiment).
    number_of_cycles: number of query cycles per experiment.

  Returns:
    An 8-tuple of per-cycle score lists, in this order:
    MSE of ARRegressor, MSE of piggybackedRegressor, MSE of random baseline,
    MSE of random-baseline piggybackedRegressor, followed by the four MAE
    lists in the same order.
  """
  # DataFrame.sample returns a new frame, so the result has to be kept.
  shuffled_df = cal_housing_df.sample(frac=1).reset_index(drop=True)
  X = shuffled_df[cal_housing.feature_names].values
  y = shuffled_df.target.values

  n_initial = 5
  initial_idx = np.random.choice(range(len(X)), size=n_initial, replace=False).tolist()
  print("known_idx: ", initial_idx)
  print("unknown pool size: ", len(X) - len(initial_idx))

  (MSE_scores_of_ARRegressor,
   MSE_scores_of_piggybackedRegressor,
   MAE_scores_of_ARRegressor,
   MAE_scores_of_piggybackedRegressor) = _run_sampling_experiment(
      X, y, initial_idx, number_of_cycles, annealing_factor)

  (MSE_scores_of_random_baseline,
   MSE_scores_of_random_baseline_piggybackedRegressor,
   MAE_scores_of_random_baseline,
   MAE_scores_of_random_baseline_piggybackedRegressor) = _run_sampling_experiment(
      X, y, initial_idx, number_of_cycles, annealing_factor, always_random=True)

  return MSE_scores_of_ARRegressor, MSE_scores_of_piggybackedRegressor, MSE_scores_of_random_baseline, MSE_scores_of_random_baseline_piggybackedRegressor, MAE_scores_of_ARRegressor, MAE_scores_of_piggybackedRegressor, MAE_scores_of_random_baseline, MAE_scores_of_random_baseline_piggybackedRegressor

In [None]:
#number_of_tests = 10
# Single comparison run. test_comparing_with_baseline returns eight per-cycle
# score lists: the four MSE lists first, then the four MAE lists.
annealing_factor = 0.03
number_of_cycles = 200
MSE_scores_of_ARRegressor, MSE_scores_of_piggybackedRegressor, MSE_scores_of_random_baseline, MSE_scores_of_random_baseline_piggybackedRegressor, MAE_scores_of_ARRegressor, MAE_scores_of_piggybackedRegressor, MAE_scores_of_random_baseline, MAE_scores_of_random_baseline_piggybackedRegressor = test_comparing_with_baseline(annealing_factor, number_of_cycles)


In [None]:
# MSE per cycle for the single comparison run: both sampling strategies, each
# with its uncertainty-providing regressor and its piggybacked regressor.
plt.plot(MSE_scores_of_ARRegressor, label="ARRegressor")
plt.plot(MSE_scores_of_piggybackedRegressor, label="piggybackedRegressor")
plt.plot(MSE_scores_of_random_baseline, label="random_baseline")
plt.plot(MSE_scores_of_random_baseline_piggybackedRegressor, label="random_baseline_piggybackedRegressor")
plt.xlabel("AR cycle")
plt.ylabel("MSE on query pool")
plt.legend()
plt.show()

In [None]:
# MAE per cycle for the single comparison run: both sampling strategies, each
# with its uncertainty-providing regressor and its piggybacked regressor.
plt.plot(MAE_scores_of_ARRegressor, label="ARRegressor")
plt.plot(MAE_scores_of_piggybackedRegressor, label="piggybackedRegressor")
plt.plot(MAE_scores_of_random_baseline, label="random_baseline")
plt.plot(MAE_scores_of_random_baseline_piggybackedRegressor, label="random_baseline_piggybackedRegressor")
plt.xlabel("AR cycle")
plt.ylabel("MAE on query pool")
plt.legend()
plt.show()

#Szwabe: How many test repetitions are needed to observe any significant value of any AR algorithm based on a non-random query strategy? How does this value depend on the sample target variable attribution/assignment budget (the number of AR cycles)?

In [None]:
def multiple_tests_comparing_with_baseline(number_of_tests, annealing_factor, number_of_cycles):
  """Repeat test_comparing_with_baseline and aggregate the score curves.

  Args:
    number_of_tests: number of independent repetitions (must be at least 1).
    annealing_factor: forwarded to test_comparing_with_baseline.
    number_of_cycles: forwarded to test_comparing_with_baseline.

  Returns:
    A nested dict means_and_stds[measure][algorithm] = {"mean": ..., "std": ...}
    with measure in ("MAE", "MSE") and algorithm in ("ARRegressor",
    "piggybackedRegressor", "random_baseline"). "mean" and "std" are computed
    across repetitions (axis 0), giving one value per cycle.

  Raises:
    ValueError: if number_of_tests is smaller than 1.
  """
  if number_of_tests < 1:
    raise ValueError("number_of_tests must be at least 1")

  # (measure, algorithm label, position in the 8-tuple returned by
  # test_comparing_with_baseline). The order fixes the insertion order of the
  # result dicts. Positions 3 and 7 (random-baseline piggybacked regressor)
  # are not aggregated.
  aggregated_curves = (
    ("MSE", "ARRegressor", 0),
    ("MSE", "piggybackedRegressor", 1),
    ("MAE", "ARRegressor", 4),
    ("MAE", "piggybackedRegressor", 5),
    ("MSE", "random_baseline", 2),
    ("MAE", "random_baseline", 6),
  )

  test_results = [
    test_comparing_with_baseline(annealing_factor, number_of_cycles)
    for _ in range(number_of_tests)
  ]

  means_and_stds = {"MAE": {}, "MSE": {}}
  for measure, algorithm, position in aggregated_curves:
    # Rows are repetitions, columns are cycles.
    curves = np.array([single_result[position] for single_result in test_results])
    means_and_stds[measure][algorithm] = {
      "mean": np.mean(curves, axis=0),
      "std": np.std(curves, axis=0),
    }
  return means_and_stds

In [None]:
def plot_means_and_stds(means_and_stds, temp_measure):
  """Plot the mean curve and a mean +/- std band for every algorithm.

  Args:
    means_and_stds: nested dict as returned by
      multiple_tests_comparing_with_baseline, i.e.
      means_and_stds[measure][algorithm] = {"mean": array, "std": array}.
    temp_measure: key of the measure to plot (e.g. "MSE" or "MAE").

  Side effects:
    Sets the global default figure size to (20, 14), clears the current
    pyplot figure and draws into it.
  """
  # Set the size first so that a figure created by clf() already uses it.
  plt.rcParams["figure.figsize"] = (20,14)
  plt.clf()
  # The first three colours match the original palette; the remaining ones
  # keep the function usable when more than three algorithms are present.
  temp_color_letters = ["b", "y", "g", "r", "c", "m", "k"]
  temp_measure_data = means_and_stds[temp_measure]
  for temp_algorithm_label_idx, (temp_algorithm_label, temp_stats) in enumerate(temp_measure_data.items()):
    temp_color_letter = temp_color_letters[temp_algorithm_label_idx % len(temp_color_letters)]
    temp_mean = temp_stats["mean"]
    temp_std = temp_stats["std"]
    x = np.arange(len(temp_mean))
    plt.plot(x, temp_mean, temp_color_letter+"-", label=temp_algorithm_label)
    plt.fill_between(x, temp_mean - temp_std, temp_mean + temp_std, color=temp_color_letter, alpha=0.2)
  plt.xlabel("AR cycle")
  plt.ylabel(temp_measure)
  plt.legend()

In [None]:
#number_of_tests = 20
# Repeat the comparison number_of_tests times and aggregate the per-cycle
# scores into means and standard deviations.
number_of_tests = 10
annealing_factor = 0.03
number_of_cycles = 200

means_and_stds = multiple_tests_comparing_with_baseline(number_of_tests, annealing_factor, number_of_cycles)

In [None]:
# Plot mean and mean +/- std of the MSE curves held in means_and_stds.
temp_measure = "MSE"
plot_means_and_stds(means_and_stds, temp_measure)

In [None]:
# Plot mean and mean +/- std of the MAE curves held in means_and_stds.
temp_measure = "MAE"
plot_means_and_stds(means_and_stds, temp_measure)

#Szwabe: Set an appropriate number of AR cycles and test repetitions (below) in order to compare the practical value of the three sampling strategies in the most reliable way that is still feasible considering the amount of available time (of the classwork). Compare and comment on the practical value of the strategies.

In [None]:
# annealing_factor = 0.0 keeps the divisor (annealing_factor * i + 1) at 1 in
# every cycle, so the exploration schedule does not change over the cycles.
number_of_tests = 20
annealing_factor = 0.0
number_of_cycles = 200

means_and_stds = multiple_tests_comparing_with_baseline(number_of_tests, annealing_factor, number_of_cycles)

In [None]:
# Plot mean and mean +/- std of the MSE curves held in means_and_stds.
temp_measure = "MSE"
plot_means_and_stds(means_and_stds, temp_measure)

In [None]:
# Plot mean and mean +/- std of the MAE curves held in means_and_stds.
temp_measure = "MAE"
plot_means_and_stds(means_and_stds, temp_measure)

In [None]:
# annealing_factor = 1.0 makes the divisor (annealing_factor * i + 1) equal to
# i + 1, i.e. it grows by one with every cycle.
number_of_tests = 20
annealing_factor = 1.0
number_of_cycles = 200

means_and_stds = multiple_tests_comparing_with_baseline(number_of_tests, annealing_factor, number_of_cycles)

In [None]:
# Plot mean and mean +/- std of the MSE curves held in means_and_stds.
temp_measure = "MSE"
plot_means_and_stds(means_and_stds, temp_measure)

In [None]:
# Plot mean and mean +/- std of the MAE curves held in means_and_stds.
temp_measure = "MAE"
plot_means_and_stds(means_and_stds, temp_measure)

In [None]:
!pip freeze

absl-py==1.0.0
alabaster==0.7.12
albumentations==0.1.12
altair==4.2.0
appdirs==1.4.4
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arviz==0.11.4
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
atari-py==0.2.9
atomicwrites==1.4.0
attrs==21.4.0
audioread==2.1.9
autograd==1.3
Babel==2.9.1
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==4.1.0
blis==0.4.1
bokeh==2.3.3
Bottleneck==1.3.4
branca==0.4.2
bs4==0.0.1
CacheControl==0.12.10
cached-property==1.5.2
cachetools==4.2.4
catalogue==1.0.0
certifi==2021.10.8
cffi==1.15.0
cftime==1.5.2
chardet==3.0.4
charset-normalizer==2.0.12
click==7.1.2
cloudpickle==1.3.0
cmake==3.12.0
cmdstanpy==0.9.5
colorcet==3.0.0
colorlover==0.3.0
community==1.0.0b1
contextlib2==0.5.5
convertdate==2.4.0
coverage==3.7.1
coveralls==0.5
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.2.7
cvxpy==1.0.31
cycler==0.11.0
cymem==2.0.6
Cython==0.29.28
daft==0.0.4
dask==2.12.0
datascience==0.10.6
debugpy==1.0.0
decorator==4.4.2
defusedxml==0.7.1
descartes==1.1.0
dill==0.3.4
distributed==1.25.3
dlib @ file:///dlib-19.18.0-cp37-cp37m-linux_x86_64.whl
dm-tree==0.1.6
docopt==0.6.2
docutils==0.17.1
dopamine-rl==1.0.5
earthengine-api==0.1.300
easydict==1.9
ecos==2.0.10
editdistance==0.5.3
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz
entrypoints==0.4
ephem==4.1.3
et-xmlfile==1.1.0
fa2==0.3.5
fastai==1.0.61
fastdtw==0.3.4
fastprogress==1.0.2
fastrlock==0.8
fbprophet==0.7.1
feather-format==0.4.1
filelock==3.6.0
firebase-admin==4.4.0
fix-yahoo-finance==0.0.22
Flask==1.1.4
flatbuffers==2.0
folium==0.8.3
future==0.16.0
gast==0.5.3
GDAL==2.2.2
gdown==4.2.2
gensim==3.6.0
geographiclib==1.52
geopy==1.17.0
gin-config==0.5.0
glob2==0.7
google==2.0.3
google-api-core==1.26.3
google-api-python-client==1.12.10
google-auth==1.35.0
google-auth-httplib2==0.0.4
google-auth-oauthlib==0.4.6
google-cloud-bigquery==1.21.0
google-cloud-bigquery-storage==1.1.0
google-cloud-core==1.0.3
google-cloud-datastore==1.8.0
google-cloud-firestore==1.7.0
google-cloud-language==1.2.0
google-cloud-storage==1.18.1
google-cloud-translate==1.5.0
google-colab @ file:///colabtools/dist/google-colab-1.0.0.tar.gz
google-pasta==0.2.0
google-resumable-media==0.4.1
googleapis-common-protos==1.55.0
googledrivedownloader==0.4
graphviz==0.10.1
greenlet==1.1.2
grpcio==1.44.0
gspread==3.4.2
gspread-dataframe==3.0.8
gym==0.17.3
h5py==3.1.0
HeapDict==1.0.1
hijri-converter==2.2.3
holidays==0.10.5.2
holoviews==1.14.8
html5lib==1.0.1
httpimport==0.5.18
httplib2==0.17.4
httplib2shim==0.0.3
humanize==0.5.1
hyperopt==0.1.2
ideep4py==2.0.0.post3
idna==2.10
imageio==2.4.1
imagesize==1.3.0
imbalanced-learn==0.8.1
imblearn==0.0
imgaug==0.2.9
importlib-metadata==4.11.2
importlib-resources==5.4.0
imutils==0.5.4
inflect==2.1.0
iniconfig==1.1.1
intel-openmp==2022.0.2
intervaltree==2.1.0
ipykernel==4.10.1
ipython==5.5.0
ipython-genutils==0.2.0
ipython-sql==0.3.9
ipywidgets==7.6.5
itsdangerous==1.1.0
jax==0.3.1
jaxlib @ https://storage.googleapis.com/jax-releases/cuda11/jaxlib-0.3.0+cuda11.cudnn805-cp37-none-manylinux2010_x86_64.whl
jedi==0.18.1
jieba==0.42.1
Jinja2==2.11.3
joblib==1.1.0
jpeg4py==0.1.4
jsonschema==4.3.3
jupyter==1.0.0
jupyter-client==5.3.5
jupyter-console==5.2.0
jupyter-core==4.9.2
jupyterlab-pygments==0.1.2
jupyterlab-widgets==1.0.2
kaggle==1.5.12
kapre==0.3.7
keras==2.8.0
Keras-Preprocessing==1.1.2
keras-vis==0.4.1
kiwisolver==1.3.2
korean-lunar-calendar==0.2.1
libclang==13.0.0
librosa==0.8.1
lightgbm==2.2.3
llvmlite==0.34.0
lmdb==0.99
LunarCalendar==0.0.9
lxml==4.2.6
Markdown==3.3.6
MarkupSafe==2.0.1
matplotlib==3.2.2
matplotlib-inline==0.1.3
matplotlib-venn==0.11.6
missingno==0.5.1
mistune==0.8.4
mizani==0.6.0
mkl==2019.0
mlxtend==0.14.0
modAL==0.4.1
more-itertools==8.12.0
moviepy==0.2.3.5
mpmath==1.2.1
msgpack==1.0.3
multiprocess==0.70.12.2
multitasking==0.0.10
murmurhash==1.0.6
music21==5.5.0
natsort==5.5.0
nbclient==0.5.11
nbconvert==5.6.1
nbformat==5.1.3
nest-asyncio==1.5.4
netCDF4==1.5.8
networkx==2.6.3
nibabel==3.0.2
nltk==3.2.5
notebook==5.3.1
numba==0.51.2
numexpr==2.8.1
numpy==1.21.5
nvidia-ml-py3==7.352.0
oauth2client==4.1.3
oauthlib==3.2.0
okgrade==0.4.3
opencv-contrib-python==4.1.2.30
opencv-python==4.1.2.30
openpyxl==3.0.9
opt-einsum==3.3.0
osqp==0.6.2.post0
packaging==21.3
palettable==3.3.0
pandas==1.3.5
pandas-datareader==0.9.0
pandas-gbq==0.13.3
pandas-profiling==1.4.1
pandocfilters==1.5.0
panel==0.12.1
param==1.12.0
parso==0.8.3
pathlib==1.0.1
patsy==0.5.2
pep517==0.12.0
pexpect==4.8.0
pickleshare==0.7.5
Pillow==7.1.2
pip-tools==6.2.0
plac==1.1.3
plotly==5.5.0
plotnine==0.6.0
pluggy==0.7.1
pooch==1.6.0
portpicker==1.3.9
prefetch-generator==1.0.1
preshed==3.0.6
prettytable==3.1.1
progressbar2==3.38.0
prometheus-client==0.13.1
promise==2.3
prompt-toolkit==1.0.18
protobuf==3.17.3
psutil==5.4.8
psycopg2==2.7.6.1
ptyprocess==0.7.0
py==1.11.0
pyaml==21.10.1
pyarrow==6.0.1
pyasn1==0.4.8
pyasn1-modules==0.2.8
pycocotools==2.0.4
pycparser==2.21
pyct==0.4.8
pydata-google-auth==1.3.0
pydot==1.3.0
pydot-ng==2.0.0
pydotplus==2.0.2
PyDrive==1.3.1
pyemd==0.5.1
pyerfa==2.0.0.1
pyglet==1.5.0
Pygments==2.6.1
pygobject==3.26.1
pymc3==3.11.4
PyMeeus==0.5.11
pymongo==4.0.1
pymystem3==0.2.0
PyOpenGL==3.1.6
pyparsing==3.0.7
pyrsistent==0.18.1
pysndfile==1.3.8
PySocks==1.7.1
pystan==2.19.1.1
pytest==3.6.4
python-apt==0.0.0
python-chess==0.23.11
python-dateutil==2.8.2
python-louvain==0.16
python-slugify==6.1.1
python-utils==3.1.0
pytz==2018.9
pyviz-comms==2.1.0
PyWavelets==1.2.0
PyYAML==3.13
pyzmq==22.3.0
qdldl==0.1.5.post0
qtconsole==5.2.2
QtPy==2.0.1
regex==2019.12.20
requests==2.23.0
requests-oauthlib==1.3.1
resampy==0.2.2
rpy2==3.4.5
rsa==4.8
scikit-image==0.18.3
scikit-learn==1.0.2
scikit-optimize==0.9.0
scipy==1.4.1
screen-resolution-extra==0.0.0
scs==3.2.0
seaborn==0.11.2
semver==2.13.0
Send2Trash==1.8.0
setuptools-git==1.2
Shapely==1.8.1.post1
simplegeneric==0.8.1
six==1.15.0
sklearn==0.0
sklearn-pandas==1.8.0
smart-open==5.2.1
snowballstemmer==2.2.0
sortedcontainers==2.4.0
SoundFile==0.10.3.post1
spacy==2.2.4
Sphinx==1.8.6
sphinxcontrib-serializinghtml==1.1.5
sphinxcontrib-websupport==1.2.4
SQLAlchemy==1.4.31
sqlparse==0.4.2
srsly==1.0.5
statsmodels==0.10.2
sympy==1.7.1
tables==3.7.0
tabulate==0.8.9
tblib==1.7.0
tenacity==8.0.1
tensorboard==2.8.0
tensorboard-data-server==0.6.1
tensorboard-plugin-wit==1.8.1
tensorflow @ file:///tensorflow-2.8.0-cp37-cp37m-linux_x86_64.whl
tensorflow-datasets==4.0.1
tensorflow-estimator==2.8.0
tensorflow-gcs-config==2.8.0
tensorflow-hub==0.12.0
tensorflow-io-gcs-filesystem==0.24.0
tensorflow-metadata==1.7.0
tensorflow-probability==0.16.0
termcolor==1.1.0
terminado==0.13.1
testpath==0.6.0
text-unidecode==1.3
textblob==0.15.3
Theano-PyMC==1.1.2
thinc==7.4.0
threadpoolctl==3.1.0
tifffile==2021.11.2
tomli==2.0.1
toolz==0.11.2
torch @ https://download.pytorch.org/whl/cu111/torch-1.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl
torchaudio @ https://download.pytorch.org/whl/cu111/torchaudio-0.10.0%2Bcu111-cp37-cp37m-linux_x86_64.whl
torchsummary==1.5.1
torchtext==0.11.0
torchvision @ https://download.pytorch.org/whl/cu111/torchvision-0.11.1%2Bcu111-cp37-cp37m-linux_x86_64.whl
tornado==5.1.1
tqdm==4.63.0
traitlets==5.1.1
tweepy==3.10.0
typeguard==2.7.1
typing-extensions==3.10.0.2
tzlocal==1.5.1
uritemplate==3.0.1
urllib3==1.24.3
vega-datasets==0.9.0
wasabi==0.9.0
wcwidth==0.2.5
webencodings==0.5.1
Werkzeug==1.0.1
widgetsnbextension==3.5.2
wordcloud==1.5.0
wrapt==1.13.3
xarray==0.18.2
xgboost==0.90
xkit==0.0.0
xlrd==1.1.0
xlwt==1.3.0
yellowbrick==1.4
zict==2.1.0
zipp==3.7.0