In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to library code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2
import warnings
# Install a blanket "ignore" filter: no warnings are shown for the rest of the
# session. NOTE(review): this also hides deprecation and convergence warnings
# raised by the estimators fitted further down.
warnings.simplefilter('ignore')

In [None]:
import numpy as np
import matplotlib.pyplot as plt 
from mliv.dgps import get_data, get_tau_fn, fn_dict
from mliv.ensemble import EnsembleIV, EnsembleIVStar
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Data-Generating Processes (DGPs)

In [None]:
# Seed NumPy's global RNG so that "Restart Kernel -> Run All" reproduces the
# same run. scikit-learn estimators created with random_state=None draw from
# this global RNG; get_data presumably does as well -- TODO confirm against
# mliv.dgps.
np.random.seed(123)

# Simulation settings, forwarded positionally to get_data below.
# NOTE(review): the meanings given here are inferred from the variable names;
# confirm against the mliv.dgps.get_data signature.
n = 5000           # presumably the number of samples
n_z = 1            # presumably the number of instrument columns
iv_strength = .6   # presumably the strength of the instrument
fname = 'abs'      # key into fn_dict selecting the function handed to get_tau_fn
dgp_num = 5        # selector forwarded unchanged to get_data

# Z, T and Y are 2-D-indexable arrays used by the cells below (Z[:, 0],
# T[:, ind]); true_fn is a callable that is evaluated on T-shaped inputs.
Z, T, Y, true_fn = get_data(n, n_z, iv_strength, get_tau_fn(fn_dict[fname]), dgp_num)

In [None]:
# Column of T that is swept across the evaluation grid.
ind = 0

# Grid endpoints: the 1st and 99th percentiles of the swept column.
grid_lo = np.quantile(T[:, ind], .01)
grid_hi = np.quantile(T[:, ind], .99)
x_grid = np.linspace(grid_lo, grid_hi, 100)

# Every test row starts as the column-wise median of T (the (1, d) median row
# is broadcast over 100 rows); the swept column is then replaced by the grid.
median_row = np.median(T, axis=0, keepdims=True)
T_test = np.add(np.zeros((100, T.shape[1])), median_row)
T_test[:, ind] = x_grid

In [None]:
# Visual sanity check of the simulated data: outcome against the first
# instrument column (left) and against the swept treatment column (right),
# with the true function overlaid on the right panel.
fig, (ax_z, ax_t) = plt.subplots(1, 2, figsize=(10, 3))

ax_z.scatter(Z[:, 0], Y)
ax_z.set_xlabel('Z[:, 0]')
ax_z.set_ylabel('Y')

# Use `ind` for both the scatter and the curve so the two always refer to the
# same column of T, and sort once so the curve is drawn left to right.
order = np.argsort(T[:, ind])
ax_t.scatter(T[:, ind], Y)
# Explicit colour: scatter and plot would otherwise both start at the first
# colour of the cycle, hiding the curve inside the point cloud.
ax_t.plot(T[order, ind], true_fn(T[order]), color='C1', label='true_fn')
ax_t.set_xlabel('T[:, {}]'.format(ind))
ax_t.set_ylabel('Y')
ax_t.legend()

fig.tight_layout()
plt.show()

# Ensemble Learning

In [None]:
# Adversary model: a shallow regression forest.
iv_adversary = RandomForestRegressor(max_depth=3, n_estimators=40, min_samples_leaf=50)
# Learner model: a small classification forest fitted without bootstrapping.
iv_learner = RandomForestClassifier(max_depth=3, bootstrap=False, min_samples_leaf=50,
                                    n_estimators=5)

# Fit EnsembleIV on the instruments, treatments and outcomes.
# NOTE: the EnsembleIVStar cell that follows rebinds `est`, so when the
# notebook is run top to bottom only that later fit reaches the evaluation
# cells.
ensemble_iv = EnsembleIV(n_iter=100,
                         max_abs_value=2,
                         adversary=iv_adversary,
                         learner=iv_learner)
est = ensemble_iv.fit(Z, T, Y)

In [None]:
# Adversary model: a two-tree shallow regression forest.
star_adversary = RandomForestRegressor(max_depth=3, n_estimators=2, min_samples_leaf=50)
# Learner model: a small classification forest fitted without bootstrapping.
star_learner = RandomForestClassifier(max_depth=3, bootstrap=False, min_samples_leaf=50,
                                      n_estimators=5)

# Fit EnsembleIVStar on the same data; this binding of `est` is the one the
# prediction and evaluation cells below operate on when run in order.
ensemble_iv_star = EnsembleIVStar(n_iter=100,
                                  max_abs_value=4,
                                  adversary=star_adversary,
                                  learner=star_learner)
est = ensemble_iv_star.fit(Z, T, Y)

In [None]:
# Predictions of the fitted estimator: first on the evaluation grid (plotted
# below), then on the training treatments (used for the in-sample R^2).
y_pred, y_pred_train = est.predict(T_test), est.predict(T)

In [None]:
# True function evaluated on the training treatments and on the grid.
truth_train = true_fn(T).flatten()
truth_grid = true_fn(T_test)

# In-sample R^2 of the estimate measured against the true function.
train_sq_err = np.square(truth_train - y_pred_train.flatten())
R2train = 1 - np.mean(train_sq_err)/np.var(truth_train)
# NOTE: despite its name, R2test holds the mean squared error on the grid;
# the figure title labels it accordingly as "MSETest".
grid_sq_err = np.square(truth_grid.flatten() - y_pred.flatten())
R2test = np.mean(grid_sq_err)

# Baseline for comparison: a random forest regressing Y on T directly.
direct_rf = RandomForestRegressor(n_estimators=100, max_depth=10)
direct_curve = direct_rf.fit(T, Y.flatten()).predict(T_test)

# Overlay the estimate, the direct baseline and the truth along the swept column.
grid_x = T_test[:, ind]
plt.title("R2Train: {:.3f}, MSETest: {:.3f}".format(R2train, R2test))
plt.plot(grid_x, y_pred, label='est')
plt.plot(grid_x, direct_curve, label='direct')
plt.plot(grid_x, truth_grid, label='true')
plt.legend()
plt.show()

In [None]:
# Direct baseline on the grid: refit a random forest of Y on T and report its
# mean squared error against the true function over T_test.
direct_rf = RandomForestRegressor(n_estimators=100, max_depth=10)
y_direct = direct_rf.fit(T, Y.flatten()).predict(T_test)
grid_residual = true_fn(T_test).flatten() - y_direct.flatten()
np.mean(np.square(grid_residual))

In [None]:
# Direct baseline in-sample: refit a random forest of Y on T, predict on the
# training treatments, and report R^2 against the true function.
# `y_direct` is rebound here to the training-set predictions.
direct_rf = RandomForestRegressor(n_estimators=100, max_depth=10)
y_direct = direct_rf.fit(T, Y.flatten()).predict(T)
truth_train = true_fn(T).flatten()
train_residual = truth_train - y_direct.flatten()
1 - np.mean(np.square(train_residual))/np.var(truth_train)