In [70]:
%load_ext autoreload
%autoreload 2
import os, sys
sys.path.append('..')
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster")
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (11,8)
from merf.utils import MERFDataGenerator
from merf.merf import MERF

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Experimental Setup

There are some global parameters shared by all the experiments. Each experiment is run N_per_experiment times. The experiment itself is parametrized by three parameters of the generative model. We collect the results of the experiments in a list of dictionaries. This list is then used to compute certain summary statistics after all the experiments are over.

In [71]:
# Globals
# Every entry of the three size lists below is used for this many clusters.
num_clusters_each_size = 20

# Cluster sizes for the training set, the known-cluster test set and the
# new-cluster test set, respectively.
train_sizes = [1, 3, 5, 7, 9]
known_sizes = [9, 27, 45, 63, 81]
new_sizes = [10, 30, 50, 70, 90]

# n_estimators is handed to MERF and to both plain random forests;
# max_iterations is handed to MERF only.
n_estimators = 300
max_iterations = 2

# Build the three cluster-size arrays (train, known, new -- in that order) that
# are later passed to MERFDataGenerator.generate_split_samples.
train_cluster_sizes, known_cluster_sizes, new_cluster_sizes = (
    MERFDataGenerator.create_cluster_sizes_array(sizes, num_clusters_each_size)
    for sizes in (train_sizes, known_sizes, new_sizes)
)

In [72]:
# Number of times to run each experiment
N_per_experiment = 2

In [73]:
# Defining the experiments to run
# Each (m, sigma_b_sq) pair below becomes one experiment dictionary. Ids are
# assigned in list order starting at 0, and sigma_e is 1 for every experiment.
experiments = list(
    {'id': experiment_id, 'm': m_value, 'sigma_b_sq': sigma_b_sq_value, 'sigma_e': 1}
    for experiment_id, (m_value, sigma_b_sq_value) in enumerate((
        (.8, 0.9),
        (.7, 2.7),
        (.6, 4.5),
        (.3, 0.2),
        (.3, 0.5),
        (.2, 0.8),
    ))
)

# Run Experiments

In [77]:
# Run every experiment N_per_experiment times and collect the results.
#
# `results` is a list with one dictionary per experiment. Besides the experiment
# id, each dictionary maps 'ptev', 'prev' and six MSE keys (known/new clusters
# for MERF, the fixed-effect-only forest and the forest with one-hot-encoded
# clusters) to a list holding one value per repetition.

# Seed NumPy's global random generator before any data is drawn or any forest is
# fit, so that re-running this cell reproduces the same numbers. scikit-learn
# estimators built without an explicit random_state fall back to this generator.
# NOTE(review): presumably MERFDataGenerator and MERF also draw from NumPy's
# global generator -- confirm in merf/utils.py and merf/merf.py.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Columns of the generated frames that are fed to the forests as features.
FEATURE_COLUMNS = ['X_0', 'X_1', 'X_2']


def _extract_arrays(frame):
    """Split a generated sample frame into the pieces the models consume.

    Returns the tuple (X, Z, clusters, y): the feature columns X_0..X_2, the
    'Z' column kept as a one-column frame, the 'cluster' column and the 'y'
    target column.
    """
    return frame[FEATURE_COLUMNS], frame[['Z']], frame['cluster'], frame['y']


def _mean_squared_error(y_true, y_pred):
    """Return the mean of the squared differences between y_true and y_pred."""
    return np.mean((y_true - y_pred) ** 2)


results = [
    {'id': experiment['id'], 'ptev': [], 'prev': [],
     'mse_known_rf_fixed': [], 'mse_known_rf_ohe': [], 'mse_known_merf': [],
     'mse_new_rf_fixed': [], 'mse_new_rf_ohe': [], 'mse_new_merf': []}
    for experiment in experiments
]

for experiment, result in zip(experiments, results):
    for experiment_iteration in range(N_per_experiment):
        print("Experiment iteration: {}".format(experiment_iteration))
        # Generate data for experiment (the experiment stores sigma_b squared,
        # the generator takes sigma_b itself, hence the square root)
        dgm = MERFDataGenerator(m=experiment['m'],
                                sigma_b=np.sqrt(experiment['sigma_b_sq']),
                                sigma_e=experiment['sigma_e'])
        train, test_known, test_new, train_cluster_ids, ptev, prev = dgm.generate_split_samples(
            train_cluster_sizes, known_cluster_sizes, new_cluster_sizes)

        # Store off PTEV and PREV
        result['ptev'].append(ptev)
        result['prev'].append(prev)

        # Training, known-cluster and new-cluster data extracts
        X_train, Z_train, clusters_train, y_train = _extract_arrays(train)
        X_known, Z_known, clusters_known, y_known = _extract_arrays(test_known)
        X_new, Z_new, clusters_new, y_new = _extract_arrays(test_new)

        # MERF
        print("---------------------MERF----------------------")
        mrf = MERF(n_estimators=n_estimators, max_iterations=max_iterations)
        mrf.fit(X_train, Z_train, clusters_train, y_train)
        y_hat_known_merf = mrf.predict(X_known, Z_known, clusters_known)
        y_hat_new_merf = mrf.predict(X_new, Z_new, clusters_new)
        mse_known_merf = _mean_squared_error(y_known, y_hat_known_merf)
        mse_new_merf = _mean_squared_error(y_new, y_hat_new_merf)
        result['mse_known_merf'].append(mse_known_merf)
        result['mse_new_merf'].append(mse_new_merf)

        # Random Forest Fixed Only: sees the features but no cluster information
        print("---------------------Random Forest Fixed Effect Only----------------------")
        rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_hat_known_rf_fixed = rf.predict(X_known)
        y_hat_new_rf_fixed = rf.predict(X_new)
        mse_known_rf_fixed = _mean_squared_error(y_known, y_hat_known_rf_fixed)
        mse_new_rf_fixed = _mean_squared_error(y_new, y_hat_new_rf_fixed)
        result['mse_known_rf_fixed'].append(mse_known_rf_fixed)
        result['mse_new_rf_fixed'].append(mse_new_rf_fixed)

        # Random Forest with OHE Cluster: all three feature matrices are encoded
        # against the training cluster ids so that their columns line up
        print("---------------------Random Forest w OHE Cluster----------------------")
        X_train_w_ohe = MERFDataGenerator.create_X_with_ohe_clusters(X_train, clusters_train, train_cluster_ids)
        X_known_w_ohe = MERFDataGenerator.create_X_with_ohe_clusters(X_known, clusters_known, train_cluster_ids)
        X_new_w_ohe = MERFDataGenerator.create_X_with_ohe_clusters(X_new, clusters_new, train_cluster_ids)
        rf_ohe = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1)
        rf_ohe.fit(X_train_w_ohe, y_train)
        y_hat_known_rf_ohe = rf_ohe.predict(X_known_w_ohe)
        y_hat_new_rf_ohe = rf_ohe.predict(X_new_w_ohe)
        mse_known_rf_ohe = _mean_squared_error(y_known, y_hat_known_rf_ohe)
        mse_new_rf_ohe = _mean_squared_error(y_new, y_hat_new_rf_ohe)
        result['mse_known_rf_ohe'].append(mse_known_rf_ohe)
        result['mse_new_rf_ohe'].append(mse_new_rf_ohe)


INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 89.89608754199945, PREV = 10.115591746917799.


Experiment iteration: 0
---------------------MERF----------------------


INFO     [merf.py:235] GLL is 898.8831739979123 at iteration 1.
INFO     [merf.py:235] GLL is 884.8857878632623 at iteration 2.


---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 89.72414570879856, PREV = 10.307447108046842.


Experiment iteration: 1
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 969.3998149204131 at iteration 1.
INFO     [merf.py:235] GLL is 926.9129378868814 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 89.85546678609913, PREV = 30.482552322314234.


Experiment iteration: 0
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 1132.7034229758258 at iteration 1.
INFO     [merf.py:235] GLL is 1039.713089837637 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 89.8094853034404, PREV = 30.636396130929537.


Experiment iteration: 1
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 1027.531663191939 at iteration 1.
INFO     [merf.py:235] GLL is 979.9123872869907 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 90.08538977730439, PREV = 49.526062008970186.


Experiment iteration: 0
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 1132.369721562736 at iteration 1.
INFO     [merf.py:235] GLL is 983.3267770137679 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 89.93527062114542, PREV = 50.35986648179024.


Experiment iteration: 1
---------------------MERF----------------------


INFO     [merf.py:235] GLL is 1112.8139214713249 at iteration 1.
INFO     [merf.py:235] GLL is 962.1085995647846 at iteration 2.


---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 56.60108993518346, PREV = 15.335008606553217.


Experiment iteration: 0
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 595.8859273146685 at iteration 1.
INFO     [merf.py:235] GLL is 536.1849585922074 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 56.96314445004167, PREV = 15.110421296248106.


Experiment iteration: 1
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 600.5179112498043 at iteration 1.
INFO     [merf.py:235] GLL is 549.9883175806602 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 62.095301992844654, PREV = 30.52138953404493.


Experiment iteration: 0
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 550.7819381769276 at iteration 1.
INFO     [merf.py:235] GLL is 500.87023581078876 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 61.63110744497759, PREV = 31.127862329325357.


Experiment iteration: 1
---------------------MERF----------------------


INFO     [merf.py:235] GLL is 671.4288918182554 at iteration 1.
INFO     [merf.py:235] GLL is 634.5217542465908 at iteration 2.


---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 56.86293509195036, PREV = 60.68918509154649.


Experiment iteration: 0
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 706.5209420503456 at iteration 1.
INFO     [merf.py:235] GLL is 648.6911673866513 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))
INFO     [utils.py:164] Drew 10000 samples from 200 clusters.
INFO     [utils.py:165] PTEV = 56.39907682781137, PREV = 61.84629341406278.


Experiment iteration: 1
---------------------MERF----------------------


INFO     [merf.py:235] GLL is 573.2791552037188 at iteration 1.
INFO     [merf.py:235] GLL is 503.54080998194473 at iteration 2.


---------------------Random Forest Fixed Effect Only----------------------
---------------------Random Forest w OHE Cluster----------------------


'Categorical.from_codes(codes, categories)'?
  return self.make_block(Categorical(self.values, **kwargs))


In [78]:
# Display the raw results: one dictionary per experiment, holding per-repetition
# lists of PTEV, PREV and the known/new-cluster MSE of each of the three models.
results

[{'id': 0,
  'mse_known_merf': [2.6363674944169193, 2.6661732302860872],
  'mse_known_rf_fixed': [3.4091131218341162, 3.3027745241171167],
  'mse_known_rf_ohe': [3.3513540573410254, 3.2660012999161041],
  'mse_new_merf': [3.4826861212646261, 3.2380332223656594],
  'mse_new_rf_fixed': [3.5272654183930734, 3.2391664686121997],
  'mse_new_rf_ohe': [3.6845934086691021, 3.2805311320264932],
  'prev': [10.115591746917799, 10.307447108046842],
  'ptev': [89.896087541999449, 89.724145708798559]},
 {'id': 1,
  'mse_known_merf': [2.5269444941769308, 2.3502460819578377],
  'mse_known_rf_fixed': [5.5458270845108073, 4.5313787884293548],
  'mse_known_rf_ohe': [4.4388186904858031, 4.0262816487090332],
  'mse_new_merf': [4.6969672097944706, 4.588511502467683],
  'mse_new_rf_fixed': [4.9003689988689816, 4.7236638552216395],
  'mse_new_rf_ohe': [4.7596187822577667, 4.6501374567497313],
  'prev': [30.482552322314234, 30.636396130929537],
  'ptev': [89.855466786099129, 89.809485303440397]},
 {'id': 2,
  