In [34]:
%load_ext autoreload
%autoreload 2
import os, sys
sys.path.append('..')
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster")
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (11,8)
from merf.utils import MERFDataGenerator
from merf.merf import MERF

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Experimental Setup

There are some global parameters shared by all the experiments. Each experiment is run `N_per_experiment` times and is parametrized by three parameters of the generative model (`m`, `sigma_b_sq`, and `sigma_e`). We collect the results of the experiments in a list of dictionaries, which is then used to compute summary statistics after all the experiments are over.

In [35]:
# Globals
# How many clusters to request for every entry of the *_sizes lists below
# (second argument to create_cluster_sizes_array).
num_clusters_each_size = 20

# Per-cluster sample counts for the three data splits: training data, test data
# from clusters seen in training ("known"), and test data from unseen clusters ("new").
train_sizes = [1, 3, 5, 7, 9]
known_sizes = [9, 27, 45, 63, 81]
new_sizes = [10, 30, 50, 70, 90]

# Model settings shared by the MERF and RandomForestRegressor fits in the experiment cell.
n_estimators = 300
max_iterations = 2

# Expand each list of sizes into the cluster-size array handed to
# MERFDataGenerator.generate_split_samples. Presumably each listed size is repeated
# num_clusters_each_size times -- verify against merf.utils.
train_cluster_sizes, known_cluster_sizes, new_cluster_sizes = (
    MERFDataGenerator.create_cluster_sizes_array(sizes, num_clusters_each_size)
    for sizes in (train_sizes, known_sizes, new_sizes)
)

In [48]:
# Number of times to run each experiment
N_per_experiment = 2

In [49]:
# Defining the experiments to run.
# Each entry carries an 'id' plus the three generative-model parameters that the
# experiment cell reads: 'm', 'sigma_b_sq' (square-rooted before being passed on
# as sigma_b) and 'sigma_e'.
experiments = [
    dict(id=0, m=.6, sigma_b_sq=4.5, sigma_e=1),
    dict(id=1, m=.4, sigma_b_sq=3.3, sigma_e=1),
]

# Run Experiments

In [52]:
# Run every experiment N_per_experiment times and collect the per-run MSEs.

# Seed numpy's global RNG so that "Restart Kernel -> Run All" reproduces the same
# numbers. RandomForestRegressor without an explicit random_state draws from this
# global RNG; presumably MERFDataGenerator and MERF do as well -- TODO confirm.
_RANDOM_SEED = 42
np.random.seed(_RANDOM_SEED)

# Columns pulled out of every generated sample frame.
_X_COLUMNS = ['X_0', 'X_1', 'X_2']
_Z_COLUMNS = ['Z']

# One list of per-run MSE values is kept under each of these keys.
# NOTE: the two '*_rf_ohe' lists are created here but nothing in this cell
# appends to them, so they stay empty.
_METRIC_KEYS = ['mse_known_rf_fixed', 'mse_known_rf_ohe', 'mse_known_merf',
                'mse_new_rf_fixed', 'mse_new_rf_ohe', 'mse_new_merf']


def _split_sample(frame):
    """Return (X, Z, clusters, y) taken from one generated sample frame.

    X holds columns X_0..X_2 and Z holds column Z (both selected with a list of
    names), while clusters and y are the single 'cluster' and 'y' columns.
    Assumes `frame` supports pandas-style column indexing.
    """
    return frame[_X_COLUMNS], frame[_Z_COLUMNS], frame['cluster'], frame['y']


def _mse(y_true, y_pred):
    """Mean squared error between observed targets and predictions."""
    return np.mean((y_true - y_pred) ** 2)


# Creating a dictionary to hold the results of the experiments
results = []
for experiment in experiments:
    result = {'id': experiment['id']}
    # A fresh list per key so that no two result dicts share state.
    result.update((key, []) for key in _METRIC_KEYS)
    results.append(result)

for experiment, result in zip(experiments, results):
    for experiment_iteration in range(N_per_experiment):
        print("Experiment iteration: {}".format(experiment_iteration))
        # Generate data for experiment (the generator takes sigma_b, so the
        # configured variance sigma_b_sq is square-rooted first).
        dgm = MERFDataGenerator(m=experiment['m'],
                                sigma_b=np.sqrt(experiment['sigma_b_sq']),
                                sigma_e=experiment['sigma_e'])
        train, test_known, test_new = dgm.generate_split_samples(
            train_cluster_sizes, known_cluster_sizes, new_cluster_sizes)

        # Training / known-cluster / new-cluster data extract
        X_train, Z_train, clusters_train, y_train = _split_sample(train)
        X_known, Z_known, clusters_known, y_known = _split_sample(test_known)
        X_new, Z_new, clusters_new, y_new = _split_sample(test_new)

        # MERF
        print("---------------------MERF----------------------")
        mrf = MERF(n_estimators=n_estimators, max_iterations=max_iterations)
        mrf.fit(X_train, Z_train, clusters_train, y_train)
        y_hat_known_merf = mrf.predict(X_known, Z_known, clusters_known)
        y_hat_new_merf = mrf.predict(X_new, Z_new, clusters_new)
        mse_known_merf = _mse(y_known, y_hat_known_merf)
        mse_new_merf = _mse(y_new, y_hat_new_merf)
        result['mse_known_merf'].append(mse_known_merf)
        result['mse_new_merf'].append(mse_new_merf)

        # Random Forest Fixed Only (sees X only; Z and cluster labels are not used)
        print("---------------------Random Forest Fixed Effect Only----------------------")
        rf = RandomForestRegressor(n_estimators=n_estimators, n_jobs=-1)
        rf.fit(X_train, y_train)
        y_hat_known_rf_fixed = rf.predict(X_known)
        y_hat_new_rf_fixed = rf.predict(X_new)
        mse_known_rf_fixed = _mse(y_known, y_hat_known_rf_fixed)
        mse_new_rf_fixed = _mse(y_new, y_hat_new_rf_fixed)
        result['mse_known_rf_fixed'].append(mse_known_rf_fixed)
        result['mse_new_rf_fixed'].append(mse_new_rf_fixed)

INFO     [utils.py:135] Drew 10000 samples from 200 clusters.
INFO     [utils.py:136] PTEV = 90.29700475174558, PREV = 48.35540086538786.


Experiment iteration: 0
---------------------MERF----------------------


INFO     [merf.py:235] GLL is 1114.8189544847446 at iteration 1.
INFO     [merf.py:235] GLL is 947.3995403757851 at iteration 2.


---------------------Random Forest Fixed Effect Only----------------------


INFO     [utils.py:135] Drew 10000 samples from 200 clusters.
INFO     [utils.py:136] PTEV = 90.00256350780256, PREV = 49.985758695626764.


Experiment iteration: 1
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 1142.3343719193601 at iteration 1.
INFO     [merf.py:235] GLL is 1009.0158005721009 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------


INFO     [utils.py:135] Drew 10000 samples from 200 clusters.
INFO     [utils.py:136] PTEV = 84.24758674684308, PREV = 61.702614570577865.


Experiment iteration: 0
---------------------MERF----------------------


INFO     [merf.py:235] GLL is 960.3131386432229 at iteration 1.
INFO     [merf.py:235] GLL is 819.1470191678178 at iteration 2.


---------------------Random Forest Fixed Effect Only----------------------


INFO     [utils.py:135] Drew 10000 samples from 200 clusters.
INFO     [utils.py:136] PTEV = 83.91045242906765, PREV = 63.27639220985035.


Experiment iteration: 1
---------------------MERF----------------------

INFO     [merf.py:235] GLL is 947.6970163442148 at iteration 1.
INFO     [merf.py:235] GLL is 836.675862996841 at iteration 2.



---------------------Random Forest Fixed Effect Only----------------------


In [53]:
# Display the raw per-run MSE lists gathered for each experiment
results

[{'id': 0,
  'mse_known_merf': [2.3444596752614615, 2.3665212404909872],
  'mse_known_rf_fixed': [7.0643903891885262, 6.4318108324085417],
  'mse_known_rf_ohe': [],
  'mse_new_merf': [6.8293937852898843, 6.2719985336567969],
  'mse_new_rf_fixed': [7.3376362164679882, 6.79549140018975],
  'mse_new_rf_ohe': []},
 {'id': 1,
  'mse_known_merf': [1.8120943357711161, 1.8425068509397735],
  'mse_known_rf_fixed': [5.3248424188724632, 5.5741559360454476],
  'mse_known_rf_ohe': [],
  'mse_new_merf': [5.0110103870816962, 5.3272541739922872],
  'mse_new_rf_fixed': [5.5040513382755583, 5.9087922465606075],
  'mse_new_rf_ohe': []}]