In [1]:
import logging
import os
import time
from math import sqrt

import arviz as az
import bambi as bmb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc as pm
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error

from Taweret.core.base_mixer import BaseMixer
from Taweret.core.base_model import BaseModel
from Taweret.mix.gaussian import Multivariate
from Taweret.mix.trees import Trees



In [2]:
# Fix the random seed so that runs are reproducible and results stay comparable
# when the analysis is tuned/optimized later on.
RANDOM_SEED = 9572404
# NumPy generator seeded from the same constant.
rng = np.random.default_rng(seed=RANDOM_SEED)

In [3]:
# Keep a handle on the "pymc" logger so its output can be quieted for readability.
logger = logging.getLogger("pymc")
# Raise the threshold to ERROR, which disables the sampling messages.
logger.setLevel("ERROR")

In [4]:
# Load dataset
PISA2018 = pd.read_csv("pisa2018.BayesBook.csv")
# Data processing: encode the 'Female' labels numerically (Female -> 1.0, Male -> 0.0).
# The explicit astype(float) guarantees a numeric column: relying on replace() to
# downcast an object column to float is deprecated pandas behaviour, and without the
# downcast the column would stay object-typed. Any label other than 'Female'/'Male'
# (or a missing value) now fails loudly here instead of leaking a mixed-type column.
PISA2018['Female'] = (
    PISA2018['Female']
    .replace({'Female': 1.0, 'Male': 0.0})
    .astype(float)
)
# Replace the school identifiers by the integer codes of their categories
# (one code per distinct SchoolID value).
PISA2018['SchoolID'] = pd.Categorical(PISA2018['SchoolID']).codes

In [5]:
# Columns kept for the analysis: the predictors used by the models below,
# the outcome (PV1READ) and the grouping factor (SchoolID).
variables = [
    "Female", "ESCS", "METASUM", "PERFEED", "HOMEPOS", "ADAPTIVITY", "TEACHINT",
    "ICTRES", "JOYREAD", "ATTLNACT", "COMPETE", "WORKMAST", "GFOFAIL", "SWBP",
    "MASTGOAL", "BELONG", "SCREADCOMP", "SCREADDIFF", "PISADIFF", "PV1READ",
    "SchoolID",
]
# Restrict the frame to exactly those columns, in that order.
PISA2018 = PISA2018.loc[:, variables]

In [6]:
%%time
# Model 1: PV1READ ~ Female + ESCS + HOMEPOS + ICTRES + (1 + ICTRES | SchoolID)
# Random intercept and random ICTRES slope per school.
start_time = time.time()

model1 = bmb.Model(
    "PV1READ ~ Female + ESCS + HOMEPOS + ICTRES + (1 + ICTRES | SchoolID)",
    PISA2018,
    categorical=["SchoolID"],
)

# Priors: Normal on every common effect (some centred/scaled by the sample mean/std
# of the corresponding predictor), Normal group-level effects with HalfNormal
# hyperpriors on their scale, and a HalfNormal residual sigma.
priors = {
    "Intercept": bmb.Prior("Normal", mu=0, sigma=100),
    "Female": bmb.Prior("Normal", mu=0, sigma=10),
    "ESCS": bmb.Prior(
        "Normal", mu=np.mean(PISA2018["ESCS"]), sigma=np.std(PISA2018["ESCS"])
    ),
    "HOMEPOS": bmb.Prior("Normal", mu=np.mean(PISA2018["HOMEPOS"]), sigma=100),
    "ICTRES": bmb.Prior(
        "Normal", mu=np.mean(PISA2018["ICTRES"]), sigma=np.std(PISA2018["ICTRES"])
    ),
    "1|SchoolID": bmb.Prior(
        "Normal", mu=0, sigma=bmb.Prior("HalfNormal", sigma=100)
    ),
    "ICTRES|SchoolID": bmb.Prior(
        "Normal", mu=0, sigma=bmb.Prior("HalfNormal", sigma=100)
    ),
    "sigma": bmb.Prior("HalfNormal", sigma=10),
}
model1.set_priors(priors=priors)

# Sample the posterior with the notebook-wide seed.
trace1 = model1.fit(draws=2000, random_seed=RANDOM_SEED)

# In-sample predictions: posterior of the mean response for every row of the data,
# averaged over chains and draws to obtain one point prediction per student.
predicted1 = model1.predict(trace1, data=PISA2018, inplace=False)
post_pred1 = predicted1.posterior["PV1READ_mean"]
mean_pred = post_pred1.mean(dim=["chain", "draw"]).values
rmse = sqrt(mean_squared_error(PISA2018["PV1READ"], mean_pred))
print(f'The RMSE for model 1 - PV1READ ~ Female + ESCS + HOMEPOS + ICTRES + (1 + ICTRES | SchoolID) is: {rmse}')

# Manual wall-clock timing of the whole cell (construction, fit and prediction).
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Computation time: {elapsed_time} seconds")


The RMSE for model 1 - PV1READ ~ Female + ESCS + HOMEPOS + ICTRES + (1 + ICTRES | SchoolID) is: 94.16605562627963
Computation time: 123.25808453559875 seconds
CPU times: total: 33.1 s
Wall time: 2min 3s


In [7]:
%%time
# Model 2: PV1READ ~ JOYREAD + PISADIFF + SCREADCOMP + SCREADDIFF + (1|SchoolID)
# Random intercept per school.
start_time = time.time()

model2 = bmb.Model(
    "PV1READ ~ JOYREAD + PISADIFF + SCREADCOMP + SCREADDIFF + (1|SchoolID)",
    PISA2018,
    categorical=["SchoolID"],
)

# Priors: Normal on every common effect (some centred/scaled by the sample mean/std
# of the corresponding predictor), a Normal school intercept with a HalfNormal
# hyperprior on its scale, and a HalfNormal residual sigma.
priors = {
    "Intercept": bmb.Prior("Normal", mu=0, sigma=100),
    "JOYREAD": bmb.Prior(
        "Normal", mu=np.mean(PISA2018["JOYREAD"]), sigma=np.std(PISA2018["JOYREAD"])
    ),
    "PISADIFF": bmb.Prior("Normal", mu=0, sigma=100),
    "SCREADCOMP": bmb.Prior("Normal", mu=np.mean(PISA2018["SCREADCOMP"]), sigma=10),
    "SCREADDIFF": bmb.Prior(
        "Normal",
        mu=np.mean(PISA2018["SCREADDIFF"]),
        sigma=np.std(PISA2018["SCREADDIFF"]),
    ),
    "1|SchoolID": bmb.Prior(
        "Normal", mu=0, sigma=bmb.Prior("HalfNormal", sigma=100)
    ),
    "sigma": bmb.Prior("HalfNormal", sigma=10),
}
model2.set_priors(priors=priors)

# Sample the posterior with the notebook-wide seed.
trace2 = model2.fit(draws=2000, random_seed=RANDOM_SEED)

# In-sample predictions: posterior of the mean response for every row of the data,
# averaged over chains and draws to obtain one point prediction per student.
predicted2 = model2.predict(trace2, data=PISA2018, inplace=False)
post_pred2 = predicted2.posterior["PV1READ_mean"]
mean_pred = post_pred2.mean(dim=["chain", "draw"]).values
rmse = sqrt(mean_squared_error(PISA2018["PV1READ"], mean_pred))
print(f'The RMSE for model 2 - PV1READ ~ JOYREAD + PISADIFF + SCREADCOMP + SCREADDIFF + (1|SchoolID) is: {rmse}')

# Manual wall-clock timing of the whole cell (construction, fit and prediction).
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Computation time: {elapsed_time} seconds")


The RMSE for model 2 - PV1READ ~ JOYREAD + PISADIFF + SCREADCOMP + SCREADDIFF + (1|SchoolID) is: 86.02700914019462
Computation time: 94.735032081604 seconds
CPU times: total: 22.5 s
Wall time: 1min 34s


In [8]:
%%time
# Model 3: PV1READ ~ METASUM + GFOFAIL + MASTGOAL + SWBP + WORKMAST + ADAPTIVITY + COMPETE + (1|SchoolID)
# Random intercept per school.
start_time = time.time()

model3 = bmb.Model(
    "PV1READ ~ METASUM + GFOFAIL + MASTGOAL + SWBP + WORKMAST + ADAPTIVITY + COMPETE + (1|SchoolID)",
    PISA2018,
    categorical=["SchoolID"],
)

# Priors: Normal on every common effect (some centred/scaled by the sample mean/std
# of the corresponding predictor), a Normal school intercept with a HalfNormal
# hyperprior on its scale, and a HalfNormal residual sigma.
priors = {
    "Intercept": bmb.Prior("Normal", mu=0, sigma=100),
    "METASUM": bmb.Prior(
        "Normal", mu=np.mean(PISA2018["METASUM"]), sigma=np.std(PISA2018["METASUM"])
    ),
    "GFOFAIL": bmb.Prior("Normal", mu=0, sigma=100),
    "MASTGOAL": bmb.Prior("Normal", mu=np.mean(PISA2018["MASTGOAL"]), sigma=10),
    "SWBP": bmb.Prior("Normal", mu=0, sigma=100),
    "WORKMAST": bmb.Prior("Normal", mu=np.mean(PISA2018["WORKMAST"]), sigma=10),
    "ADAPTIVITY": bmb.Prior("Normal", mu=np.mean(PISA2018["ADAPTIVITY"]), sigma=100),
    "COMPETE": bmb.Prior(
        "Normal", mu=np.mean(PISA2018["COMPETE"]), sigma=np.std(PISA2018["COMPETE"])
    ),
    "1|SchoolID": bmb.Prior(
        "Normal", mu=0, sigma=bmb.Prior("HalfNormal", sigma=100)
    ),
    "sigma": bmb.Prior("HalfNormal", sigma=10),
}
model3.set_priors(priors=priors)

# Sample the posterior with the notebook-wide seed.
trace3 = model3.fit(draws=2000, random_seed=RANDOM_SEED)

# In-sample predictions: posterior of the mean response for every row of the data,
# averaged over chains and draws to obtain one point prediction per student.
predicted3 = model3.predict(trace3, data=PISA2018, inplace=False)
post_pred3 = predicted3.posterior["PV1READ_mean"]
mean_pred = post_pred3.mean(dim=["chain", "draw"]).values
rmse = sqrt(mean_squared_error(PISA2018["PV1READ"], mean_pred))
print(f'The RMSE for model 3 - PV1READ ~ METASUM + GFOFAIL + MASTGOAL + SWBP + WORKMAST + ADAPTIVITY + COMPETE + (1|SchoolID) is: {rmse}')

# Manual wall-clock timing of the whole cell (construction, fit and prediction).
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Computation time: {elapsed_time} seconds")

The RMSE for model 3 - PV1READ ~ METASUM + GFOFAIL + MASTGOAL + SWBP + WORKMAST + ADAPTIVITY + COMPETE + (1|SchoolID) is: 91.30394364453704
Computation time: 189.38602781295776 seconds
CPU times: total: 42.8 s
Wall time: 3min 9s


In [9]:
%%time
# Model 4: PV1READ ~ PERFEED + TEACHINT + BELONG + (1 + TEACHINT | SchoolID)
# Random intercept and random TEACHINT slope per school.
start_time = time.time()

model4 = bmb.Model(
    "PV1READ ~ PERFEED + TEACHINT + BELONG + (1 + TEACHINT | SchoolID)",
    PISA2018,
    categorical=["SchoolID"],
)

# Priors: Normal on every common effect (some centred/scaled by the sample mean/std
# of the corresponding predictor), Normal group-level effects with HalfNormal
# hyperpriors on their scale, and a HalfNormal residual sigma.
priors = {
    "Intercept": bmb.Prior("Normal", mu=0, sigma=100),
    "PERFEED": bmb.Prior(
        "Normal", mu=np.mean(PISA2018["PERFEED"]), sigma=np.std(PISA2018["PERFEED"])
    ),
    "TEACHINT": bmb.Prior(
        "Normal", mu=np.mean(PISA2018["TEACHINT"]), sigma=np.std(PISA2018["TEACHINT"])
    ),
    "BELONG": bmb.Prior("Normal", mu=np.mean(PISA2018["BELONG"]), sigma=100),
    "1|SchoolID": bmb.Prior(
        "Normal", mu=0, sigma=bmb.Prior("HalfNormal", sigma=100)
    ),
    "TEACHINT|SchoolID": bmb.Prior(
        "Normal", mu=0, sigma=bmb.Prior("HalfNormal", sigma=100)
    ),
    "sigma": bmb.Prior("HalfNormal", sigma=10),
}
model4.set_priors(priors=priors)

# Sample the posterior with the notebook-wide seed.
trace4 = model4.fit(draws=2000, random_seed=RANDOM_SEED)

# In-sample predictions: posterior of the mean response for every row of the data,
# averaged over chains and draws to obtain one point prediction per student.
predicted4 = model4.predict(trace4, data=PISA2018, inplace=False)
post_pred4 = predicted4.posterior["PV1READ_mean"]
mean_pred = post_pred4.mean(dim=["chain", "draw"]).values
rmse = sqrt(mean_squared_error(PISA2018["PV1READ"], mean_pred))
print(f'The RMSE for model 4 - PV1READ ~ PERFEED + TEACHINT + BELONG + (1 + TEACHINT | SchoolID) is: {rmse}')

# Manual wall-clock timing of the whole cell (construction, fit and prediction).
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Computation time: {elapsed_time} seconds")

The RMSE for model 4 - PV1READ ~ PERFEED + TEACHINT + BELONG + (1 + TEACHINT | SchoolID) is: 94.7193720019732
Computation time: 126.01985669136047 seconds
CPU times: total: 30.6 s
Wall time: 2min 6s


In [10]:
# A wrapper class for the Bambi/PyMC models to be compatible with the Taweret framework
class BMBWrapper(BaseModel):
    """Adapter exposing a fitted Bambi model through the ``BaseModel`` interface.

    Parameters
    ----------
    model :
        Model object providing ``predict(idata, data=..., inplace=False)`` and
        ``set_priors(priors=...)`` (a ``bmb.Model`` in this notebook).
    idata :
        Inference data produced by fitting ``model``; passed back to
        ``model.predict`` on every evaluation.
    posterior_predictive : str
        Name of the variable to read from the ``posterior`` group of the
        prediction result (e.g. ``"PV1READ_mean"``).
    """

    def __init__(self, model, idata, posterior_predictive):
        self.model = model
        self.idata = idata
        self.posterior_predictive = posterior_predictive

    def evaluate(self, model_parameters):
        """Predict at the rows of ``model_parameters``.

        Despite its name, ``model_parameters`` is the data set handed to
        ``model.predict`` as ``data``; it must contain the predictors of the
        wrapped model.

        Returns
        -------
        tuple of np.ndarray
            ``(mean, std)``, each of shape ``(n, 1)``: the mean and the standard
            deviation of the selected posterior variable over the ``chain`` and
            ``draw`` dimensions.
        """
        post_pred = self.model.predict(
            self.idata, data=model_parameters, inplace=False
        ).posterior[self.posterior_predictive]
        mean = np.array(post_pred.mean(dim=["chain", "draw"])).reshape(-1, 1)
        std = np.sqrt(np.array(post_pred.var(dim=["chain", "draw"]))).reshape(-1, 1)
        return mean, std

    def log_likelihood_elementwise(self, x_exp, y_exp, y_err, model_params):
        """Gaussian log-likelihood of each observation given the model prediction.

        Parameters
        ----------
        x_exp :
            Unused; kept for interface compatibility. The prediction inputs are
            taken from ``model_params``.
        y_exp : array-like
            Observed values, one per row of ``model_params``.
        y_err : array-like or float
            Standard deviation of each observation (or one shared value).
        model_params :
            Data set forwarded to :meth:`evaluate`.

        Returns
        -------
        np.ndarray
            1-D array with ``log N(y_exp[i] | prediction[i], y_err[i]**2)``.
        """
        # Flatten everything so the (n, 1) prediction is paired with the
        # observations element by element instead of broadcasting to (n, n).
        y = self.evaluate(model_params)[0].ravel()
        y_exp = np.asarray(y_exp, dtype=float).ravel()
        y_err = np.asarray(y_err, dtype=float).ravel()

        # Logarithm of the normal density; previously the density itself was
        # returned, which is not a log-likelihood.
        return -0.5 * ((y - y_exp) / y_err) ** 2 \
            - 0.5 * np.log(2 * np.pi * y_err ** 2)

    def set_prior(self, prior_dict):
        """Forward ``prior_dict`` to the wrapped model's ``set_priors``."""
        self.model.set_priors(priors=prior_dict)


In [11]:
# Fitted (model, trace) pairs in model order; each is wrapped so that it exposes the
# posterior "PV1READ_mean" variable, and is keyed by its 1-based number as a string.
_fitted = [
    (model1, trace1),
    (model2, trace2),
    (model3, trace3),
    (model4, trace4),
]
models = {
    str(number): BMBWrapper(fitted_model, fitted_trace, "PV1READ_mean")
    for number, (fitted_model, fitted_trace) in enumerate(_fitted, start=1)
}


In [15]:
# Fit the BMM Model
# Location of the OpenBT executables. The recorded run of this cell failed with
# "FileNotFoundError: Cannot find openbt executables" for the hard-coded default,
# so the path can be overridden per machine through the OPENBT_PATH environment
# variable without editing the notebook.
OPENBT_PATH = os.environ.get("OPENBT_PATH", "//wsl.localhost/Ubuntu-22.04")

# Initialize the Trees class instance
mix = Trees(model_dict=models, local_openbt_path=OPENBT_PATH)

# Set prior information
mix.set_prior(k=2.5, ntree=30, overallnu=5, overallsd=0.01, inform_prior=False)

# Train the model
# NOTE(review): X is the full frame and therefore still contains the response column
# PV1READ (and SchoolID) -- confirm that the mixing weights are meant to see them.
fit = mix.train(
    X=PISA2018,
    y=np.array(PISA2018["PV1READ"]),
    ndpost=10000,
    nadapt=2000,
    nskip=2000,
    adaptevery=500,
    minnumbot=4,
)

Results stored in temporary path: C:\Users\EVANED~1\AppData\Local\Temp\openbtpy_r9529koz
Running model...


FileNotFoundError: Cannot find openbt executables. Please specify the path using the argument local_openbt_path in the constructor.