# Simple Growth Model
### Evan Edwards

- Simple growth model with set parameters using the following error metrics:
    - RMSE
    - R^2
    - Maximum Error
- Optimization (Work in Progress)

In [3]:
import matplotlib.pyplot as plt
import pandas as pd
import pymc as pm
import numpy as np
from sklearn.metrics import mean_squared_error, max_error, r2_score
from sklearn.model_selection import train_test_split
import arviz as az
import scipy as sp
import time
from scipy.optimize import minimize

In [4]:
ECLSK2011 = pd.read_csv('ECLSK2011.csv')
ECLSK2011 = ECLSK2011.dropna(axis=1, how='all')
missing_math_rows = ECLSK2011[ECLSK2011.groupby('CHILDID')['MATH'].transform(lambda x: any(x.isna()))]

ECLSK2011 = ECLSK2011.drop(missing_math_rows.index)



In [5]:
ECLSK2011_train, ECLSK2011_test = train_test_split(ECLSK2011, test_size=0.1)

In [6]:

# Level 1

#std_y_ti | pi_0i, pi_1i, pi_2i, sigma ~ normal(pi_0i + pi_1i  (X1)+ pi_2i (X2), sigma^2) 
# pi_0i | beta00, sigma00 ~ normal(beta00, sigma00^2)
# pi_1i | beta10, sigma01 ~ normal(beta10, sigma10^2)
# pi_2i | beta20, sigma02 ~ normal(beta20, sigma20^2)
# sigma^2 ~ inv_gamma(nu/2, nu * lambda/2)

# Level 2
# beta00 ~ N(0, tau00^2)
# beta10 ~ N(0, tau10^2)
# beta20 ~ N(0, tau20^2)

#sigma00 = sigma10 = sigma = .3
#tau00 = tau10 = tau20 = .4


In [7]:
'''
# Level 1 priors
sigma00 = sigma10 = sigma20 = 0.3

# Level 2 priors
tau00 = tau10 = tau20 = 0.4

nu = 3
lamb = 0.194
'''

# Level 1 priors
sigma00 = sigma10 = sigma20 = 0.3

# Level 2 priors
tau00 = tau10 = tau20 = 0.4

nu = 3
lamb = 0.194

std_y_ti_data = sp.stats.zscore(ECLSK2011_train['MATH'])
time_data = ECLSK2011_train['TIME']
start_time = time.time()
with pm.Model() as model:
    std_y_ti_data = pm.MutableData("std_y_ti_data", std_y_ti_data)
    time_data = pm.MutableData("time_data", time_data)
    

    beta00 = pm.Normal('beta00', mu=0, sigma=tau00**2)
    beta10 = pm.Normal('beta10', mu=0, sigma=tau10**2)
    beta20 = pm.Normal('beta20', mu=0, sigma=tau20**2)

    pi_0i = pm.Normal('pi_0i', mu=beta00, sigma=sigma00**2)
    pi_1i = pm.Normal('pi_1i', mu=beta10, sigma=sigma10**2)
    pi_2i = pm.Normal('pi_2i', mu=beta20, sigma=sigma20**2)
    
    sigma_sq = pm.InverseGamma('sigma_sq', alpha=nu/2, beta=nu*lamb/2)
    T_ti = time_data
    T_ti_hat = pm.math.mean(time_data)


    #Taylor Series
    X1 = T_ti - T_ti_hat
    X2 = (X1**2)/2

    std_y_ti = pm.Normal('std_y_ti', mu=pi_0i + pi_1i*(X1) + pi_2i*(X2), sigma=sigma_sq, observed=std_y_ti_data)

    trace = pm.sample(2000, tune=1000)

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Computation time: {elapsed_time} seconds")
pm.summary(trace,hdi_prob = 0.95)


Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta00, beta10, beta20, pi_0i, pi_1i, pi_2i, sigma_sq]


Sampling 4 chains for 1_000 tune and 2_000 draw iterations (4_000 + 8_000 draws total) took 89 seconds.


Computation time: 132.76073837280273 seconds


Unnamed: 0,mean,sd,hdi_2.5%,hdi_97.5%,mcse_mean,mcse_sd,ess_bulk,ess_tail,r_hat
beta00,0.02,0.078,-0.133,0.173,0.001,0.001,11160.0,6261.0,1.0
beta10,0.254,0.079,0.097,0.408,0.001,0.001,10224.0,6025.0,1.0
beta20,-0.006,0.077,-0.154,0.15,0.001,0.001,11534.0,6437.0,1.0
pi_0i,0.027,0.005,0.019,0.036,0.0,0.0,9310.0,6124.0,1.0
pi_1i,0.334,0.001,0.332,0.336,0.0,0.0,12003.0,6361.0,1.0
pi_2i,-0.008,0.001,-0.01,-0.006,0.0,0.0,9261.0,5986.0,1.0
sigma_sq,0.503,0.002,0.498,0.507,0.0,0.0,10838.0,5770.0,1.0


In [8]:
def RMSE(pred, true):
    return np.sqrt(np.sum(np.power(np.subtract(true, pred),2))/len(true))

In [9]:
with model:
    post_pred = pm.sample_posterior_predictive(trace, model=model)
is_pred = post_pred.posterior_predictive["std_y_ti"].mean(dim=["chain", "draw"])


is_hdi = az.hdi(post_pred.posterior_predictive, hdi_prob=0.95)


with model:
    pm.set_data({"std_y_ti_data": sp.stats.zscore(ECLSK2011_test['MATH']), "time_data": ECLSK2011_test['TIME']}) 
    post_pred = pm.sample_posterior_predictive(trace, model=model)
oos_pred = post_pred.posterior_predictive["std_y_ti"].mean(dim=["chain", "draw"])


oos_hdi = az.hdi(post_pred.posterior_predictive, hdi_prob=0.95)


Sampling: [std_y_ti]


Sampling: [std_y_ti]


In [11]:
print("Growth Model Summary:")
print(az.summary(trace, hdi_prob=0.95))
print("\nComputation time: ", elapsed_time)
print("In-Sample Predictions:")
print(f'The RMSE (IS) is: {RMSE(sp.stats.zscore(ECLSK2011_train["MATH"]), is_pred)}')
print(f'The Maximum Residual Error is: {max_error(sp.stats.zscore(ECLSK2011_train["MATH"]), is_pred)}')
print(f'The R^2 (IS) is: {r2_score(sp.stats.zscore(ECLSK2011_train["MATH"]), is_pred)}')
print(f"Mean upper and lower bounds of the 95% CI: {np.mean(np.array(is_hdi.to_array().sel(hdi='lower'))).astype(float)} - {np.mean(np.array(is_hdi.to_array().sel(hdi='higher'))).astype(float)}")

print("\nOut-Of-Sample Predictions:")
print(f'The RMSE (OOS) is: {RMSE(sp.stats.zscore(ECLSK2011_test["MATH"]), oos_pred)}')
print(f'The Maximum Residual Error is: {max_error(sp.stats.zscore(ECLSK2011_test["MATH"]), oos_pred)}')
print(f'The R^2 (OOS) is: {r2_score(sp.stats.zscore(ECLSK2011_test["MATH"]), oos_pred)}')
print(f"Mean upper and lower bounds of the 95% CI: {np.mean(np.array(oos_hdi.to_array().sel(hdi='lower'))).astype(float)} - {np.mean(np.array(oos_hdi.to_array().sel(hdi='higher'))).astype(float)}")

Growth Model Summary:
           mean     sd  hdi_2.5%  hdi_97.5%  mcse_mean  mcse_sd  ess_bulk  \
beta00    0.020  0.078    -0.133      0.173      0.001    0.001   11160.0   
beta10    0.254  0.079     0.097      0.408      0.001    0.001   10224.0   
beta20   -0.006  0.077    -0.154      0.150      0.001    0.001   11534.0   
pi_0i     0.027  0.005     0.019      0.036      0.000    0.000    9310.0   
pi_1i     0.334  0.001     0.332      0.336      0.000    0.000   12003.0   
pi_2i    -0.008  0.001    -0.010     -0.006      0.000    0.000    9261.0   
sigma_sq  0.503  0.002     0.498      0.507      0.000    0.000   10838.0   

          ess_tail  r_hat  
beta00      6261.0    1.0  
beta10      6025.0    1.0  
beta20      6437.0    1.0  
pi_0i       6124.0    1.0  
pi_1i       6361.0    1.0  
pi_2i       5986.0    1.0  
sigma_sq    5770.0    1.0  

Computation time:  132.76073837280273
In-Sample Predictions:
The RMSE (IS) is: 0.5027238694191412
The Maximum Residual Error is: 2.97542