In [None]:
import math

import gpytorch
import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [None]:
import sys
import os

# Make the project's helper directories importable from this notebook.
# The paths are resolved relative to the current working directory.
for py_file_location in ('../utils', '../models'):
    sys.path.append(os.path.abspath(py_file_location))

import count_utils
import count_models

## load STD data

In [None]:
# Load the STD dataset from a CSV file (path is relative to the working
# directory) and preview its first rows.
df_STD = pd.read_csv("data/STDs.csv")
df_STD.head()

In [None]:
# Number the rows 0..n-1 as a time index *before* removing incomplete rows, so
# the surviving rows keep their original position in the series.
df_STD["time"] = np.arange(df_STD.shape[0])
# Discard every row that has a missing value in any column.
df_STD = df_STD.dropna(axis=0, how="any")
df_STD.head()

In [None]:
# Quick look at the raw gonorrhea counts against the time index.
fig, ax = plt.subplots(figsize=(20, 5))
ax.scatter(x=df_STD["time"], y=df_STD["gonorrhea_count"])
plt.show()

# build a model for STD data


In [None]:
# Inputs are the integer time index; targets are the raw gonorrhea counts.
X_STD = torch.tensor(df_STD["time"].values)
Y_STD = torch.tensor(df_STD["gonorrhea_count"].values)

# Dense integer grid used for plotting predictions. It starts 5 steps before the
# first observed time and stops at (last observed time + 4): the upper bound of
# the range is exclusive, exactly as in the original range() call.
x_STD = torch.arange(int(X_STD.min()) - 5, int(X_STD.max()) + 5)

# Fixed random_state keeps the train/test partition reproducible across runs.
X_STD_train, X_STD_test, y_STD_train, y_STD_test = train_test_split(
    X_STD, Y_STD, test_size=0.33, random_state=42
)

# Transform the counts into per-point mean/variance targets for the LM+Gamma GP.
# NOTE(review): the exact transform (and the meaning of alpha_eps / counts) is
# defined in count_models.transform_y_Gamma_LM -- confirm there.
y_STD_train_mu, y_STD_train_var = count_models.transform_y_Gamma_LM(y_STD_train, alpha_eps=0.1, counts=1)
y_STD_test_mu, y_STD_test_var = count_models.transform_y_Gamma_LM(y_STD_test, alpha_eps=0.1, counts=1)

In [None]:
# Same scatter as before, drawn from the tensors that feed the models.
fig, ax = plt.subplots(figsize=(20, 5))
ax.scatter(x=X_STD, y=Y_STD)
plt.show()

In [None]:
# Build the LM+Gamma GP and its likelihood from the transformed training targets
# (mean and variance), using an RBF kernel with initial lengthscale 5.
# NOTE(review): the effect of learn_additional_noise / fixed_likelihood is defined
# in count_models.create_LM_Gamma_GP_model -- confirm there.
model, likelihood = count_models.create_LM_Gamma_GP_model(X_STD_train, y_STD_train_mu, y_STD_train_var,
                    kernel="RBF", init_lengthscale=5, learn_additional_noise=True, fixed_likelihood=False)

In [None]:
# Fit the model for 200 iterations at learning rate 0.1; the returned model and
# likelihood replace the untrained ones.
# NOTE(review): report_iter presumably controls how often progress is reported -- confirm.
model, likelihood = count_models.train_LM_Gamma_GP_model(X_STD_train, y_STD_train_mu, model, likelihood,
                                            num_iter=200, lr=0.1, report_iter=50)

In [None]:
# Get into evaluation (predictive posterior) mode
model.eval()
likelihood.eval()

# Predict on x_STD, the integer time grid that extends slightly beyond the
# observed time range (it is not a [0, 1] grid).
# Make predictions by feeding model through likelihood; gradients are disabled
# because the result is only used for plotting.
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(x_STD))

In [None]:
with torch.no_grad():
    # Plot in the space the GP was fit in (the transformed targets *_mu).
    f, ax = plt.subplots(1, 1, figsize=(10, 5))

    # Upper and lower confidence bounds of the predictive distribution
    lower, upper = observed_pred.confidence_region()

    # Each artist carries its own label. The previous positional legend listed
    # three names for four artists, so the test points were labelled "Mean" and
    # the mean curve "Confidence".
    ax.plot(X_STD_train.numpy(), y_STD_train_mu.numpy(), 'k*', label='Train data')
    ax.plot(X_STD_test.numpy(), y_STD_test_mu.numpy(), '*', color="orange", label='Test data')
    # Predictive mean as a blue line
    ax.plot(x_STD.numpy(), observed_pred.mean.numpy(), 'b', label='Mean')
    # Shade between the lower and upper confidence bounds
    ax.fill_between(x_STD.numpy(), lower.numpy(), upper.numpy(), alpha=0.5, label='Confidence')

    ax.legend()
    

In [None]:
# Exponentiate the predictive mean and confidence bounds so they can be drawn on
# the same axis as the raw counts.
# NOTE(review): this presumes the model's targets live in log space -- confirm
# against count_models.transform_y_Gamma_LM.
# The bounds are converted to NumPy explicitly instead of calling np.exp on
# torch tensors.
STD_mean = np.exp(observed_pred.mean.numpy())
lb_STD, ub_STD = (np.exp(bound.numpy()) for bound in observed_pred.confidence_region())

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.scatter(df_STD["time"], df_STD["gonorrhea_count"])
# Fixed y-range so exploding upper bounds do not squash the data.
plt.ylim(-0.5, 50000)

ax.plot(x_STD, STD_mean, lw=3, color='orange')
ax.plot(x_STD, ub_STD, lw=1, color='orange')
ax.plot(x_STD, lb_STD, lw=1, color='orange')
ax.fill_between(x_STD, lb_STD, ub_STD, alpha=0.3, color='orange')

plt.tight_layout()

# Uncomment to save the figure:
#plt.savefig('figures/STD_GP.pdf')

# plt.show() renders the figure; the previous argument-less plt.plot() drew nothing.
plt.show()

In [None]:
# Score the fitted LM+Gamma GP on the held-out split. The raw test counts
# (y_STD_test, not the transformed y_STD_test_mu) are passed in.
# NOTE(review): which metrics are computed, and how num_samples is used, is
# defined in count_models.evaluate_LM_Gamma_GP -- confirm there.
count_models.evaluate_LM_Gamma_GP(model, likelihood, X_STD_test, y_STD_test, 
                                  num_samples=1000, fixed_likelihood=False)

In [None]:
# Get into evaluation (predictive posterior) mode
# NOTE(review): this repeats the prediction made before the evaluation cell;
# presumably kept to re-plot after evaluation -- confirm it is still needed.
model.eval()
likelihood.eval()

# Predict on x_STD, the integer time grid that extends slightly beyond the
# observed time range (it is not a [0, 1] grid).
# Make predictions by feeding model through likelihood
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    observed_pred = likelihood(model(x_STD))

In [None]:
with torch.no_grad():
    # Plot in the space the GP was fit in (the transformed targets *_mu).
    f, ax = plt.subplots(1, 1, figsize=(10, 5))

    # Upper and lower confidence bounds of the predictive distribution
    lower, upper = observed_pred.confidence_region()

    # Each artist carries its own label. The previous positional legend listed
    # three names for four artists, so the test points were labelled "Mean" and
    # the mean curve "Confidence".
    ax.plot(X_STD_train.numpy(), y_STD_train_mu.numpy(), 'k*', label='Train data')
    ax.plot(X_STD_test.numpy(), y_STD_test_mu.numpy(), '*', color="orange", label='Test data')
    # Predictive mean as a blue line
    ax.plot(x_STD.numpy(), observed_pred.mean.numpy(), 'b', label='Mean')
    # Shade between the lower and upper confidence bounds
    ax.fill_between(x_STD.numpy(), lower.numpy(), upper.numpy(), alpha=0.5, label='Confidence')

    ax.legend()
    

In [None]:
# Exponentiate the predictive mean and confidence bounds so they can be drawn on
# the same axis as the raw counts.
# NOTE(review): this presumes the model's targets live in log space -- confirm
# against count_models.transform_y_Gamma_LM.
# STD_mean / lb_STD / ub_STD are reused by the comparison plots further down.
# The bounds are converted to NumPy explicitly instead of calling np.exp on
# torch tensors.
STD_mean = np.exp(observed_pred.mean.numpy())
lb_STD, ub_STD = (np.exp(bound.numpy()) for bound in observed_pred.confidence_region())

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.scatter(df_STD["time"], df_STD["gonorrhea_count"])
# Fixed y-range so exploding upper bounds do not squash the data.
plt.ylim(-0.5, 50000)

ax.plot(x_STD, STD_mean, lw=3, color='orange')
ax.plot(x_STD, ub_STD, lw=1, color='orange')
ax.plot(x_STD, lb_STD, lw=1, color='orange')
ax.fill_between(x_STD, lb_STD, ub_STD, alpha=0.3, color='orange')

plt.tight_layout()

# Uncomment to save the figure:
#plt.savefig('figures/STD_GP.pdf')

# plt.show() renders the figure; the previous argument-less plt.plot() drew nothing.
plt.show()

## Check an alternative: GP trained with stochastic variational inference (SVI)

In [None]:
# Mini-batch loaders over the raw (untransformed) counts for the SVI model.
# Training batches are shuffled; test batches keep their order so predictions
# line up with y_STD_test.
train_dataset = TensorDataset(X_STD_train, y_STD_train)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

test_dataset = TensorDataset(X_STD_test, y_STD_test)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [None]:
# Build the SVI GP and its likelihood with initial lengthscale 1.
# NOTE(review): 1000 inducing points are requested; check how
# count_models.create_SVIGP_model behaves if the training set is smaller than that.
model_SVI, likelihood_SVI = count_models.create_SVIGP_model(X_STD_train, init_lengthscale=1, num_inducing_points=1000)

In [None]:
# Train the SVI GP from the mini-batch loader for 200 iterations at learning rate 1.
# y_STD_train is passed alongside the loader.
# NOTE(review): presumably it is needed for the size of the training set -- confirm
# in count_models.train_SVIGP_model.
model_SVI, likelihood_SVI = count_models.train_SVIGP_model(train_loader, y_STD_train, model_SVI, likelihood_SVI,
                                            num_iter=200, lr=1, report_iter=50)

In [None]:
# Score the SVI GP on the held-out loader against the raw test counts.
# NOTE(review): the reported metrics are defined in count_models.evaluate_SVIGP.
count_models.evaluate_SVIGP(model_SVI, likelihood_SVI, test_loader, y_STD_test)

In [None]:
# Put the SVI model (not the LM+GP `model`) into evaluation mode before predicting.
model_SVI.eval()
likelihood_SVI.eval()

# Collect the predictive mean of every test batch, then concatenate once,
# instead of seeding the result with a dummy 0 and slicing it off afterwards.
batch_means = []
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        preds_SVI = model_SVI(x_batch)
        batch_means.append(preds_SVI.mean.cpu())
# An empty loader yields an empty tensor, as before.
means = torch.cat(batch_means) if batch_means else torch.tensor([])

In [None]:
# Put the SVI model (not the LM+GP `model`) into evaluation mode, then predict on
# the x_STD plotting grid. The latent GP output is used directly; it is not
# passed through likelihood_SVI.
model_SVI.eval()
likelihood_SVI.eval()
with torch.no_grad():
    preds_SVI = model_SVI(x_STD)

In [None]:
# The SVI GP was trained on raw counts, so its mean and confidence bounds are
# plotted without any back-transformation.
STD_mean_SVI = preds_SVI.mean
lb_STD_SVI, ub_STD_SVI = preds_SVI.confidence_region()


fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.scatter(df_STD["time"], df_STD["gonorrhea_count"], color="k")
plt.ylim(-0.5, 50000)

# LM+GP (orange) versus SVI GP (blue). Only the mean curves are labelled so the
# legend has one entry per model, matching the later comparison plots.
ax.plot(x_STD, STD_mean, lw=3, color='orange', label="LM+GP")
ax.plot(x_STD, ub_STD, lw=1, color='orange')
ax.plot(x_STD, lb_STD, lw=1, color='orange')
ax.fill_between(x_STD, lb_STD, ub_STD, alpha=0.3, color='orange')
ax.plot(x_STD, STD_mean_SVI, lw=3, color='blue', label="SVIGP")
ax.plot(x_STD, ub_STD_SVI, lw=1, color='blue')
ax.plot(x_STD, lb_STD_SVI, lw=1, color='blue')
ax.fill_between(x_STD, lb_STD_SVI, ub_STD_SVI, alpha=0.3, color='blue')
ax.legend()

plt.tight_layout()

# Uncomment to save the figure:
#plt.savefig('figures/STD_GP.pdf')

# plt.show() renders the figure; the previous argument-less plt.plot() drew nothing.
plt.show()

## Compare to SVIGP trained on log-transformed data

In [None]:
# Training loader whose targets are the log of the raw counts.
# NOTE(review): torch.log gives -inf for a count of 0 -- confirm the data has no zeros.
train_dataset_log = TensorDataset(X_STD_train, torch.log(y_STD_train))
train_loader_log = DataLoader(train_dataset_log, batch_size=1024, shuffle=True)

# Evaluation reuses test_dataset / test_loader from the SVI section above: they
# hold the raw counts and were previously rebuilt here with identical arguments.

In [None]:
# Build and train a second SVI GP, this time on log-transformed counts
# (500 iterations at learning rate 1).
# NOTE(review): same inducing-point caveat as the raw-count SVI model -- 1000
# points may exceed the number of training rows.
model_SVI_log, likelihood_SVI_log = count_models.create_SVIGP_model(X_STD_train, init_lengthscale=1,
                                                                   num_inducing_points=1000)
model_SVI_log, likelihood_SVI_log = count_models.train_SVIGP_model(train_loader_log, torch.log(y_STD_train), 
                                                          model_SVI_log, likelihood_SVI_log,
                                            num_iter=500, lr=1, report_iter=50)

In [None]:
# Score the log-target SVI GP against the raw test counts.
# NOTE(review): presumably evaluate_SVIGP_log maps predictions back to count
# scale itself -- confirm in count_models.
count_models.evaluate_SVIGP_log(model_SVI_log, likelihood_SVI_log, test_loader, y_STD_test)

In [None]:
# Put the log-target SVI model (not the LM+GP `model`) into evaluation mode,
# then predict on the x_STD plotting grid.
model_SVI_log.eval()
likelihood_SVI_log.eval()
with torch.no_grad():
    preds_SVI_log = model_SVI_log(x_STD)

In [None]:
# Use the predictions of the log-target model (preds_SVI_log). The previous code
# exponentiated preds_SVI, i.e. the output of the raw-count SVI model.
# torch.exp maps the log-space mean and bounds back to the count scale.
STD_mean_SVI_log = torch.exp(preds_SVI_log.mean)
lb_STD_SVI_log, ub_STD_SVI_log = preds_SVI_log.confidence_region()
lb_STD_SVI_log, ub_STD_SVI_log = torch.exp(lb_STD_SVI_log), torch.exp(ub_STD_SVI_log)

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.scatter(df_STD["time"], df_STD["gonorrhea_count"], color="k")
plt.ylim(-0.5, 50000)

# LM+GP (orange) versus log-target SVI GP (blue). Only the mean curves are
# labelled so the legend has one entry per model.
ax.plot(x_STD, STD_mean, lw=3, color='orange', label="LM+GP")
ax.plot(x_STD, ub_STD, lw=1, color='orange')
ax.plot(x_STD, lb_STD, lw=1, color='orange')
ax.fill_between(x_STD, lb_STD, ub_STD, alpha=0.3, color='orange')
ax.plot(x_STD, STD_mean_SVI_log, lw=3, color='blue', label="SVIGP log")
ax.plot(x_STD, ub_STD_SVI_log, lw=1, color='blue')
ax.plot(x_STD, lb_STD_SVI_log, lw=1, color='blue')
ax.fill_between(x_STD, lb_STD_SVI_log, ub_STD_SVI_log, alpha=0.3, color='blue')
ax.legend()

plt.tight_layout()

# Uncomment to save the figure:
#plt.savefig('figures/STD_GP.pdf')

# plt.show() renders the figure; the previous argument-less plt.plot() drew nothing.
plt.show()

## compare to standard GP

In [None]:
# Baseline: an exact GP on the raw counts, with initial lengthscale 10.
model_exact, likelihood_exact = count_models.create_ExactGP_model(X_STD_train, y_STD_train, init_lengthscale=10)

In [None]:
# Train the exact GP for 2000 iterations at learning rate 1.
# NOTE(review): report_iter presumably sets how often progress is reported -- confirm.
model_exact, likelihood_exact = count_models.train_ExactGP_model(X_STD_train, y_STD_train, model_exact, likelihood_exact,
                                                    num_iter=2000, lr=1, report_iter=200)

In [None]:
# Score the exact GP on the held-out split.
# NOTE(review): the reported metrics are defined in count_models.evaluate_ExactGP.
count_models.evaluate_ExactGP(model_exact, likelihood_exact, X_STD_test, y_STD_test)

In [None]:
# Get into evaluation (predictive posterior) mode
model_exact.eval()
likelihood_exact.eval()

# Predict on the x_STD plotting grid and pass the result through the exact GP's
# own likelihood. The previous code used `likelihood`, which belongs to the
# LM+GP model, so a different model's observation noise was applied.
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    preds_exact = likelihood_exact(model_exact(x_STD))

In [None]:
# The exact GP was trained on raw counts, so its mean and confidence bounds are
# plotted without any back-transformation.
STD_mean_exact = preds_exact.mean
lb_STD_exact, ub_STD_exact = preds_exact.confidence_region()


fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.scatter(df_STD["time"], df_STD["gonorrhea_count"], color="k")
plt.ylim(-0.5, 50000)

# LM+GP (orange) versus exact GP (blue); only the mean curves carry legend labels.
ax.plot(x_STD, STD_mean, lw=3, color='orange', label="LM+GP")
ax.plot(x_STD, ub_STD, lw=1, color='orange')
ax.plot(x_STD, lb_STD, lw=1, color='orange')
ax.fill_between(x_STD, lb_STD, ub_STD, alpha=0.3, color='orange')
ax.plot(x_STD, STD_mean_exact, lw=3, color='blue', label="exact GP")
ax.plot(x_STD, ub_STD_exact, lw=1, color='blue')
ax.plot(x_STD, lb_STD_exact, lw=1, color='blue')
ax.fill_between(x_STD, lb_STD_exact, ub_STD_exact, alpha=0.3, color='blue')
ax.legend()

plt.tight_layout()

# Uncomment to save the figure:
#plt.savefig('figures/STD_GP.pdf')

# plt.show() renders the figure; the previous argument-less plt.plot() drew nothing.
plt.show()

## Test exact GP on log-transformed data

In [None]:
# Exact GP on log-transformed counts: build with initial lengthscale 10, then
# train for 500 iterations at learning rate 0.1.
# NOTE(review): torch.log gives -inf for a count of 0 -- confirm the data has no zeros.
model_exact_log, likelihood_exact_log = count_models.create_ExactGP_model(X_STD_train, torch.log(y_STD_train), init_lengthscale=10)
model_exact_log, likelihood_exact_log = count_models.train_ExactGP_model(X_STD_train, torch.log(y_STD_train), model_exact_log, 
                                                    likelihood_exact_log,
                                                    num_iter=500, lr=0.1, report_iter=50)

In [None]:
# Score the log-target exact GP against the raw test counts.
# NOTE(review): presumably evaluate_ExactGP_log maps predictions back to count
# scale itself -- confirm in count_models.
count_models.evaluate_ExactGP_log(model_exact_log, likelihood_exact_log, X_STD_test, y_STD_test)

In [None]:
# Get into evaluation (predictive posterior) mode
model_exact_log.eval()
likelihood_exact_log.eval()

# Predict on the x_STD plotting grid and pass the result through this model's own
# likelihood. The previous code used `likelihood`, which belongs to the LM+GP
# model, so a different model's observation noise was applied.
with torch.no_grad(), gpytorch.settings.fast_pred_var():
    preds_exact_log = likelihood_exact_log(model_exact_log(x_STD))

# The model was trained on log counts; torch.exp maps the mean and bounds back to
# the count scale.
STD_mean_exact_log = torch.exp(preds_exact_log.mean)
lb_STD_exact_log, ub_STD_exact_log = preds_exact_log.confidence_region()
lb_STD_exact_log, ub_STD_exact_log = torch.exp(lb_STD_exact_log), torch.exp(ub_STD_exact_log)

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.scatter(df_STD["time"], df_STD["gonorrhea_count"], color="k")
plt.ylim(-0.5, 50000)

# LM+GP (orange) versus log-target exact GP (blue); only the mean curves carry
# legend labels.
ax.plot(x_STD, STD_mean, lw=3, color='orange', label="LM+GP")
ax.plot(x_STD, ub_STD, lw=1, color='orange')
ax.plot(x_STD, lb_STD, lw=1, color='orange')
ax.fill_between(x_STD, lb_STD, ub_STD, alpha=0.3, color='orange')
ax.plot(x_STD, STD_mean_exact_log, lw=3, color='blue', label="exact GP log")
ax.plot(x_STD, ub_STD_exact_log, lw=1, color='blue')
ax.plot(x_STD, lb_STD_exact_log, lw=1, color='blue')
ax.fill_between(x_STD, lb_STD_exact_log, ub_STD_exact_log, alpha=0.3, color='blue')
ax.legend()

plt.tight_layout()

# Uncomment to save the figure:
#plt.savefig('figures/STD_GP.pdf')

# plt.show() renders the figure; the previous argument-less plt.plot() drew nothing.
plt.show()

# selecting hyperparameters with cross validation

In [None]:
# Candidate initial lengthscales, shared by every cross-validation run below.
LENGTHSCALES = [1, 2, 5, 10, 20, 50, 100, 200, 500]

In [None]:
# Number of inducing points passed to every cross-validation run in this section.
NUM_INDUCING_POINTS=200
# Cross-validate the initial lengthscale for the exact GP on raw counts
# (2000 iterations at learning rate 1, the same settings as the exact GP trained above).
# NOTE(review): the structure of `res` and the meaning of max_test_size are
# defined in count_models.select_init_lengthscale_with_CV_Gamma -- confirm there.
res = count_models.select_init_lengthscale_with_CV_Gamma(X_STD_train, y_STD_train, mode="ExactGP", num_inducing_points=NUM_INDUCING_POINTS, 
                                    learn_noise=True, num_iter=2000, lr=1,
                                    lengthscales=LENGTHSCALES, max_test_size=1000)

# Show the raw results, then plot them with the project helper.
print(res)    
count_utils.plot_res(res)

In [None]:
# Cross-validate the initial lengthscale with mode="ExactGP_log"
# (200 iterations at learning rate 0.1). `res` is overwritten on each run.
res = count_models.select_init_lengthscale_with_CV_Gamma(X_STD_train, y_STD_train, mode="ExactGP_log", num_inducing_points=NUM_INDUCING_POINTS, 
                                    learn_noise=True, num_iter=200, lr=0.1,
                                    lengthscales=LENGTHSCALES, max_test_size=1000)

# Show the raw results, then plot them with the project helper.
print(res)    
count_utils.plot_res(res)

In [None]:
# Cross-validate the initial lengthscale with mode="LMGP_gamma".
# NOTE(review): this run passes fixed_likelihood=True, whereas the LM+GP model
# trained earlier in the notebook used fixed_likelihood=False -- confirm the
# mismatch is intended.
res = count_models.select_init_lengthscale_with_CV_Gamma(X_STD_train, y_STD_train, mode="LMGP_gamma", num_inducing_points=NUM_INDUCING_POINTS, 
                                    learn_noise=True, num_iter=200, lr=0.1, fixed_likelihood=True,
                                    lengthscales=LENGTHSCALES, max_test_size=1000)

# Show the raw results, then plot them with the project helper.
print(res)    
count_utils.plot_res(res)

In [None]:
# Cross-validate the initial lengthscale with mode="SVIGP".
# NOTE(review): lr=0.1 here, whereas the SVI model trained earlier used lr=1 -- confirm.
res = count_models.select_init_lengthscale_with_CV_Gamma(X_STD_train, y_STD_train, mode="SVIGP", num_inducing_points=NUM_INDUCING_POINTS, 
                                    learn_noise=True, num_iter=200, lr=0.1,
                                    lengthscales=LENGTHSCALES, max_test_size=1000)

# Show the raw results, then plot them with the project helper.
print(res)    
count_utils.plot_res(res)

In [None]:
# Cross-validate the initial lengthscale with mode="SVIGP_log".
# NOTE(review): lr=0.1 and 200 iterations here, whereas the log-target SVI model
# trained earlier used lr=1 and 500 iterations -- confirm.
res = count_models.select_init_lengthscale_with_CV_Gamma(X_STD_train, y_STD_train, mode="SVIGP_log", num_inducing_points=NUM_INDUCING_POINTS, 
                                    learn_noise=True, num_iter=200, lr=0.1,
                                    lengthscales=LENGTHSCALES, max_test_size=1000)

# Show the raw results, then plot them with the project helper.
print(res)    
count_utils.plot_res(res)