In [28]:
import time
import pandas as pd
import numpy as np 
import math
import torch
import gpytorch
from matplotlib import pyplot as plt
from torch import device as device_

# Make plots inline
%matplotlib inline

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
# (Assign device = 'cpu' by hand here to force CPU execution.)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [29]:
# Load the Stata extract. convert_categoricals=False asks pandas not to turn
# value-labelled columns into Categoricals when reading the file.
df = pd.read_stata('data/gruber_subset.dta', convert_categoricals=False)

# Report (rows, columns). The former mid-cell `df.describe()` was dropped: only
# the last expression of a cell is displayed, so its result was discarded.
print(df.shape)


#df.describe()
#train_x = torch.tensor(df[['lat', 'lon']].values).to(device)
#train_y = torch.tensor(df.ddd.values).to(device)


(139568, 395)


Define the model: an exact GP with a constant mean and a scaled RBF kernel

In [30]:
# Simplest GP flavour offered by gpytorch: exact inference.
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a scaled RBF covariance.

    Args:
        train_x: training inputs, forwarded to ``gpytorch.models.ExactGP``.
        train_y: training targets, forwarded to ``gpytorch.models.ExactGP``.
        likelihood: likelihood object, forwarded to ``gpytorch.models.ExactGP``.
    """

    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Build a MultivariateNormal from the mean and covariance modules at ``x``."""
        mean_at_x = self.mean_module(x)
        covar_at_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_at_x, covar_at_x)

Whiten the necessary variables: fit a GP on (lat, lon) for each variable and keep the residuals

In [31]:
# Number of rows per chunk; each chunk is fitted independently further down.
CHUNK_SIZE = 10000

# Split df into consecutive row blocks of CHUNK_SIZE rows; the last block holds
# whatever remains when len(df) is not a multiple of CHUNK_SIZE.
chunks = [
    df.iloc[row_start:row_start + CHUNK_SIZE, :]
    for row_start in range(0, len(df), CHUNK_SIZE)
]


In [32]:
# All tensors are built as float32 directly on `device` (replaces the legacy
# torch.cuda.FloatTensor / torch.FloatTensor type objects).
dtype = torch.float32


def gp_residuals(train_x, train_y, training_iterations=30, lr=0.1):
    """Fit an ExactGPModel of ``train_y`` on ``train_x`` and return the residuals.

    A fresh GaussianLikelihood and ExactGPModel are created, the hyperparameters
    are tuned with Adam on the negative exact marginal log likelihood, and the
    fitted model is then evaluated at the training inputs themselves.

    Args:
        train_x: input tensor (the caller passes the 'lat'/'lon' columns).
        train_y: target tensor for one variable.
        training_iterations: number of Adam steps (default 30).
        lr: Adam learning rate (default 0.1).

    Returns:
        numpy array holding ``train_y`` minus the predictive mean at ``train_x``.
    """
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(train_x, train_y, likelihood).to(device)

    # Training mode for hyperparameter optimisation.
    model.train()
    likelihood.train()

    # model.parameters() includes the GaussianLikelihood parameters.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # "Loss" for GPs - the marginal log likelihood.
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    for _ in range(training_iterations):
        optimizer.zero_grad()
        loss = -mll(model(train_x), train_y)
        loss.backward()
        optimizer.step()

    # Switch to evaluation mode before predicting.
    model.eval()
    likelihood.eval()

    # Release cached GPU blocks left over from training before predicting.
    torch.cuda.empty_cache()

    with torch.no_grad(), gpytorch.settings.fast_pred_var():
        y_hat = likelihood(model(train_x)).mean
        return (train_y - y_hat).cpu().numpy()


# Inputs are always latitude and longitude; every other column gets whitened.
target = ['lat', 'lon']
consider = [col for col in df.columns if col not in target]

for chunk_no, chunk in enumerate(chunks, start=1):

    start = time.time()  # time the whole chunk

    # Empty frame with the same index and columns as the chunk.
    # NOTE(review): only the `consider` columns are assigned below, so 'lat' and
    # 'lon' stay as reindex_like left them (missing) in the written file --
    # confirm that downstream code does not need the coordinates.
    residuals = pd.DataFrame().reindex_like(chunk)

    train_x = torch.tensor(chunk[target].values, dtype=dtype, device=device)

    for col in consider:
        train_y = torch.tensor(chunk[col].values, dtype=dtype, device=device)
        residuals[col] = gp_residuals(train_x, train_y)

    print(f"Done with Chunk: {chunk_no} in {(time.time()-start)/60.0} minutes")

    residuals.to_stata(f'data/whitened_chunk_{chunk_no}.dta', write_index=False)
            
            

Done with Chunk: 1 in 42.92758235534032 minutes
Done with Chunk: 2 in 38.64309225877126 minutes
Done with Chunk: 3 in 41.91689244111379 minutes
Done with Chunk: 4 in 42.33911789655686 minutes
Done with Chunk: 5 in 41.45989835659663 minutes
Done with Chunk: 6 in 42.84209967851639 minutes
Done with Chunk: 7 in 40.658673950036366 minutes
Done with Chunk: 8 in 43.39113495747248 minutes
Done with Chunk: 9 in 39.94662171999614 minutes
Done with Chunk: 10 in 40.6848464290301 minutes
Done with Chunk: 11 in 40.79162955681483 minutes
Done with Chunk: 12 in 42.59656974871953 minutes
Done with Chunk: 13 in 41.624030029773714 minutes
Done with Chunk: 14 in 37.437859416007996 minutes
