# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Data-Import-and-Preprocessing" data-toc-modified-id="Data-Import-and-Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Import and Preprocessing</a></div><div class="lev2 toc-item"><a href="#Distances" data-toc-modified-id="Distances-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Distances</a></div><div class="lev1 toc-item"><a href="#Modifications-of-the-spatiotemporal-kernel" data-toc-modified-id="Modifications-of-the-spatiotemporal-kernel-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Modifications of the spatiotemporal kernel</a></div><div class="lev2 toc-item"><a href="#Allowing-for-mean-variation" data-toc-modified-id="Allowing-for-mean-variation-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Allowing for mean variation</a></div><div class="lev1 toc-item"><a href="#Adding-a-diurnal-component" data-toc-modified-id="Adding-a-diurnal-component-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Adding a diurnal component</a></div>

In [2]:
using TimeSeries
using DataFrames
using GaussianProcesses
using GaussianProcesses: Mean, Kernel, evaluate, metric, IsotropicData, VecF64
using GaussianProcesses: Stationary, KernelData, MatF64
import GaussianProcesses: optimize!, get_optim_target, cov, grad_slice!
import GaussianProcesses: num_params, set_params!, get_params, update_mll!, update_mll_and_dmll!
import GaussianProcesses: get_param_names, cov!, addcov!, multcov!
import Proj4
using Optim
using Distances
;

In [3]:
# Plotting setup: bind PyPlot to the short alias `plt` and configure
# matplotlib defaults for high-resolution, LaTeX-typeset, serif figures.
import PyPlot; plt=PyPlot
using LaTeXStrings
# Same resolution on screen and in saved files.
plt.rc("figure", dpi=300.0)
# Optional fixed figure size, currently disabled:
# plt.rc("figure", figsize=(6,4))
plt.rc("savefig", dpi=300.0)
# Typeset figure text with LaTeX.
plt.rc("text", usetex=true)
plt.rc("font", family="serif")
plt.rc("font", serif="Palatino")
# Trailing `;` suppresses the cell's output in the notebook.
;

# Data Import and Preprocessing

In [4]:
# Load the project's preprocessing helpers. The recorded output shows that the
# last definition in this file is `test_data`.
# NOTE(review): presumably this is also where `read_isdList` and
# `read_Stations` (used below) come from — confirm.
include("src/preprocessing.jl")

test_data (generic function with 1 method)

In [5]:
# Load the TempModel module; later cells call `TempModel.GPRealisations`.
# NOTE(review): the recorded output for this cell is a LoadError
# (`UndefVarError: Mamba not defined`, raised while loading
# src/stan_impute.jl from src/TempModel.jl). Confirm that the module loads
# cleanly in a fresh session before relying on the cells below.
include("src/TempModel.jl")

LoadError: LoadError: LoadError: UndefVarError: Mamba not defined
while loading /Users/imolk/Documents/Harvard/Natesh/temperature_model/src/stan_impute.jl, in expression starting on line 480
while loading /Users/imolk/Documents/Harvard/Natesh/temperature_model/src/TempModel.jl, in expression starting on line 9

In [6]:
# Read the station inventory (one row per station, with identifiers, lat/lon,
# elevation and projected X_PRJ/Y_PRJ coordinates, per the displayed columns)
# and preview its first five rows.
isdList=read_isdList()
isdList[1:5,:]

Unnamed: 0,USAF,WBAN,NAME,CTRY,STATE,ICAO,LAT,LON,ELEV,BEGIN,END,X_PRJ,Y_PRJ
1,10010,99999,JAN MAYEN(NOR-NAVY),NO,,ENJA,70.933,-8.667,9.0,1931,2015,4554500.0,6113440.0
2,10060,99999,EDGEOYA,NO,,,78.25,22.817,14.0,1973,2015,4049820.0,7556400.0
3,10070,99999,NY-ALESUND,SV,,,78.917,11.933,7.7,1973,2015,3867800.0,7265490.0
4,10080,99999,LONGYEAR,SV,,ENSB,78.246,15.466,26.8,1975,2015,3997050.0,7336690.0
5,10090,99999,KARL XII OYA,SV,,,80.65,25.0,5.0,1955,2015,3692590.0,7685450.0


In [7]:
# Keep only the stations whose USAF identifier is one of the four listed here
# (the displayed result shows four Iowa airports), then display the subset.
isdSubset=isdList[[(usaf in (725450,725460,725480,725485)) for usaf in isdList[:USAF].values],:]
isdSubset

Unnamed: 0,USAF,WBAN,NAME,CTRY,STATE,ICAO,LAT,LON,ELEV,BEGIN,END,X_PRJ,Y_PRJ
1,725450,14990,THE EASTERN IOWA AIRPORT,US,IA,KCID,41.883,-91.717,264.6,1973,2015,1647990.0,1044100.0
2,725460,14933,DES MOINES INTERNATIONAL AIRPORT,US,IA,KDSM,41.534,-93.653,291.7,1973,2015,1487230.0,1003790.0
3,725480,94910,WATERLOO MUNICIPAL AIRPORT,US,IA,KALO,42.554,-92.401,264.6,1960,2015,1590250.0,1117660.0
4,725485,14940,MASON CITY MUNICIPAL ARPT,US,IA,KMCW,43.154,-93.327,373.4,1973,2015,1514070.0,1183740.0


In [8]:
# Read the observations for the selected stations into one table and preview
# the first five rows. The `station` column is used further down as a row
# index into `isdSubset`, and `ts_hours` as the time coordinate.
hourly_cat=read_Stations(isdSubset)
hourly_cat[1:5,:]

Unnamed: 0,year,month,day,hour,min,seconds,temp,ts,station,ts_hours
1,2015,1,1,0,52,0,-7.8,2015-01-01T00:52:00,1,0.866667
2,2015,1,1,1,52,0,-8.3,2015-01-01T01:52:00,1,1.86667
3,2015,1,1,2,52,0,-8.3,2015-01-01T02:52:00,1,2.86667
4,2015,1,1,3,52,0,-9.4,2015-01-01T03:52:00,1,3.86667
5,2015,1,1,4,52,0,-9.4,2015-01-01T04:52:00,1,4.86667


## Distances

To get distances between stations, we can either use a function that computes distances on a sphere, or we can first project the coordinates onto a Euclidean plane and then compute ordinary Euclidean distances. I'll do it both ways to check that they're consistent (equal up to a multiplicative constant), and then use Euclidean distances for convenience.

In [9]:
# http://www.johndcook.com/blog/python_longitude_latitude/
"""
    distance_on_unit_sphere(lat1, long1, lat2, long2)

Return the great-circle arc length, in radians, between two points on the unit
sphere given by their latitude and longitude in degrees.

Multiply the result by the radius of the sphere (e.g. the radius of the earth
in your favorite set of units) to get a length.
"""
function distance_on_unit_sphere(lat1, long1, lat2, long2)
    # Convert latitude and longitude to spherical coordinates in radians.
    degrees_to_radians = π/180.0

    # phi = 90 - latitude
    phi1 = (90.0 - lat1)*degrees_to_radians
    phi2 = (90.0 - lat2)*degrees_to_radians

    # theta = longitude
    theta1 = long1*degrees_to_radians
    theta2 = long2*degrees_to_radians

    # For two locations in spherical coordinates
    # (1, theta, phi) and (1, theta', phi')
    # cosine( arc length ) =
    #    sin phi sin phi' cos(theta-theta') + cos phi cos phi'
    cosangle = (sin(phi1)*sin(phi2)*cos(theta1 - theta2) +
           cos(phi1)*cos(phi2))

    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # for identical or antipodal points, which would make `acos` throw a
    # DomainError; clamp it back into the valid domain first.
    arc = acos(clamp(cosangle, -one(cosangle), one(cosangle)))

    return arc
end

distance_on_unit_sphere (generic function with 1 method)

In [10]:
# Symmetric matrix of unit-sphere arc lengths between every pair of stations.
# The diagonal keeps the zeros it was initialised with.
numstations = nrow(isdSubset)
pairwiseSphere = zeros(numstations, numstations)
# Visit each unordered pair once (strict lower triangle) and mirror the value.
for i in 1:numstations, j in 1:(i-1)
    arc_ij = distance_on_unit_sphere(
        get(isdSubset[i,:LAT]), get(isdSubset[i,:LON]),
        get(isdSubset[j,:LAT]), get(isdSubset[j,:LON]))
    pairwiseSphere[i,j] = arc_ij
    pairwiseSphere[j,i] = arc_ij
end
pairwiseSphere

4×4 Array{Float64,2}:
 0.0        0.0259496  0.0146736  0.0303475
 0.0259496  0.0        0.024088   0.0285853
 0.0146736  0.024088   0.0        0.0158124
 0.0303475  0.0285853  0.0158124  0.0      

In [11]:
# Pairwise Euclidean distances between stations in projected coordinates.
# The (station × 2) coordinate matrix is transposed so that each column holds
# one station's (X_PRJ, Y_PRJ) pair.
pairwiseEuclid=pairwise(Euclidean(), Matrix(isdSubset[[:X_PRJ,:Y_PRJ]])')

4×4 Array{Float64,2}:
      0.0        165736.0        93510.4        1.93474e5
 165736.0             0.0            1.53559e5  1.81942e5
  93510.4             1.53559e5      0.0        1.00846e5
      1.93474e5       1.81942e5      1.00846e5  0.0      

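Ratio of the two distance matrices: close enough to a constant! The off-diagonal entries are all about 6.37e6, which is roughly the Earth's radius in meters, as expected when dividing projected distances in meters by arc lengths in radians on the unit sphere. (The diagonal is NaN because it is 0/0.)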

In [12]:
# Element-wise ratio of projected distance to unit-sphere arc length.
# Both matrices have a zero diagonal, so the diagonal of the ratio is 0/0 = NaN.
pairwiseEuclid ./ pairwiseSphere

4×4 Array{Float64,2}:
 NaN            6.38684e6    6.37271e6    6.37527e6
   6.38684e6  NaN            6.37493e6    6.36489e6
   6.37271e6    6.37493e6  NaN            6.37765e6
   6.37527e6    6.36489e6    6.37765e6  NaN        

# Modifications of the spatiotemporal kernel

We're going to simplify things drastically.

In [13]:
# Squared-exponential kernel used for the time dimension. Both arguments are
# on the log scale (log length-scale, then log signal standard deviation), so
# this starts at length-scale 1 and standard deviation 1.
k_time = SEIso(0.0,0.0)

Type: GaussianProcesses.SEIso, Params: [0.0,0.0]


## Allowing for mean variation

In [14]:
# Spatial kernel: initial length-scale 2e5 (in the units of X_PRJ/Y_PRJ) and
# unit signal standard deviation.
k_spatial = SEIso(log(2*10^5), log(1.0))
# Kernel for station-to-station variation in mean temperature: length-scale
# 1e4 and standard deviation 10. It is wrapped in `fix(...)` when the
# spatiotemporal kernel is built, so these values are not optimised.
k_means = SEIso(log(10^4), log(10.0))

Type: GaussianProcesses.SEIso, Params: [9.21034,2.30259]


In [15]:
# Spatiotemporal kernel, version 1:
#   (time kernel on input dimension 1) * (spatial kernel on dimensions 2,3)
#   + a fixed spatial kernel on dimensions 2,3 for per-location mean offsets.
# The fixed term shows up as a FixedKern with no free parameters in the output.
k_spatiotemporal_1 = Masked(k_time, [1]) * Masked(k_spatial, [2,3]) + 
    fix(Masked(k_means, [2,3]))

Type: GaussianProcesses.SumKernel
  Type: GaussianProcesses.ProdKernel
    Type: GaussianProcesses.Masked{GaussianProcesses.SEIso}, Params: [0.0,0.0]
    Type: GaussianProcesses.Masked{GaussianProcesses.SEIso}, Params: [12.2061,0.0]
  Type: GaussianProcesses.FixedKern, Params: Float64[]


In [16]:
# Fit the hyperparameters of `k_spatiotemporal_1` jointly over consecutive
# time windows of the data. Every name assigned here is a notebook global;
# `reals` and `opt_out` are read by later cells.
begin
    global opt_out
    k_spatiotemporal = k_spatiotemporal_1
    chunks=GP[]
    # Window width in the units of `ts_hours`: 24*10 hours, i.e. ten days.
    chunk_width=24*10
    tstart=0.0
    tend=tstart+chunk_width
    # Running total of observations assigned to a window (not used further
    # inside this cell).
    nobsv=0
    while tstart < get(maximum(hourly_cat[:ts_hours]))
        # Rows whose timestamp falls in the half-open window [tstart, tend).
        in_chunk=(tstart .<= hourly_cat[:ts_hours].values) & (hourly_cat[:ts_hours].values .< tend)
        hourly_chunk = hourly_cat[in_chunk,:]
        nobsv_chunk = sum(in_chunk)
        nobsv += nobsv_chunk

        # Look up each observation's projected station coordinates, using the
        # `station` column as a row index into `isdSubset`.
        chunk_X_PRJ = isdSubset[:X_PRJ].values[hourly_chunk[:station].values]
        chunk_Y_PRJ = isdSubset[:Y_PRJ].values[hourly_chunk[:station].values]
        # Design matrix with columns (time, X, Y); these are the dimensions
        # 1 and 2,3 that the Masked kernels select.
        chunk_X = [hourly_chunk[:ts_hours].values chunk_X_PRJ chunk_Y_PRJ]

        y = hourly_chunk[:temp].values
        # One GP per window: transposed inputs, a constant mean set to the
        # window's sample mean, and the *same* kernel object for every window.
        # NOTE(review): the trailing 0.0 is presumably the initial log-noise
        # (the optimiser's starting point begins with 0.0) — confirm.
        chunk = GP(chunk_X', y, MeanConst(mean(y)), k_spatiotemporal, 0.0)
        push!(chunks, chunk)

        tstart=tend
        tend+=chunk_width
    end
    # Bundle the per-window GPs so they are evaluated and optimised together.
    reals = TempModel.GPRealisations(chunks)
    # Print the gradient at the starting point as a sanity check.
    # NOTE(review): `mean=false` presumably leaves the mean parameters out of
    # the gradient/optimisation — confirm against TempModel.
    update_mll_and_dmll!(reals, mean=false)
    println(reals.dmLL)
    # The recorded trace shows a PosDefException being printed during the
    # first iteration, after which the optimisation carries on. The kernel
    # cell further down displays k_time/k_spatial with the fitted values, so
    # the shared kernel objects are updated in place by this call.
    @time opt_out=optimize!(reals, mean=false, show_trace=true, x_tol=1e-4, f_tol=1e-4)
end

[-2700.87,54918.2,1.01883e5,10177.6,1.01883e5]
Iter     Function value   Gradient norm 
     0     1.271155e+05     1.018831e+05
Base.LinAlg.PosDefException(3)
     1     8.151657e+04     2.495166e+04
     2     5.754880e+04     7.809512e+03
     3     5.697336e+04     4.002414e+03
     4     5.629485e+04     4.606744e+03
     5     5.579323e+04     3.444513e+03
     6     5.563954e+04     1.178332e+03
     7     5.561938e+04     3.955355e+02
     8     5.561500e+04     1.234359e+02
434.419816 seconds (9.71 M allocations: 47.524 GB, 2.56% gc time)


Results of Optimization Algorithm
 * Algorithm: Conjugate Gradient
 * Starting Point: [0.0,0.0,0.0,12.206072645530174,0.0]
 * Minimizer: [-0.8206663657481988,0.9954652395759197, ...]
 * Minimum: 5.561500e+04
 * Iterations: 8
 * Convergence: true
   * |x - x'| < 1.0e-04: false
   * |f(x) - f(x')| / |f(x)| < 1.0e-04: true
   * |g(x)| < 1.0e-08: false
   * Reached Maximum Number of Iterations: false
 * Objective Function Calls: 33
 * Gradient Calls: 21

In [17]:
# Fitted parameter vector (log scale). Judging by the values reported in the
# summary cell below, the order is: log-noise, time kernel (log length-scale,
# log σ), spatial kernel (log length-scale, log σ).
print(Optim.minimizer(opt_out))

[-0.820666,0.995465,0.654919,12.0762,0.654919]

In [18]:
# Objective value at the optimum (matches "Minimum" in the optimiser summary).
print(Optim.minimum(opt_out))

55614.99897704748

In [20]:
# Report the fitted hyperparameters on their natural scale. The kernels store
# squared quantities (σ2, ℓ2), hence the square roots; the noise is stored as
# a log, hence the exp.
# NOTE(review): the temporal and spatial σ print the same value. The two
# kernels are multiplied together, so presumably only the product of their
# variances is identified by the data — confirm before interpreting either σ
# on its own.
print("\nk: Temporal kernel \n=================\n")
@printf("σ: %5.3f\n", √k_time.σ2)
@printf("l: %5.3f hours\n", √k_time.ℓ2)
print("\nk: Spatial kernel \n=================\n")
@printf("σ: %5.3f\n", √k_spatial.σ2)
@printf("l: %5.3f meters\n", √k_spatial.ℓ2)
print("\n=================\n")
@printf("σy: %5.3f\n", exp(reals.logNoise))


k: Temporal kernel 
σ: 1.925
l: 2.706 hours

k: Spatial kernel 
σ: 1.925
l: 175638.738 meters

σy: 0.440


# Adding a diurnal component

In [23]:
# Periodic kernel in time; the third argument is the log of the period, set
# to 24 (hours, the unit of `ts_hours`). `fix(..., :lp)` holds that parameter
# fixed, leaving two free parameters (see the kernel display below).
k_periodic = fix(Periodic(0.0, 0.0, log(24.0)), :lp)
# Squared-exponential kernel over space that multiplies the periodic term in
# the combined kernel; starts at log length-scale 0 and log σ 0.
k_diurndecay = SEIso(0.0, 0.0)

Type: GaussianProcesses.SEIso, Params: [0.0,0.0]


In [24]:
# Spatiotemporal kernel, version 2: the two terms of version 1 plus a diurnal
# term, (24h-periodic kernel on time) * (squared-exponential on space).
# `k_time` and `k_spatial` are the same objects as before, so they enter with
# the values fitted above (see the parameters in the displayed output).
k_spatiotemporal_2 = Masked(k_time, [1]) * Masked(k_spatial, [2,3]) + 
    fix(Masked(k_means, [2,3])) +
    Masked(k_periodic, [1]) * Masked(k_diurndecay, [2,3])

Type: GaussianProcesses.SumKernel
  Type: GaussianProcesses.ProdKernel
    Type: GaussianProcesses.Masked{GaussianProcesses.SEIso}, Params: [0.995465,0.654919]
    Type: GaussianProcesses.Masked{GaussianProcesses.SEIso}, Params: [12.0762,0.654919]
  Type: GaussianProcesses.FixedKern, Params: Float64[]
  Type: GaussianProcesses.ProdKernel
    Type: GaussianProcesses.Masked{GaussianProcesses.FixedKern}, Params: [0.0,0.0]
    Type: GaussianProcesses.Masked{GaussianProcesses.SEIso}, Params: [0.0,0.0]


In [25]:
# Fit the hyperparameters of `k_spatiotemporal_2` (with the diurnal term)
# jointly over consecutive time windows of the data. Every name assigned here
# is a notebook global and overwrites the value left by the previous fit.
begin
    global opt_out
    k_spatiotemporal = k_spatiotemporal_2
    chunks=GP[]
    # Window width in the units of `ts_hours`: 24*10 hours, i.e. ten days.
    chunk_width=24*10
    tstart=0.0
    tend=tstart+chunk_width
    # Running total of observations assigned to a window (not used further
    # inside this cell).
    nobsv=0
    while tstart < get(maximum(hourly_cat[:ts_hours]))
        # Rows whose timestamp falls in the half-open window [tstart, tend).
        in_chunk=(tstart .<= hourly_cat[:ts_hours].values) & (hourly_cat[:ts_hours].values .< tend)
        hourly_chunk = hourly_cat[in_chunk,:]
        nobsv_chunk = sum(in_chunk)
        nobsv += nobsv_chunk

        # Look up each observation's projected station coordinates, using the
        # `station` column as a row index into `isdSubset`.
        chunk_X_PRJ = isdSubset[:X_PRJ].values[hourly_chunk[:station].values]
        chunk_Y_PRJ = isdSubset[:Y_PRJ].values[hourly_chunk[:station].values]
        # Design matrix with columns (time, X, Y); these are the dimensions
        # 1 and 2,3 that the Masked kernels select.
        chunk_X = [hourly_chunk[:ts_hours].values chunk_X_PRJ chunk_Y_PRJ]

        y = hourly_chunk[:temp].values
        # One GP per window: transposed inputs, a constant mean set to the
        # window's sample mean, and the *same* kernel object for every window.
        # NOTE(review): the trailing 0.0 is presumably the initial log-noise
        # (the optimiser's starting point begins with 0.0) — confirm.
        chunk = GP(chunk_X', y, MeanConst(mean(y)), k_spatiotemporal, 0.0)
        push!(chunks, chunk)

        tstart=tend
        tend+=chunk_width
    end
    # Bundle the per-window GPs so they are evaluated and optimised together.
    reals = TempModel.GPRealisations(chunks)
    # Print the gradient at the starting point as a sanity check.
    # NOTE(review): `mean=false` presumably leaves the mean parameters out of
    # the gradient/optimisation — confirm against TempModel.
    update_mll_and_dmll!(reals, mean=false)
    println(reals.dmLL)
    # Per the recorded starting point, the noise restarts at 0.0 while the
    # time/space kernel parameters start from the previous fit's values.
    @time opt_out=optimize!(reals, mean=false, show_trace=true, x_tol=1e-4, f_tol=1e-4)
end

[-28371.5,8470.67,-3148.47,3151.29,-3148.47,257.441,-99.5163,0.0,-99.5163]
Iter     Function value   Gradient norm 
     0     6.970102e+04     2.837146e+04
     1     6.405081e+04     2.436904e+04
     2     5.758611e+04     1.338141e+04
     3     5.601555e+04     6.178053e+03
     4     5.551747e+04     2.373715e+03
     5     5.541494e+04     7.851578e+02
     6     5.539956e+04     5.122522e+02
     7     5.539342e+04     3.108432e+02
     8     5.538724e+04     1.289779e+02
     9     5.538661e+04     1.720577e+02
947.600506 seconds (9.85 M allocations: 105.122 GB, 2.27% gc time)


Results of Optimization Algorithm
 * Algorithm: Conjugate Gradient
 * Starting Point: [0.0,0.9954652395759197, ...]
 * Minimizer: [-0.8274601191080887,0.9964949796569037, ...]
 * Minimum: 5.538661e+04
 * Iterations: 9
 * Convergence: true
   * |x - x'| < 1.0e-04: false
   * |f(x) - f(x')| / |f(x)| < 1.0e-04: true
   * |g(x)| < 1.0e-08: false
   * Reached Maximum Number of Iterations: false
 * Objective Function Calls: 36
 * Gradient Calls: 27