In [1]:
import os
import sys
import numpy as np
import scipy
import pandas as pd
import pywt as wt
import matplotlib.pyplot as plt
# For lag plot
from pandas.plotting import lag_plot
# For ACF
import statsmodels
from statsmodels.tsa.stattools import acf
# For zoom-in inside the plot box
from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes
from mpl_toolkits.axes_grid1.inset_locator import mark_inset
# Gridspec
import matplotlib.gridspec as gridspec
%matplotlib notebook

### Importing data, using Kepler light curves

Import a random Kepler light curve. Time is in days, flux is a relative scale.

In [2]:
lc_x = pd.read_csv('DataV_koi_kplr005706966.csv', names=['time', 'flux', 'e_flux'], 
                   nrows=10500, engine='python')
npt_lsst = np.ceil(np.ptp(lc_x.time.values) / 1.6).astype('int')
lc_x2 = lc_x[::npt_lsst]
lc_x2.reset_index(drop=True, inplace=True)

Getting the sampling rate (cadence), we will need it after to scale measurements to day scales

In [3]:
cad1 = np.median(np.ediff1d(lc_x['time'].values))
cad2 = np.median(np.ediff1d(lc_x2['time'].values))
print('Kepler sampling: {0:.2f} d, LSST-like sampling:{1:.2f} d'.format(cad1, cad2))

Kepler sampling: 0.02 d, LSST-like sampling:3.04 d


Quick visualization, both Kepler sample-like and LSST sample-like (best scenario)

In [None]:
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(lc_x.time, lc_x.flux, marker='o', s=5, c=lc_x.e_flux, cmap='jet')
ax.scatter(lc_x2.time, lc_x2.flux, marker='.', s=50, c='lime', edgecolor='k', lw=0.5)
ax.set_xlabel('time d')
ax.set_ylabel(r'flux$_{normalized}$')
ax.set_ylim([np.min(lc_x.flux) - np.std(lc_x.flux), np.max(lc_x.flux) + np.std(lc_x.flux)])
ax.set_title('Same LC with different cadences', color='navy')

Quick checking: lag plot should be random for structures with no memory. The correlation here is a sanity check

In [None]:
plt.close('all')
fig, ax = plt.subplots(1, 2, figsize=(6, 3))
lag1 = lag_plot(lc_x.flux, ax=ax[0], marker='.', c='goldenrod', edgecolor='k', lw=0.1)
lag2 = lag_plot(lc_x2.flux, ax=ax[1], marker='.', c='dodgerblue', edgecolor='k', lw=0.1)
#
for sub in ax:
    sub.set_aspect('equal')
ax[0].set_title('Kepler sampling')
ax[1].set_title('LSST-like sampling')
plt.subplots_adjust(wspace=0.55)

### Let's create some gaps 

### Autocorrelation Function 

$\rho_{k} = \dfrac{\sum_{i=0}^{N-k} x_{i}x_{i+k}}{\sum_{i}^{N} x_{i}}$
Where $\rho_{k}$ is the autocorrelation coefficient at lag *k*. Each lag *k* corresponds to $\tau_{k}=k\,\Delta t$, $\Delta t$ being the cadence.

It is safe to only look for periods shorter than half the light curve, $k \leq \frac{N}{2}$

Calculate the autocorrelation coefficients via statsmodels. Note that the `tsa.stattools.acf` method receives only the flux, thus assuming the spacing between each observation is uniform.

In [4]:
acf_coeffs = acf(lc_x.flux.values, unbiased=False, nlags=len(lc_x.flux.values) // 2)
tau_k = np.arange(1, acf_coeffs.size + 1, 1)
t_d = cad1 * tau_k 
#
print('Number of coefficients from the ACF calculation is: {0}'.format(acf_coeffs.size))

Number of coefficients from the ACF calculation is: 5251


In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
# Raw results from ACF are 'coarse', because of the nature of the input data
ax[0].scatter(tau_k, acf_coeffs, marker='.', s=10, color='navy')
# Zoom in
if 1:
    zoom_factor = 30
    ax0_zoom = zoomed_inset_axes(ax[0], zoom_factor, loc=1)
    ax0_zoom.scatter(tau_k, acf_coeffs, marker='.', s=20, color='darkorange')
    ax0_zoom.set_xlim(2190, 2275)
    ax0_zoom.set_ylim(0.16, 0.176)
    ## Remove tick labels
    ax0_zoom.xaxis.set_major_formatter(plt.NullFormatter())
    ax0_zoom.yaxis.set_major_formatter(plt.NullFormatter())
    mark_inset(ax[0], ax0_zoom, loc1=2, loc2=4, fc='none', ec='goldenrod')
#
# Maxima for the coefficients
aux_maxima = scipy.signal.argrelextrema(acf_coeffs, np.greater)
ax[1].scatter(t_d, acf_coeffs, marker='.', s=10, color='lightgray')
ax[1].scatter(t_d[aux_maxima], acf_coeffs[aux_maxima], marker='^', s=20, color='lime', 
              edgecolor='k', linewidths=0.1)
#
for axis in ax:
    axis.set_ylabel(r'$\rho$', fontsize=13)
ax[0].set_xlabel(r'$\tau$', fontsize=13)
ax[1].set_xlabel(r'time $d$', fontsize=13)
#
ax[0].set_title('ACF coefficients')
ax[1].set_title('Location of local maxima')
plt.suptitle('Kepler sampling')

**Smooth the ACF coefficient distribution to easyly locate the local maxima. **

In [5]:
def gaussian(mu, sigma, x):
    return np.exp(np.power(-(x - mu), 2) / (2 * np.power(sigma, 2))) / (sigma * np.sqrt(2 * np.pi))

Values for the Gaussian (convolution) are empirical, as a compromise between diminish noise and keep the ACF signal. 
Note we need to trim a bit the result array, due to border padding. Also, remember the normalization, to keep the scale.

In [6]:
sigma_x = 18 / 2.35
x = np.arange(0, 56, 1)
acf_g_conv = scipy.signal.convolve(acf_coeffs, gaussian(0, sigma_x, x)) / np.sum(gaussian(0, sigma_x, x))
print('Original size of the ACF coefficients array: {0}. The smoothed: {1}'.format(acf_coeffs.size, acf_g_conv.size))

Original size of the ACF coefficients array: 5251. The smoothed: 5306


In [7]:
# Trim the padded extra section, and re-use the previously defined inteval tau_k
Ntrim = acf_g_conv.size - acf_coeffs.size
acf_g_conv = acf_g_conv[Ntrim:]

In [10]:
# Local maxima
aux_maxima = scipy.signal.argrelextrema(acf_g_conv, np.greater)
# Global maxima
idx_gmax = np.argmax(acf_g_conv[aux_maxima])

In [None]:
# Grid
fig = plt.figure(figsize=(9, 4))
gs1 = gridspec.GridSpec(3, 3)
gs1.update(left=0.14, right=0.48, hspace=0.05)
ax0 = fig.add_subplot(gs1[: -1, :])
ax1 = fig.add_subplot(gs1[-1, :], sharex=ax0)
#
ax1.scatter(t_d, acf_g_conv - acf_coeffs, marker='*', s=10, color='orange')
# 
ax0.scatter(t_d, acf_g_conv[:5251], marker='.', s=10, color='lightgray')
ax0.scatter(t_d[aux_maxima], acf_g_conv[aux_maxima], marker='^', s=20, color='lime', 
              edgecolor='k', linewidths=0.1)
# 
ax0.axvline(t_d[aux_maxima][idx_gmax], lw=2, c='b', alpha=0.5)
#
ax0.set_ylabel(r'$\rho$', fontsize=13)
ax1.set_xlabel(r'time $d$', fontsize=13)
ax1.set_ylabel(r'Gauss - ACF', fontsize=13)

In [11]:
max_acf_kplr = t_d[aux_maxima][np.argmax(acf_g_conv[aux_maxima])]
print('Maximum of the ACF: {0:.2f} d'.format(max_acf_kplr))

Maximum of the ACF: 22.68 d


### Let's do the same for the more sparse situation...

In [12]:
acf_coeffs_spa = acf(lc_x2.flux.values, unbiased=False, nlags=len(lc_x2.flux.values) // 2)
tau_k_spa = np.arange(1, acf_coeffs_spa.size + 1, 1)
t_d_spa = cad2 * tau_k_spa

We must remember that so far we are assuming an uniform sampling . In the case of Kepler cadence, given the amount of points, for this analysis this is not a concern.

In the case of a more sparse time series, changes in the cadence will make us result not be as accurate as if having a regular cadence.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
ax[0].scatter(t_d, acf_coeffs, alpha=0.5, label='ACF for Kepler sampling', c='navy', s=10)
ax[0].scatter(t_d_spa, acf_coeffs_spa, label='ACF for downsampled data', c='orange', s=20)
# Histogram of the cadence in the data
ax[1].hist(np.ediff1d(lc_x2['time'].values), bins=10, histtype='stepfilled', color=['lemonchiffon'], lw=0)
ax[1].hist(np.ediff1d(lc_x2['time'].values), bins=10, histtype='step', color=['orange'], lw=2)
#
ax[0].legend(loc='upper right')
ax[0].set_xlabel(r'time $d$', fontsize=13)
ax[0].set_ylabel(r'$\rho$', fontsize=13)
ax[1].set_xlabel(r'$x_{(t+1)}-x_{t}$ $d$', fontsize=13)
ax[1].set_ylabel('N')
ax[0].set_title('ACF for both cadences', color='forestgreen')
ax[1].set_title('Histogram of cadence, sparse LC', color='navy')
#
plt.subplots_adjust(bottom=0.2)

### Gap filling

Fillig the gaps will allow us to have a more regular sampled grid. In the case when fewer observations are available, compared with the main variability length, to have an uniform sampling makes calculations more stable.

In [204]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, Matern, RationalQuadratic,
                                              ExpSineSquared, DotProduct,
                                              ConstantKernel)
# Instance of the GP
guess_period = max_acf_kplr
# length_scale: while larger, the shapes of the samples elongate. Default:1.
# length_scale_bounds: lower and upper bounds for lenght_scale. Let's say is a day
# periodicity_bounds: lower and upper bounds in periodicity
kernel = 3.0 * ExpSineSquared(length_scale=guess_period/2., 
                              periodicity=guess_period,
                              length_scale_bounds=(guess_period, 1.1*guess_period),
                              periodicity_bounds=(guess_period/2, 1.5 * guess_period),
                             )
# If want to replicate the result, must use same seed 
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=50, random_state=45)#, alpha=0.1*lc_x2.flux.values)

if 0:
    # Calculate the prior
    x_prior = np.linspace(lc_x2.time.values[0], lc_x2.time.values[-1], 1000)
    y_mean_prior, y_std_prior = gp.predict(x_prior[:, np.newaxis], return_std=True)
    y_samples_prior = gp.sample_y(x_prior[:, np.newaxis], 4000)

# Fit to data using Maximum Likelihood Estimation of the parameters
gp.fit(lc_x2.time.values[:, np.newaxis], lc_x2.flux.values)
# Posterior
x_grid = np.linspace(lc_x2.time.values[0], lc_x2.time.values[-1], 1000)
# Make the prediction on the meshed x-axis (ask for MSE as well)
y_mean_post, y_std_post = gp.predict(x_grid[:, np.newaxis], return_std=True)
y_samples_post = gp.sample_y(x_grid[:, np.newaxis], 5000)

AttributeError: 'GaussianProcessRegressor' object has no attribute 'x'

In [205]:
fig, ax = plt.subplots(1, figsize=(8, 4))
ax.plot(x_grid, y_mean_post, lw=2, c='dodgerblue')
#ax.plot(x_grid, y_samples_post, lw=1, c='r', alpha=0.1)
ax.scatter(lc_x2.time, lc_x2.flux, marker='o', c='green', s=20)
ax.plot(lc_x.time, lc_x.flux, 'k-', alpha=0.1)


<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x135083860>]

## IDEA: create a loop and save 50 iterations. Then with them do the results

In [146]:
import pickle

In [149]:
if False:
    pickle.dump(y_mean_post, open('y_mean_posterior_lsst.pickle', 'wb'))
    np.save('y_mean_posterior.npy', y_mean_post)

In [83]:
fig, ax = plt.subplots(2, 1, figsize=(10, 4))
# Prior
ax[0].plot(x, y_mean, 'k', lw=3, zorder=9)
ax[0].fill_between(x, y_mean - y_std, y_mean + y_std,
                   alpha=0.2, color='k')
ax[0].plot(x, y_samples, lw=1)
ax[0].set_title('Prior (kernel:  {0})'.format(kernel), fontsize=12)
#
ax[1].plot(x, y_mean, 'k', lw=0.5, zorder=9)
ax[1].fill_between(x, y_mean - y_std, y_mean + y_std,
                   alpha=0.2, color='k')

# Plot generated samples
ax[1].plot(xx, yy_samples, lw=1)
# plot original data
ax[1].scatter(lc_x2.time.values[:, np.newaxis][:, 0], lc_x2.flux.values, c='r', s=20, zorder=10, edgecolors=(0, 0, 0))
ax[1].set_title("Posterior (kernel: %s)\n Log-Likelihood: %.3f"
          % (gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)),
          fontsize=12)
#ax[1].set_xlim([0, 50])
#ax[1].set_ylim([-.05, 2])
plt.tight_layout()



<IPython.core.display.Javascript object>

In [49]:
fig, ax = plt.subplots(1, figsize=(10, 6))
ax.scatter(lc_x2.time.values[:, np.newaxis][:, 0], lc_x2.flux.values,)
ax.plot(x, y_mean, 'k', lw=0.5, zorder=9)


<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x102dc7240>]

In [77]:
plt.close('all')
kernel = 1.0 * ExpSineSquared(length_scale=guess_period/2, 
                              periodicity=guess_period,
                              length_scale_bounds=(guess_period, 1.1*guess_period),
                              periodicity_bounds=(guess_period/2, 1.5 * guess_period),
                             )
gp = GaussianProcessRegressor(kernel=kernel)
# Prior
x = np.linspace(lc_x2.time.values[0], lc_x2.time.values[-1], 100)
y_mean, y_std = gp.predict(x[:, np.newaxis], return_std=True)
y_samples = gp.sample_y(x[:, np.newaxis], 1000)
plt.plot(x, y_samples)

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x111d66438>,
 <matplotlib.lines.Line2D at 0x111541080>,
 <matplotlib.lines.Line2D at 0x1162d6a58>,
 <matplotlib.lines.Line2D at 0x1162d60b8>,
 <matplotlib.lines.Line2D at 0x1162d62b0>,
 <matplotlib.lines.Line2D at 0x1162d6fd0>,
 <matplotlib.lines.Line2D at 0x1162d6b38>,
 <matplotlib.lines.Line2D at 0x1162d67b8>,
 <matplotlib.lines.Line2D at 0x1162d6da0>,
 <matplotlib.lines.Line2D at 0x1162d67f0>,
 <matplotlib.lines.Line2D at 0x1132d8d68>,
 <matplotlib.lines.Line2D at 0x1162d65c0>,
 <matplotlib.lines.Line2D at 0x1162d6470>,
 <matplotlib.lines.Line2D at 0x111cd1470>,
 <matplotlib.lines.Line2D at 0x111cd1978>,
 <matplotlib.lines.Line2D at 0x111cd1b38>,
 <matplotlib.lines.Line2D at 0x111cd1080>,
 <matplotlib.lines.Line2D at 0x111cd1be0>,
 <matplotlib.lines.Line2D at 0x111cd1ef0>,
 <matplotlib.lines.Line2D at 0x111cd1a20>,
 <matplotlib.lines.Line2D at 0x111cd1b00>,
 <matplotlib.lines.Line2D at 0x111cd1cf8>,
 <matplotlib.lines.Line2D at 0x111cd17f0>,
 <matplotli

In [18]:
# Generate data and fit GP
rng = np.random.RandomState(4)
X = rng.uniform(0, 5, 10)[:, np.newaxis]
y = np.sin((X[:, 0] - 2.5) ** 2)
y.shape

(10,)

In [None]:



# Generate data and fit GP
rng = np.random.RandomState(4)
X = rng.uniform(0, 5, 10)[:, np.newaxis]
y = np.sin((X[:, 0] - 2.5) ** 2)
gp.fit(X, y)

# Plot posterior
plt.subplot(2, 1, 2)
X_ = np.linspace(0, 5, 100)
y_mean, y_std = gp.predict(X_[:, np.newaxis], return_std=True)
plt.plot(X_, y_mean, 'k', lw=3, zorder=9)
plt.fill_between(X_, y_mean - y_std, y_mean + y_std,
                 alpha=0.2, color='k')

y_samples = gp.sample_y(X_[:, np.newaxis], 10)
plt.plot(X_, y_samples, lw=1)
plt.scatter(X[:, 0], y, c='r', s=50, zorder=10, edgecolors=(0, 0, 0))
plt.xlim(0, 5)
plt.ylim(-3, 3)
plt.title("Posterior (kernel: %s)\n Log-Likelihood: %.3f"
          % (gp.kernel_, gp.log_marginal_likelihood(gp.kernel_.theta)),
          fontsize=12)
plt.tight_layout()

### Wavelet Transform

Continuous vs Discrete

### (Fast) Lomb-Scargle

### Evaluate the selected variability period, using phase plot

In [None]:
np.arange(1,10,1)