In [1]:
#This Notebook is intended to reproduce figures 2 and 3 of McGraw and Barnes (2018) (can be found at:
# https://journals.ametsoc.org/doi/full/10.1175/JCLI-D-17-0334.1)
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

def lagged_regress(lag,alpha,X,Y):
    #lag: maximum lag
    #alpha: significance level
    #X, Y: time series (we're regressing Y on X)
    len_x = len(X)
    len_y = len(Y)
    N = len_x
    #create lagged X and Y 
    index_vec = np.arange(0,N)
    x_lag = np.empty((N,lag))
    x_lag[:] = np.nan
    y_lag = np.empty((N,lag))
    y_lag[:] = np.nan
    for ilag in np.arange(0,lag):
        for i_ind in np.arange(0,N):
            index_i = index_vec[i_ind] - ilag
            if (index_i < 1) or (index_i > N):
                continue
            else:
                x_lag[i_ind,ilag] = X[index_i]
                y_lag[i_ind,ilag] = Y[index_i]
    #Now do the regression
    lm = linear_model.LinearRegression()
    model = lm.fit(x_lag,y_lag)
    predictions = lm.predict(x_lag)
    R2 = lm.score(x_lag,y_lag)
    B = lm.coeff_
    return R2, B
    

In [22]:
#1. Define Monte Carlo simulation parameters
N = 550 #length of our synthetic data
ntimes = 100 #number of Monte Carlo simulations
sig_level = 0.05 #significance level
a_y = np.arange(0,0.95,0.05) #range of autocorrelations for our dependent variable
noise = [0.005,0.1,0.25,0.5,0.75,0.8,0.95,1,1.25,1.5,2,2.5,3,4,5,6,7,8,9,10,12,15] #amount of noise in our independent variable
max_lag = 2

#for i_alpha in a_y:
i_alpha = 0.8
i_noise = 0.25
i_lag = 1
i_times = 0
#    for i_noise in noise:
#        for i_lag in np.arange(1,max_lag+1):
#            for i_times in np.arange(0,ntimes):
imax_lag = 10
#Create the independent variable, Y, a red-noise time series with autocorrelation i_alpha. 
y_red = np.zeros(N,)
for ired in np.arange(1,N):
    #Version of the red-noise time series with non-standardized variance
    y_red[ired] = i_alpha*y_red[ired - 1] + np.random.randn(1,)
    #Version of the red-noise time series with standardized variance
    #y_red[ired] = i_alpha*y_red[ired - 1] + np.sqrt(1 - i_alpha**2)*np.random.randn(1,)
                
#Now create our dependent variable, X, from lagged Y with some noise
x_red = np.zeros(N,)
for mred in np.arange(i_lag+49,N):
    x_red[mred] = y_red[mred - i_lag] + i_noise*np.random.randn(1,)
    
#Remove the first 50 elements of the time series
y_i = y_red[50:-1,]
x_i = x_red[50:-1,]
#Standardize the time series
y_st = (y_i - np.nanmean(y_i))/np.nanstd(y_i)
x_st = (x_i - np.nanmean(x_i))/np.nanstd(x_i)
N_adj = len(x_st) #adjusted length

print(y_st.shape)
#print(x_st[1:11])
print(np.corrcoef(y_st[0:10],x_st[1:11]))

(499,)
[[1.         0.99497137]
 [0.99497137 1.        ]]


In [23]:
##Now, we perform the regressions
#1) standard lagged regression 
#Plot x_red and y_red
#print(len(np.arange(i_lag+49,N)))
#print(N)
#fig1 = plt.figure()
#ax1 = fig1.add_axes([0.1,0.1,0.8,0.8])
#ax1.plot(y_red,'k')
#ax1.plot(x_red,'r')
x_st_df = pd.DataFrame(x_st)
X_LAG = pd.DataFrame(x_st)
Y_LAG = pd.DataFrame(y_st)
y_st_df = pd.DataFrame(y_st)
#Create our lagged time series
for i in np.arange(1,max_lag+1):
    X_LAG[i] = x_st_df.shift(i)
    Y_LAG[i] = y_st_df.shift(i)
#X_LAG = [pd.DataFrame(x_st).shift(i) for i in np.arange(1,max_lag)]
#print(X_LAG.shape)

In [31]:
lm = linear_model.LinearRegression()
lm2 = linear_model.LinearRegression()
X_single_test = X_LAG.iloc[5:-1,1]
X_test = X_single_test.values.reshape(-1,1)
Y_test = Y_LAG.iloc[5:-1,0].values.reshape(-1,1)
lm.fit(Y_test,X_test)
t2 = lm2.fit(Y_test,X_LAG.iloc[5:-1,:])
#other_dir = lm2.fit(Y_LAG.iloc[5:-1,1],X_LAG.iloc[5:-1,1])
#print(lm.coef_)
coeffs = lm.coef_
coeffs2 = lm2.coef_
#R2 = r2_score(X_LAG.iloc[5:-1,:],Y_LAG.iloc[5:-1,:])
#R2_other = r2_score(Y_LAG.iloc[5:-1,:],X_LAG.iloc[5:-1,:])
#print(R2)
#print(X_LAG.shape)
print(coeffs)
print(coeffs2)
#print(R2_other)
#print(coeffs_other)

[[0.64416993]]
[[0.81376172]
 [0.64416993]
 [0.51325193]]
