# NAO Prediction

In [1]:
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, addcyclic, shiftgrid
import pymongo
from pprint import pprint
from datetime import datetime, timedelta, date
import pandas as pd
from sklearn.decomposition import PCA
import sklearn.linear_model as skl_lm
import gdal as gdl
import matplotlib.mlab as ml
import cartopy.crs as ccrs
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True) # for live plot
pd.set_option('display.notebook_repr_html', False)
%matplotlib inline
plt.style.use('seaborn-white')

In [2]:
# Connect to the MongoDB server running on this machine (27017 is MongoDB's default port).
mongo_host_local = 'mongodb://localhost:27017/'
mg = pymongo.MongoClient(mongo_host_local)

In [3]:
# Select the "ECMWF" database and list its collections as a sanity check.
# NOTE(review): Database.collection_names() is deprecated in newer PyMongo releases
# in favour of list_collection_names() - confirm the installed version before upgrading.
db = mg.ECMWF
db.collection_names()

['system.indexes',
 'ERAINT_grid',
 'ERAINT_lores_grid',
 'ERAINT_lores_monthly_anom',
 'ERAINT_monthly',
 'ERAINT_lores_monthly']

In [4]:
# Choose the data set variant ('hires' or 'lores'). This fixes the names of the
# data, anomaly and grid collections, and the grid resolution
# (presumably in degrees - TODO confirm).
# NOTE(review): 'ERAINT_monthly_anom' does not appear in the collection listing
# printed above, so the 'hires' branch may not be usable as-is - confirm.
ERA_vers = 'lores'
if ERA_vers == 'hires':
    col_dat = 'ERAINT_monthly'
    col_anom = 'ERAINT_monthly_anom'
    col_grid = 'ERAINT_grid'
    resolution = 0.25
elif ERA_vers == 'lores':
    col_dat = 'ERAINT_lores_monthly'
    col_anom = 'ERAINT_lores_monthly_anom'
    col_grid = 'ERAINT_lores_grid'
    resolution = 2.5
else:
    # Fail here with a clear message instead of a NameError in a later cell,
    # where the collection names would otherwise be undefined.
    raise ValueError("Unknown ERA_vers %r: expected 'hires' or 'lores'" % (ERA_vers,))

# Construct NAO index

In [5]:
# Grid collection and the two boxes whose grid cells enter the NAO calculation
con_grid = db[col_grid]
# Each polygon is a single closed ring (first vertex == last vertex).
# The vertices are listed in one direction and then reversed with [::-1].
poly1 = [[[-50, 25], [-50, 55], [10, 55], [10, 25], [-50, 25]][::-1]]
poly2 = [[[-40, 55], [-40, 85], [20, 85], [20, 55], [-40, 55]][::-1]]
def getGridIds(this_polygon):
    """Return the grid cells whose ``loc`` field lies within a polygon.

    Parameters
    ----------
    this_polygon : list
        Passed unchanged as the ``coordinates`` of a GeoJSON ``Polygon``
        in a ``$geoWithin`` query.

    Returns
    -------
    pandas.DataFrame
        One row per matching document of ``con_grid``, limited to the
        projected fields ``id_grid`` and ``loc``.
    """
    geometry = {"type": "Polygon", "coordinates": this_polygon}
    cursor = con_grid.find(
        filter={"loc": {"$geoWithin": {"$geometry": geometry}}},
        projection={"_id": 0, "id_grid": 1, "loc": 1})
    return pd.DataFrame(list(cursor))
# Grid-cell ids falling inside each of the two boxes (poly1, poly2)
grid_df1 = getGridIds(poly1)
grid_ids1 = grid_df1.id_grid.values
grid_df2 = getGridIds(poly2)
grid_ids2 = grid_df2.id_grid.values

In [6]:
# Get yearly DJF averages over the two NAO nodal locations

# Collection of monthly anomalies for the selected data set variant
con_anom = db[col_anom]

def setWinterYear(date):
    """Return the winter year a timestamp is assigned to.

    Months from September to December are counted towards the following
    calendar year (so December joins the January/February that follow it,
    and autumn months line up with the winter they precede). All other
    months keep their own calendar year.

    Parameters
    ----------
    date : object with ``month`` and ``year`` attributes

    Returns
    -------
    int
    """
    return date.year + 1 if date.month >= 9 else date.year

def getMSL(this_grid_ids):
    """Average the ``msl`` anomaly over a set of grid cells, per winter year.

    For every date, the ``msl`` values of the selected grid cells are
    averaged in the database. Only the December, January and February rows
    are kept, and these are averaged again within each winter year
    (see ``setWinterYear``).

    Parameters
    ----------
    this_grid_ids : numpy.ndarray
        Grid-cell ids to include (converted with ``.tolist()`` for the query).

    Returns
    -------
    pandas.DataFrame
        One row per ``wyear`` holding the group means of the remaining columns.
    """
    stages = [
        {"$match": {"id_grid": {"$in": this_grid_ids.tolist()}}},
        {"$group": {"_id": "$date", "mean": {"$avg": "$msl"}}},
        {"$project": {"date": "$_id", "_id": 0, "msl": "$mean"}}]
    monthly = pd.DataFrame(list(con_anom.aggregate(pipeline=stages)))
    monthly = monthly.assign(
        month=[d.month for d in monthly.date],
        wyear=[setWinterYear(d) for d in monthly.date])
    winter_months = monthly.query("month in [12, 1, 2]")
    return winter_months.groupby("wyear").mean().reset_index()
    
msl1_df = getMSL(this_grid_ids=grid_ids1).rename(columns={'msl': 'msl1'})
msl2_df = getMSL(this_grid_ids=grid_ids2).rename(columns={'msl': 'msl2'})
# NAO index = box-2 mean minus box-1 mean, ordered by winter year.
# NOTE(review): box 2 is the northern box, while the common station-based NAO index
# is defined as southern minus northern pressure - confirm the intended sign.
# Get rid of the first year (1979) and the last (2017) because the winter months are not complete
# NOTE(review): the score index printed further down runs to 2017-02 - confirm that
# winter 2017 is really incomplete in the msl data.
msl_df = (
    pd.merge(msl1_df, msl2_df)
    .assign(NAO=lambda df: df.msl2 - df.msl1)
    .sort_values('wyear', ascending=True)
    .reset_index(drop=True)
    .query("(wyear > 1979) & (wyear <2017)")
)
msl_df.head()

   wyear        msl1  month        msl2         NAO
1   1980  152.977452    5.0  -98.370800 -251.348252
2   1981 -444.747185    5.0  117.529424  562.276609
3   1982  336.279706    5.0 -377.140170 -713.419876
4   1983 -342.535287    5.0  378.676276  721.211563
5   1984 -220.904942    5.0  330.200922  551.105864

In [7]:
# Plot ts
# Interactive Plotly line plot of the NAO index against winter year
data = [go.Scatter(x=msl_df['wyear'], y=msl_df['NAO'] )]
py.iplot(data, filename='pandas-time-series')

# Get PCA scores

In [8]:
# Generic helpers to query grid ids above a given latitude
def genCircle(start_lon, stop_lon, lat, decreasing):
    """Build the vertices of a constant-latitude edge, one per unit of longitude.

    Parameters
    ----------
    start_lon, stop_lon : number
        First and last longitude (``stop_lon`` is included).
    lat : number
        Latitude shared by all vertices.
    decreasing : bool
        If true, the vertices run from ``stop_lon`` down to ``start_lon``.

    Returns
    -------
    list of [int, lat]
        ``[longitude, latitude]`` pairs, with the longitude converted to ``int``.
    """
    lons = np.arange(start=start_lon, stop=stop_lon + 1)
    if decreasing:
        lons = lons[::-1]
    return [[int(lon), lat] for lon in lons]

def queryGrids(aboveLat):
    """Return the grid cells located between latitude ``aboveLat`` and 90N.

    The search area is a ring spanning longitudes -180..180. It starts at
    ``[-180, aboveLat]``, follows the 90N edge eastwards and comes back
    westwards along ``aboveLat``.

    For ``aboveLat > 0`` the ring is sent as a plain GeoJSON polygon.
    Otherwise the polygon is larger than one hemisphere, so the ring is
    reversed and tagged with MongoDB's strict-winding CRS, where the vertex
    orientation matters.

    Parameters
    ----------
    aboveLat : number
        Southern boundary of the search area.

    Returns
    -------
    pandas.DataFrame
        One row per matching document of ``con_grid``, limited to the
        projected fields ``id_grid`` and ``loc``.
    """
    lon_west, lon_east, lat_top = -180, 180, 90
    top_edge = genCircle(start_lon=lon_west, stop_lon=lon_east,
                         lat=lat_top, decreasing=False)
    bottom_edge = genCircle(start_lon=lon_west, stop_lon=lon_east,
                            lat=aboveLat, decreasing=True)
    ring = [[lon_west, aboveLat]] + top_edge + bottom_edge

    if aboveLat > 0:
        geometry = {"type": "Polygon", "coordinates": [ring]}
    else:
        # case of a big polygon larger than one hemisphere
        geometry = {
            "type": "Polygon",
            "coordinates": [ring[::-1]],  # the orientation matters
            "crs": {
                "type": "name",
                "properties": {"name": "urn:x-mongodb:crs:strictwinding:EPSG:4326"}
            }
        }

    cursor = con_grid.find(
        filter={"loc": {"$geoWithin": {"$geometry": geometry}}},
        projection={"_id": 0, "id_grid": 1, "loc": 1})
    return pd.DataFrame(list(cursor))

# Two search areas: everything north of latitude 20, and everything north of latitude -20
grid_df_20N = queryGrids(aboveLat=20)
grid_df_20S = queryGrids(aboveLat=-20)

In [9]:
def queryScores(this_variable, this_grid_df):
    """Compute the first three principal-component scores of one anomaly field.

    The anomalies of ``this_variable`` are fetched for the given grid cells
    and for the months September to February. They are arranged as a
    date x grid-cell table and reduced with a 3-component PCA.

    Parameters
    ----------
    this_variable : str
        Name of the field to read from the anomaly collection.
    this_grid_df : pandas.DataFrame
        Must provide an ``id_grid`` column with the cells to include.

    Returns
    -------
    pandas.DataFrame
        Indexed by date, with columns ``PC1_<var>``, ``PC2_<var>``, ``PC3_<var>``.
    """
    # Query data anomalies
    wanted_ids = this_grid_df.id_grid.values.tolist()
    stages = [
        {"$project": {"id_grid": 1, "date": 1, this_variable: 1, "month": {"$month": "$date"}}},
        {"$match": {"month": {"$in": [9, 10, 11, 12, 1, 2]},
                    "id_grid": {"$in": wanted_ids}}},
        {"$project": {"_id": 0, "id_grid": 1, "date": 1, this_variable: 1}}]
    anom_df = pd.DataFrame(list(con_anom.aggregate(pipeline=stages)))

    # Get Principal Component Scores
    wide = anom_df.pivot(index='date', columns='id_grid', values=this_variable)
    pc_names = ['PC%d_%s' % (k, this_variable) for k in (1, 2, 3)]
    scores = PCA(n_components=3).fit_transform(wide)
    return pd.DataFrame(scores, columns=pc_names, index=wide.index)

# PC scores per field: z70 and ci use the area north of latitude 20,
# sst uses the larger area north of latitude -20.
scores_z70 = queryScores(this_variable='z70', this_grid_df=grid_df_20N)
scores_ci = queryScores(this_variable='ci', this_grid_df=grid_df_20N)
scores_sst = queryScores(this_variable='sst', this_grid_df=grid_df_20S)

In [10]:
# Check the size and date range of the z70 score table
scores_z70.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 230 entries, 1979-01-01 to 2017-02-01
Data columns (total 3 columns):
PC1_z70    230 non-null float64
PC2_z70    230 non-null float64
PC3_z70    230 non-null float64
dtypes: float64(3)
memory usage: 7.2 KB


### Comparison with correlation coefficients

In [11]:
# October rows of the "ci" principal-component scores
P03_df = scores_ci.assign(month=[ts.month for ts in scores_ci.index]).query('month == 10')
P03_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 38 entries, 1979-10-01 to 2016-10-01
Data columns (total 4 columns):
PC1_ci    38 non-null float64
PC2_ci    38 non-null float64
PC3_ci    38 non-null float64
month     38 non-null int64
dtypes: float64(3), int64(1)
memory usage: 1.5 KB


In [12]:
# Correlate October PC1 of "ci" with the winter NAO. The two series are paired by
# position, so the last October is dropped ([:-1]) to leave one predictor value
# per NAO winter.
np.corrcoef(P03_df.PC1_ci[:-1], msl_df.NAO)

array([[ 1.      , -0.495391],
       [-0.495391,  1.      ]])

In [13]:
# October PC2 of "z70" against the winter NAO (last October dropped to match lengths)
P04_df = scores_z70.assign(month=[ts.month for ts in scores_z70.index]).query('month == 10')
np.corrcoef(P04_df.PC2_z70[:-1], msl_df.NAO)

array([[ 1.        , -0.43572617],
       [-0.43572617,  1.        ]])

In [14]:
# September PC3 of "sst" against the winter NAO (last September dropped to match lengths)
P17_df = scores_sst.assign(month=[ts.month for ts in scores_sst.index]).query('month == 9')
np.corrcoef(P17_df.PC3_sst[:-1], msl_df.NAO)

array([[ 1.        ,  0.04698798],
       [ 0.04698798,  1.        ]])

### Group all predictors in one DataFrame

In [15]:
# Join the three score tables on their date index, turn the index into a "date"
# column and add calendar year, winter year and month for later filtering/joining.
scores_df = (
    scores_z70
    .merge(scores_ci, left_index=True, right_index=True)
    .merge(scores_sst, left_index=True, right_index=True)
    .reset_index(level=0)
    .assign(
        year=lambda df: [ts.year for ts in df.date],
        wyear=lambda df: [setWinterYear(ts) for ts in df.date],
        month=lambda df: [ts.month for ts in df.date])
)
scores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 13 columns):
date       230 non-null datetime64[ns]
PC1_z70    230 non-null float64
PC2_z70    230 non-null float64
PC3_z70    230 non-null float64
PC1_ci     230 non-null float64
PC2_ci     230 non-null float64
PC3_ci     230 non-null float64
PC1_sst    230 non-null float64
PC2_sst    230 non-null float64
PC3_sst    230 non-null float64
year       230 non-null int64
wyear      230 non-null int64
month      230 non-null int64
dtypes: datetime64[ns](1), float64(9), int64(3)
memory usage: 23.4 KB


In [16]:
# Create the Predictor DataFrame
def renCol(x, mon):
    """Append ``_<mon>`` to a column name that contains 'PC'; return other names unchanged."""
    return '%s_%s' % (x, mon) if 'PC' in x else x

def createMondf(this_mon, scores_df):
    """Extract the scores of one calendar month as a predictor table.

    Rows with ``month == this_mon`` are kept, every 'PC' column gets the
    month appended to its name (see ``renCol``), and the ``date``, ``year``
    and ``month`` columns are dropped.

    Parameters
    ----------
    this_mon : int
        Calendar month to select.
    scores_df : pandas.DataFrame
        Score table with ``date``, ``year`` and ``month`` columns.

    Returns
    -------
    pandas.DataFrame
    """
    return (scores_df
            .query('month == @this_mon')
            .rename(columns=lambda col: renCol(col, mon=this_mon))
            .drop(['date', 'year', 'month'], axis=1))

# One predictor table per month, then join them. "wyear" is the only column name the
# two tables share (the PC columns carry the month suffix), so pd.merge joins on it.
sep_df = createMondf(this_mon=9, scores_df=scores_df)
oct_df = createMondf(this_mon=10, scores_df=scores_df)
X_df = pd.merge(sep_df, oct_df)

In [17]:
# Create Regression DataFrame
# Drop the helper columns from the NAO table, then attach the predictors
# through the column name the two tables share.
NAO_df = msl_df.drop(columns=['msl1', 'msl2', 'month'])
dat_df = pd.merge(NAO_df, X_df)
dat_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37 entries, 0 to 36
Data columns (total 20 columns):
wyear         37 non-null int64
NAO           37 non-null float64
PC1_z70_9     37 non-null float64
PC2_z70_9     37 non-null float64
PC3_z70_9     37 non-null float64
PC1_ci_9      37 non-null float64
PC2_ci_9      37 non-null float64
PC3_ci_9      37 non-null float64
PC1_sst_9     37 non-null float64
PC2_sst_9     37 non-null float64
PC3_sst_9     37 non-null float64
PC1_z70_10    37 non-null float64
PC2_z70_10    37 non-null float64
PC3_z70_10    37 non-null float64
PC1_ci_10     37 non-null float64
PC2_ci_10     37 non-null float64
PC3_ci_10     37 non-null float64
PC1_sst_10    37 non-null float64
PC2_sst_10    37 non-null float64
PC3_sst_10    37 non-null float64
dtypes: float64(19), int64(1)
memory usage: 6.1 KB


In [18]:
# Preview the first rows of the regression table
dat_df.head()

   wyear         NAO     PC1_z70_9    PC2_z70_9     PC3_z70_9  PC1_ci_9  \
0   1980 -251.348252  18023.465139  2794.736512  -2092.985110  2.963272   
1   1981  562.276609  -5422.104718  9689.567172   5922.630486  0.101697   
2   1982 -713.419876  19744.941093 -8460.604205   1635.209243 -0.538567   
3   1983  721.211563 -14530.470578  8700.189280  -4888.437956 -0.840920   
4   1984  551.105864  -5774.518779  2361.237107  13897.025591  0.980549   

   PC2_ci_9  PC3_ci_9  PC1_sst_9  PC2_sst_9  PC3_sst_9    PC1_z70_10  \
0  1.071399  1.375248   5.315734   5.144223  -0.101667  14187.255820   
1  0.728206  0.115557  -3.156064   0.330436  -3.889343  45350.002970   
2  1.527365 -1.105478  -4.273700   2.978352  -4.129618  33268.045918   
3  1.564602 -0.486677  19.597934   7.318927  -4.275474   -479.075361   
4  0.753575  0.899514   2.394603  -2.069695  19.713818 -30617.123747   

     PC2_z70_10    PC3_z70_10  PC1_ci_10  PC2_ci_10  PC3_ci_10  PC1_sst_10  \
0  -8848.225741  -3218.127655   2.1524

# Regression

In [19]:
# Ordinary least-squares fit of the winter NAO on three predictors
# (October PC1 of ci, October PC2 of z70, September PC3 of sst).
regr = skl_lm.LinearRegression()
# .values replaces DataFrame.as_matrix(), which is deprecated and no longer
# exists in current pandas; both return the same NumPy array.
X = dat_df[['PC1_ci_10',
            'PC2_z70_10',
            'PC3_sst_9']].values
y = dat_df.NAO
regr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [20]:
# Coefficient of determination R^2, evaluated on the same data the model was fitted on
regr.score(X, y)

0.50036563409207524

In [21]:
import statsmodels.api as sm
import statsmodels.formula.api as smf


The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.



In [22]:
# Same three-predictor model, fitted with the statsmodels formula API to get the full summary table
est = smf.ols('NAO ~ PC1_ci_10 + PC2_z70_10 + PC3_sst_9', dat_df).fit()
est.summary()

0,1,2,3
Dep. Variable:,NAO,R-squared:,0.5
Model:,OLS,Adj. R-squared:,0.455
Method:,Least Squares,F-statistic:,11.02
Date:,"Sat, 24 Feb 2018",Prob (F-statistic):,3.63e-05
Time:,17:07:46,Log-Likelihood:,-279.32
No. Observations:,37,AIC:,566.6
Df Residuals:,33,BIC:,573.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.6815,80.217,0.046,0.964,-159.522,166.885
PC1_ci_10,-289.4863,64.195,-4.510,0.000,-420.091,-158.881
PC2_z70_10,-0.0208,0.007,-2.909,0.006,-0.035,-0.006
PC3_sst_9,31.2634,11.543,2.709,0.011,7.780,54.747

0,1,2,3
Omnibus:,2.214,Durbin-Watson:,2.305
Prob(Omnibus):,0.331,Jarque-Bera (JB):,1.717
Skew:,-0.359,Prob(JB):,0.424
Kurtosis:,2.226,Cond. No.,11400.0


In [23]:
# Regress NAO on all 18 candidate predictors: PC1-PC3 of z70, ci and sst,
# for September (9) and October (10).
all_terms = ['PC%d_%s_%d' % (pc, var, mon)
             for mon in (9, 10)
             for var in ('z70', 'ci', 'sst')
             for pc in (1, 2, 3)]
est = smf.ols('NAO ~ ' + ' + '.join(all_terms), dat_df).fit()
est.summary()

0,1,2,3
Dep. Variable:,NAO,R-squared:,0.799
Model:,OLS,Adj. R-squared:,0.598
Method:,Least Squares,F-statistic:,3.969
Date:,"Sat, 24 Feb 2018",Prob (F-statistic):,0.0027
Time:,17:07:46,Log-Likelihood:,-262.5
No. Observations:,37,AIC:,563.0
Df Residuals:,18,BIC:,593.6
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,13.9593,70.374,0.198,0.845,-133.891,161.810
PC1_z70_9,-0.0054,0.008,-0.673,0.509,-0.022,0.011
PC2_z70_9,0.0098,0.017,0.567,0.578,-0.026,0.046
PC3_z70_9,0.0360,0.017,2.093,0.051,-0.000,0.072
PC1_ci_9,286.1949,322.535,0.887,0.387,-391.425,963.815
PC2_ci_9,-198.6332,182.055,-1.091,0.290,-581.117,183.851
PC3_ci_9,-70.9907,372.821,-0.190,0.851,-854.258,712.276
PC1_sst_9,-82.3047,32.527,-2.530,0.021,-150.641,-13.968
PC2_sst_9,-48.2848,25.727,-1.877,0.077,-102.336,5.766

0,1,2,3
Omnibus:,0.394,Durbin-Watson:,2.07
Prob(Omnibus):,0.821,Jarque-Bera (JB):,0.552
Skew:,-0.173,Prob(JB):,0.759
Kurtosis:,2.512,Cond. No.,180000.0


## Regularization / Lasso Model Selection

In [122]:
# Predictors:
# 'NAO ~ PC1_ci_10 + PC2_z70_10 + PC3_sst_9' # Wang
# All candidate predictors; the order of this array is the column order of X
# and therefore the order of any coefficient vector fitted on it.
predNames = np.array(['PC1_z70_9',
 'PC2_z70_9',
 'PC3_z70_9', # selected by Lasso
 'PC1_ci_9',
 'PC2_ci_9',
 'PC3_ci_9',
 'PC1_sst_9',
 'PC2_sst_9',
 'PC3_sst_9', # selected by Lasso
 'PC1_z70_10', # selected by Lasso 
 'PC2_z70_10', # selected by Lasso & as in Wang et al. 2010
 'PC3_z70_10',
 'PC1_ci_10', # selected by Lasso & as in Wang et al. 2010
 'PC2_ci_10',
 'PC3_ci_10',
 'PC1_sst_10',
 'PC2_sst_10',
 'PC3_sst_10'])
# .values replaces DataFrame.as_matrix(), which is deprecated and no longer
# exists in current pandas; both return the same NumPy array.
X = dat_df[predNames].values
# Target Variables:
y = dat_df.NAO

In [123]:
from sklearn.preprocessing import StandardScaler
# Before applying the Lasso, it is necessary to standardize the predictors:
# the scaler is fitted on X and applied to it in a single step.
scaler = StandardScaler()
X_stan = scaler.fit_transform(X)

In [124]:
# Lasso Regression with fixed penalty term lambda=150:
# We see that all predictors but three have been shrunk to null:
# The class is reached through skl_lm, the alias under which sklearn.linear_model
# is imported at the top of the notebook; the bare name `linear_model` is never
# imported and raises a NameError on a fresh kernel.
clf = skl_lm.Lasso(alpha=150)
clf.fit(X_stan, y)
print(clf.coef_)

[  -0.            0.           79.24917344   -0.           -0.            0.
   -0.           -0.            0.           -0.          -92.03940494
    0.         -178.33924523   -0.           -0.           -0.           -0.
    0.        ]


In [125]:
# In order to find the optimal penalty parameter alpha,
# use Cross-validated Lasso
from sklearn.linear_model import LassoCV, LassoLarsCV, LassoLarsIC
#modlcv = LassoLarsIC(criterion='aic')
modlcv = LassoCV(cv=3, n_alphas=10000)
modlcv.fit(X_stan, y)
# Read the selected penalty from the estimator fitted just above. The name `model`
# used here previously is not defined anywhere in the notebook and fails on a
# fresh kernel.
alpha = modlcv.alpha_
alpha # Optimal penalty

19.576129756642523

In [126]:
# Non-zero predictors:
# Coefficients at the cross-validated penalty, in the order of predNames;
# entries equal to zero are predictors the Lasso dropped.
modlcv.coef_

array([  -0.        ,    0.        ,  143.76772639,    0.        ,
         -0.        ,    0.        ,   -0.        ,   -0.        ,
         14.57226511,  -27.72196345, -119.30866149,    0.        ,
       -264.4612583 ,    0.        ,   -0.        ,   -0.        ,
         -0.        ,    0.        ])

In [127]:
# Model R^2 :
# (in-sample: scored on the same standardized data the model was fitted on)
modlcv.score(X_stan, y)

0.48147098522658272

In [137]:
# 'NAO ~ PC1_ci_10 + PC2_z70_10 + PC3_sst_9' # Wang
# Name Of the non-null coefficients:
# Compare the coefficients with zero directly. Going through int() truncates
# towards zero, so a non-zero coefficient with |coef| < 1 would be reported
# as dropped.
ind = modlcv.coef_ != 0
importance_df = pd.DataFrame({'pred': predNames[ind], 
                              'coef': modlcv.coef_[ind]})
# According to the Lasso, the 3 strongest predictors are:
# PC3_z70_9, PC2_z70_10, PC1_ci_10
importance_df

         coef        pred
0  143.767726   PC3_z70_9
1   14.572265   PC3_sst_9
2  -27.721963  PC1_z70_10
3 -119.308661  PC2_z70_10
4 -264.461258   PC1_ci_10

In [141]:
# Let's repeat the linear regression using the predictors suggested by the Lasso:
# (the three predictors with the largest absolute Lasso coefficients in importance_df)
#est = smf.ols('NAO ~ PC1_ci_10 + PC2_z70_10 + PC3_sst_9', dat_df).fit() # Wang
est = smf.ols('NAO ~ PC3_z70_9 + PC2_z70_10 + PC1_ci_10', dat_df).fit()
est.summary()

0,1,2,3
Dep. Variable:,NAO,R-squared:,0.514
Model:,OLS,Adj. R-squared:,0.47
Method:,Least Squares,F-statistic:,11.63
Date:,"Sat, 24 Feb 2018",Prob (F-statistic):,2.34e-05
Time:,20:23:33,Log-Likelihood:,-278.82
No. Observations:,37,AIC:,565.6
Df Residuals:,33,BIC:,572.1
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,10.3703,79.248,0.131,0.897,-150.861,171.601
PC3_z70_9,0.0366,0.013,2.908,0.006,0.011,0.062
PC2_z70_10,-0.0158,0.007,-2.159,0.038,-0.031,-0.001
PC1_ci_10,-240.5642,55.894,-4.304,0.000,-354.281,-126.847

0,1,2,3
Omnibus:,3.957,Durbin-Watson:,2.238
Prob(Omnibus):,0.138,Jarque-Bera (JB):,1.795
Skew:,-0.172,Prob(JB):,0.408
Kurtosis:,1.977,Cond. No.,11600.0


In [139]:
# Multiple correlation coefficient R = sqrt(R^2) of the OLS model held in `est`.
# The value is read from the fit instead of being copied by hand from the
# summary table, so it stays correct when the model changes.
np.sqrt(est.rsquared)

0.71693793315739685

In [None]:
# Next step: define a third region for SST in the North Atlantic, as in the Promet publication.