# NAO Prediction

In [149]:
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, addcyclic, shiftgrid
import pymongo
from pprint import pprint
from datetime import datetime, timedelta, date
import pandas as pd
from sklearn.decomposition import PCA
import sklearn.linear_model as skl_lm
import gdal as gdl
import matplotlib.mlab as ml
import cartopy.crs as ccrs
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True) # for live plot
pd.set_option('display.notebook_repr_html', False)
%matplotlib inline
plt.style.use('seaborn-white')

In [2]:
mongo_host_local = 'mongodb://localhost:27017/'
mg = pymongo.MongoClient(mongo_host_local)

In [3]:
db = mg.ECMWF
db.collection_names()

['system.indexes',
 'ERAINT_grid',
 'ERAINT_lores_grid',
 'ERAINT_lores_monthly_anom',
 'ERAINT_monthly',
 'ERAINT_lores_monthly']

In [4]:
ERA_vers = 'lores'
if (ERA_vers == 'hires'):
    col_dat = 'ERAINT_monthly'
    col_anom = 'ERAINT_monthly_anom'
    col_grid = 'ERAINT_grid'
    resolution = 0.25
elif (ERA_vers == 'lores'):
    col_dat = 'ERAINT_lores_monthly'
    col_anom = 'ERAINT_lores_monthly_anom'
    col_grid = 'ERAINT_lores_grid'
    resolution = 2.5

# Construct NAO index

In [5]:
# Query grid cells for NAO calculation
con_grid = db[col_grid]
poly1 = [list(reversed([ [-50,25], [-50,55], [10,55],[ 10,25], [-50,25]]))]
poly2 = [list(reversed([ [-40, 55], [-40, 85], [20, 85], [20, 55], [-40, 55]]))]
def getGridIds(this_polygon):
    geo_qry = {"loc": 
               {"$geoWithin": {
                   "$geometry": {
                       "type": "Polygon",
                       "coordinates": this_polygon
                   }
               }}}

    res = con_grid.find(filter = geo_qry, projection = {"_id":0, "id_grid": 1, "loc": 1})
    grid_df = pd.DataFrame(list(res))
    return grid_df
grid_df1 = getGridIds(poly1)
grid_ids1 = grid_df1.id_grid.values
grid_df2 = getGridIds(poly2)
grid_ids2 = grid_df2.id_grid.values

In [6]:
# Get yearly DJF averages over the two NAO nodal locations

con_anom = db[col_anom]

def setWinterYear(date): # December belong to next year's winter
    mon=date.month
    yr=date.year
    if mon >= 9:
        res = yr+1
    else:
        res = yr
    return res

def getMSL(this_grid_ids):
    this_msl = con_anom.aggregate(pipeline=[
        {"$match": {"id_grid": {"$in": this_grid_ids.tolist()}}},
        {"$group": {"_id": "$date", "mean": {"$avg": "$msl"} }},
        {"$project": {"date": "$_id", 
                      "_id": 0, 
                      "msl": "$mean"}}])
    this_msl_df = pd.DataFrame(list(this_msl))
    this_msl_df = this_msl_df.assign(
            month=list(map(lambda x: x.month, this_msl_df.date)),
            wyear=list(map(lambda x: setWinterYear(x), this_msl_df.date)) ).pipe(
    lambda df: df.query("month in [12, 1, 2]") ).pipe(
    lambda df: df.groupby("wyear").mean().reset_index())
    return this_msl_df
    
msl1_df = getMSL(this_grid_ids = grid_ids1).rename(columns={'msl': 'msl1'})
msl2_df = getMSL(this_grid_ids = grid_ids2).rename(columns={'msl': 'msl2'})
msl_df = pd.merge(msl1_df, msl2_df)
msl_df = msl_df.assign(NAO = msl_df.msl2-msl_df.msl1).sort_values('wyear', ascending=True).reset_index(drop=True)
# Get rid of the first year (1979) and the last (2017) because the winter month are not complete
msl_df = msl_df.query("(wyear > 1979) & (wyear <2017)")
msl_df.head()

   wyear        msl1  month        msl2         NAO
1   1980  152.977452    5.0  -98.370800 -251.348252
2   1981 -444.747185    5.0  117.529424  562.276609
3   1982  336.279706    5.0 -377.140170 -713.419876
4   1983 -342.535287    5.0  378.676276  721.211563
5   1984 -220.904942    5.0  330.200922  551.105864

In [7]:
# Plot ts
data = [go.Scatter(x=msl_df['wyear'], y=msl_df['NAO'] )]
py.iplot(data, filename='pandas-time-series')

# Get PCA scores

In [8]:
# Generic function to query grid ids above a given latitude
def genCircle(start_lon, stop_lon, lat, decreasing): 
    res = map(lambda x:[int(x), lat],
              sorted(np.arange(start=start_lon, stop=stop_lon+1), reverse=decreasing))
    return list(res)

def queryGrids(aboveLat):
    this_box = {'lonmin': -180, 'lonmax': 180, 'latmin': aboveLat, 'latmax': 90}
    circle_north_pos = genCircle(start_lon = this_box['lonmin'], stop_lon = this_box['lonmax'], 
                                  lat = this_box['latmax'], decreasing = False)
    circle_south_neg = genCircle(start_lon = this_box['lonmin'], stop_lon = this_box['lonmax'], 
                                lat = this_box['latmin'],  decreasing = True)
    slp_poly = [[this_box['lonmin'], this_box['latmin']]]
    slp_poly.extend(circle_north_pos)
    slp_poly.extend(circle_south_neg)
    this_polygon = slp_poly
    
    if aboveLat > 0:
        geo_qry = {"loc": 
               {"$geoWithin": {
                   "$geometry": {
                       "type": "Polygon",
                       "coordinates": [this_polygon]
               }}}}
    else: # case of a big polygon larger than one hemisphere
        geo_qry = {"loc": 
               {"$geoWithin": {
                   "$geometry": {
                       "type": "Polygon",
                       "coordinates": [list(reversed(this_polygon))], # the orientation matters
                       "crs": {
                           "type": "name", 
                           "properties": { "name": "urn:x-mongodb:crs:strictwinding:EPSG:4326" }
                       }
                   }
               }}}
        
    res = con_grid.find(filter = geo_qry, projection = {"_id":0, "id_grid": 1, "loc": 1})
    grid_df = pd.DataFrame(list(res))
    return grid_df

grid_df_20N = queryGrids(aboveLat=20)
grid_df_20S = queryGrids(aboveLat=-20)

In [9]:
def queryScores(this_variable, this_grid_df):
    # Query data anomalies
    grid_ids = this_grid_df.id_grid.values
    res = con_anom.aggregate(pipeline=[ 
    {"$project": {"id_grid": 1, "date": 1, this_variable: 1, "month": {"$month": "$date"}}},
    {"$match": {"month": {"$in": [9, 10, 11, 12, 1, 2]},
                "id_grid": {"$in": grid_ids.tolist()} }},
    {"$project": {"_id": 0, "id_grid": 1, "date": 1, this_variable: 1}} ])    
    anom_df = pd.DataFrame(list(res))

    # Get Principal Component Scores
    X_df = anom_df.pivot(index='date', columns='id_grid', values=this_variable)
    pca = PCA(n_components=3)
    df_scores = pd.DataFrame(pca.fit_transform(X_df), 
                             columns=['PC1_%s' % (this_variable), 
                                      'PC2_%s' % (this_variable), 
                                      'PC3_%s' % (this_variable)], 
                             index=X_df.index)
    return df_scores

scores_z70 = queryScores(this_variable='z70', this_grid_df=grid_df_20N)
scores_ci = queryScores(this_variable='ci', this_grid_df=grid_df_20N)
scores_sst = queryScores(this_variable='sst', this_grid_df=grid_df_20S)

In [10]:
scores_df = pd.merge(left=scores_z70, right=scores_ci, left_index=True, right_index=True).\
pipe(lambda df: pd.merge(df, scores_sst, left_index=True, right_index=True))
scores_df.reset_index(level=0, inplace=True)
scores_df = scores_df.assign(
    year=list(map(lambda x: x.year, scores_df.date)),
    wyear=list(map(lambda x: setWinterYear(x), scores_df.date)), 
    month=list(map(lambda x: x.month, scores_df.date)))
scores_df.tail()

          date       PC1_z70       PC2_z70       PC3_z70    PC1_ci    PC2_ci  \
225 2016-10-01  44299.331490  23068.206844  13441.926821  1.196213  0.790865   
226 2016-11-01  92403.226733  -5662.939549 -20588.211094  3.322215  0.966125   
227 2016-12-01  39836.996391  -8379.671980  26762.907225  2.777325 -0.220234   
228 2017-01-01 -39836.428742 -23745.659253  24101.644928  2.524970 -0.594460   
229 2017-02-01  -7208.191393  -6404.854152 -21445.130997  1.154286 -0.473464   

       PC3_ci   PC1_sst    PC2_sst    PC3_sst  year  wyear  month  
225  0.307352  0.546405 -18.423239  19.370803  2016   2017     10  
226  0.960893 -0.813587 -14.706474  24.183666  2016   2017     11  
227  0.510333 -0.182977  -8.269907  17.974406  2016   2017     12  
228  0.228384  2.921389  -6.787676  16.229876  2017   2017      1  
229  0.309103  8.464926  -4.510643  11.886872  2017   2017      2  

In [148]:
# Create the Predictor DataFrame

def renCol(x, mon):
    if ('PC' in x):
        z = '%s_%s' % (x, mon)
    else:
        z = x
    return z

def createMondf(this_mon, scores_df):
    mon_df = scores_df.query('month == 9')
    mon_df.columns = list(map(lambda x: renCol(x, mon=this_mon), list(mon_df)))
    mon_df = mon_df.drop(['date','year','month'], axis=1)
    return mon_df

sep_df = createMondf(this_mon=9, scores_df=scores_df)
oct_df = createMondf(this_mon=10, scores_df=scores_df)
X_df = pd.merge(sep_df, oct_df)

In [147]:
# Create Regression DataFrame
NAO_df = msl_df.drop(columns=['msl1', 'msl2', 'month'])
dat_df = pd.merge(NAO_df, X_df)
dat_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37 entries, 0 to 36
Data columns (total 20 columns):
wyear         37 non-null int64
NAO           37 non-null float64
PC1_z70_9     37 non-null float64
PC2_z70_9     37 non-null float64
PC3_z70_9     37 non-null float64
PC1_ci_9      37 non-null float64
PC2_ci_9      37 non-null float64
PC3_ci_9      37 non-null float64
PC1_sst_9     37 non-null float64
PC2_sst_9     37 non-null float64
PC3_sst_9     37 non-null float64
PC1_z70_10    37 non-null float64
PC2_z70_10    37 non-null float64
PC3_z70_10    37 non-null float64
PC1_ci_10     37 non-null float64
PC2_ci_10     37 non-null float64
PC3_ci_10     37 non-null float64
PC1_sst_10    37 non-null float64
PC2_sst_10    37 non-null float64
PC3_sst_10    37 non-null float64
dtypes: float64(19), int64(1)
memory usage: 6.1 KB


In [160]:
dat_df.head()

   wyear         NAO     PC1_z70_9    PC2_z70_9     PC3_z70_9  PC1_ci_9  \
0   1980 -251.348252  18023.465139  2794.736512  -2092.985110  2.963271   
1   1981  562.276609  -5422.104718  9689.567172   5922.630486  0.101697   
2   1982 -713.419876  19744.941093 -8460.604205   1635.209243 -0.538567   
3   1983  721.211563 -14530.470578  8700.189280  -4888.437956 -0.840921   
4   1984  551.105864  -5774.518779  2361.237107  13897.025591  0.980549   

   PC2_ci_9  PC3_ci_9  PC1_sst_9  PC2_sst_9  PC3_sst_9    PC1_z70_10  \
0  1.071407  1.375141   5.315733   5.144231  -0.101282  18023.465139   
1  0.728209  0.115517  -3.156064   0.330546  -3.889518  -5422.104718   
2  1.527360 -1.105422  -4.273700   2.978333  -4.130311  19744.941093   
3  1.564609 -0.486808  19.597934   7.319096  -4.275942 -14530.470578   
4  0.753572  0.899565   2.394603  -2.069814  19.714610  -5774.518779   

    PC2_z70_10    PC3_z70_10  PC1_ci_10  PC2_ci_10  PC3_ci_10  PC1_sst_10  \
0  2794.736512  -2092.985110   2.963271

# Regression

In [169]:
regr = skl_lm.LinearRegression()
X = dat_df[['PC1_ci_10', 
            'PC2_z70_10',
            'PC3_sst_9']].as_matrix()
y = dat_df.NAO
regr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [170]:
regr.score(X, y)

0.1261185309919487

In [157]:
import statsmodels.api as sm
import statsmodels.formula.api as smf


The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.



In [164]:
est = smf.ols('NAO ~ PC1_ci_10 + PC2_z70_10 + PC3_sst_9', dat_df).fit()
est.summary()

0,1,2,3
Dep. Variable:,NAO,R-squared:,0.126
Model:,OLS,Adj. R-squared:,0.047
Method:,Least Squares,F-statistic:,1.588
Date:,"Sun, 11 Feb 2018",Prob (F-statistic):,0.211
Time:,12:34:54,Log-Likelihood:,-289.67
No. Observations:,37,AIC:,587.3
Df Residuals:,33,BIC:,593.8
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.8559,106.073,0.225,0.823,-191.952,239.664
PC1_ci_10,-190.4072,103.107,-1.847,0.074,-400.181,19.366
PC2_z70_10,0.0251,0.018,1.422,0.164,-0.011,0.061
PC3_sst_9,16.6320,15.072,1.104,0.278,-14.032,47.296

0,1,2,3
Omnibus:,2.658,Durbin-Watson:,2.147
Prob(Omnibus):,0.265,Jarque-Bera (JB):,1.814
Skew:,-0.535,Prob(JB):,0.404
Kurtosis:,3.175,Cond. No.,6120.0


In [173]:
np.sqrt(0.126)

0.35496478698597694

In [176]:
regr = skl_lm.LinearRegression()
X = dat_df[['PC1_z70_9',
 'PC2_z70_9',
 'PC3_z70_9',
 'PC1_ci_9',
 'PC2_ci_9',
 'PC3_ci_9',
 'PC1_sst_9',
 'PC2_sst_9',
 'PC3_sst_9',
 'PC1_z70_10',
 'PC2_z70_10',
 'PC3_z70_10',
 'PC1_ci_10',
 'PC2_ci_10',
 'PC3_ci_10',
 'PC1_sst_10',
 'PC2_sst_10',
 'PC3_sst_10']].as_matrix()
y = dat_df.NAO
regr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [177]:
regr.score(X, y)

0.27357954145307639

In [178]:
np.sqrt(0.27)

0.51961524227066325