# NAO Prediction

In [24]:
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, addcyclic, shiftgrid
import pymongo
from pprint import pprint
from datetime import datetime, timedelta, date
import pandas as pd
from sklearn.decomposition import PCA
import gdal as gdl
import matplotlib.mlab as ml
import cartopy.crs as ccrs
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True) # enable inline plotly figures in the notebook
pd.set_option('display.notebook_repr_html', False) # show DataFrames as plain text rather than HTML tables
%matplotlib inline
plt.style.use('seaborn-white') # select the 'seaborn-white' matplotlib style sheet

In [2]:
# Connect to the MongoDB server on localhost, port 27017
mongo_host_local = 'mongodb://localhost:27017/'
mg = pymongo.MongoClient(mongo_host_local)

In [3]:
# Select the ECMWF database and list the collections it contains
db = mg.ECMWF
# NOTE(review): collection_names() is deprecated in newer PyMongo releases;
# confirm the installed version, or switch to list_collection_names() there.
db.collection_names()

['system.indexes',
 'ERAINT_grid',
 'ERAINT_lores_grid',
 'ERAINT_lores_monthly_anom',
 'ERAINT_monthly',
 'ERAINT_lores_monthly']

In [4]:
# ERA-Interim dataset version to work with: 'hires' or 'lores'
ERA_vers = 'lores'

# Collection names and grid resolution associated with each dataset version
ERA_SETTINGS = {
    'hires': {'col_dat': 'ERAINT_monthly',
              'col_anom': 'ERAINT_monthly_anom',
              'col_grid': 'ERAINT_grid',
              'resolution': 0.25},
    'lores': {'col_dat': 'ERAINT_lores_monthly',
              'col_anom': 'ERAINT_lores_monthly_anom',
              'col_grid': 'ERAINT_lores_grid',
              'resolution': 2.5},
}

# Fail here, with a clear message, instead of leaving the names below
# undefined and hitting a NameError in a later cell.
if ERA_vers not in ERA_SETTINGS:
    raise ValueError("Unknown ERA_vers %r; expected one of %s"
                     % (ERA_vers, sorted(ERA_SETTINGS)))

col_dat = ERA_SETTINGS[ERA_vers]['col_dat']        # monthly data collection
col_anom = ERA_SETTINGS[ERA_vers]['col_anom']      # monthly anomalies collection
col_grid = ERA_SETTINGS[ERA_vers]['col_grid']      # grid cells collection
resolution = ERA_SETTINGS[ERA_vers]['resolution']  # grid resolution

# Construct NAO index

In [5]:
# Grid collection and the two polygons used for the NAO calculation.
# Each polygon is a list holding one closed ring (first vertex == last vertex);
# the vertex order of each ring is reversed before it is used in the query.
con_grid = db[col_grid]
poly1 = [[[-50, 25], [-50, 55], [10, 55], [10, 25], [-50, 25]][::-1]]
poly2 = [[[-40, 55], [-40, 85], [20, 85], [20, 55], [-40, 55]][::-1]]
def getGridIds(this_polygon):
    """Query the grid collection for cells whose ``loc`` lies within a polygon.

    Parameters
    ----------
    this_polygon : list
        Value passed as the ``coordinates`` of a GeoJSON ``Polygon``
        (a list of rings).

    Returns
    -------
    pandas.DataFrame
        One row per matching document, built from the projected
        ``id_grid`` and ``loc`` fields.
    """
    geometry = {"type": "Polygon", "coordinates": this_polygon}
    geo_qry = {"loc": {"$geoWithin": {"$geometry": geometry}}}
    wanted_fields = {"_id": 0, "id_grid": 1, "loc": 1}

    cursor = con_grid.find(filter=geo_qry, projection=wanted_fields)
    return pd.DataFrame(list(cursor))
# Grid cells inside each NAO polygon, and the arrays of their ids
grid_df1, grid_df2 = getGridIds(poly1), getGridIds(poly2)
grid_ids1, grid_ids2 = grid_df1.id_grid.values, grid_df2.id_grid.values

In [20]:
# Get yearly DJF (December-January-February) averages over the two NAO nodal locations

# Anomalies collection for the selected ERA version
con_anom = db[col_anom]

def setWinterYear(date):
    """Return the winter year that a date belongs to.

    December is counted towards the following year's winter; any other
    month keeps its calendar year.

    Parameters
    ----------
    date : object with ``month`` and ``year`` attributes

    Returns
    -------
    int
        ``date.year + 1`` for December, ``date.year`` otherwise.
    """
    return date.year + 1 if date.month == 12 else date.year

def getMSL(this_grid_ids):
    """Average ``msl`` over a set of grid cells, then over each winter (DJF).

    Parameters
    ----------
    this_grid_ids : numpy.ndarray
        Grid ids to include (converted with ``.tolist()`` for the query).

    Returns
    -------
    pandas.DataFrame
        One row per winter year (``wyear``), holding the mean of the
        remaining columns over the December, January and February records.
    """
    # Per-date average of msl across the requested grid cells
    pipeline = [
        {"$match": {"id_grid": {"$in": this_grid_ids.tolist()}}},
        {"$group": {"_id": "$date", "mean": {"$avg": "$msl"}}},
        {"$project": {"date": "$_id",
                      "_id": 0,
                      "msl": "$mean"}},
    ]
    monthly_df = pd.DataFrame(list(con_anom.aggregate(pipeline=pipeline)))

    # Tag every record with its calendar month and its winter year
    monthly_df = monthly_df.assign(
        month=[d.month for d in monthly_df.date],
        wyear=[setWinterYear(d) for d in monthly_df.date])

    # Keep the winter months only and average them per winter year
    winter_df = monthly_df.query("month in [12, 1, 2]")
    return winter_df.groupby("wyear").mean().reset_index()
    
# Winter-mean msl for each of the two NAO nodes
msl1_df = getMSL(this_grid_ids=grid_ids1).rename(columns={'msl': 'msl1'})
msl2_df = getMSL(this_grid_ids=grid_ids2).rename(columns={'msl': 'msl2'})

# NAO index = msl2 - msl1, ordered by winter year.
# The first (1979) and last (2017) winter years are dropped because their
# winter months are not complete.
msl_df = (
    pd.merge(msl1_df, msl2_df)
    .assign(NAO=lambda df: df.msl2 - df.msl1)
    .sort_values('wyear', ascending=True)
    .reset_index(drop=True)
    .query("(wyear > 1979) & (wyear <2017)")
)
msl_df.head()

   wyear        msl1  month        msl2         NAO
1   1980  152.977452    5.0  -98.370800 -251.348252
2   1981 -444.747185    5.0  117.529424  562.276609
3   1982  336.279706    5.0 -377.140170 -713.419876
4   1983 -342.535287    5.0  378.676276  721.211563
5   1984 -220.904942    5.0  330.200922  551.105864

In [21]:
# Plot the winter NAO index as a time series.
# A title and axis labels are set so the figure can be read on its own.
data = [go.Scatter(x=msl_df['wyear'], y=msl_df['NAO'])]
layout = go.Layout(title='Winter (DJF) NAO index',
                   xaxis=dict(title='Winter year'),
                   yaxis=dict(title='NAO index'))
py.iplot(go.Figure(data=data, layout=layout), filename='pandas-time-series')

# Get PCA scores

In [22]:
# Helpers to query grid ids above a given latitude
def genCircle(start_lon, stop_lon, lat, decreasing):
    """Build the ``[lon, lat]`` vertices of a line of constant latitude.

    Parameters
    ----------
    start_lon, stop_lon : int
        Longitude bounds; one vertex is produced per unit step, with
        ``stop_lon`` included.
    lat : number
        Latitude shared by all vertices.
    decreasing : bool
        If true, vertices are ordered from the largest longitude to the
        smallest; otherwise from the smallest to the largest.

    Returns
    -------
    list of [int, lat] pairs
    """
    longitudes = sorted(np.arange(start=start_lon, stop=stop_lon+1), reverse=decreasing)
    return [[int(lon), lat] for lon in longitudes]

def queryGrids(aboveLat):
    """Return the grid cells located between latitude ``aboveLat`` and 90.

    The search area is a ring spanning longitudes -180..180, bounded by
    ``aboveLat`` and latitude 90, and is sent as a ``$geoWithin`` query on
    the ``loc`` field of the grid collection.

    Parameters
    ----------
    aboveLat : number
        Lower latitude bound of the search area.

    Returns
    -------
    pandas.DataFrame
        One row per matching document, built from the projected
        ``id_grid`` and ``loc`` fields.
    """
    lon_min, lon_max, lat_max = -180, 180, 90

    # Closed ring: start corner, then along lat_max with increasing longitude,
    # then back along aboveLat with decreasing longitude.
    ring = [[lon_min, aboveLat]]
    ring += genCircle(start_lon=lon_min, stop_lon=lon_max, lat=lat_max, decreasing=False)
    ring += genCircle(start_lon=lon_min, stop_lon=lon_max, lat=aboveLat, decreasing=True)

    geometry = {"type": "Polygon"}
    if aboveLat > 0:
        geometry["coordinates"] = [ring]
    else:
        # case of a big polygon larger than one hemisphere:
        # the orientation matters, so the ring is reversed and the
        # strict-winding CRS is requested
        geometry["coordinates"] = [list(reversed(ring))]
        geometry["crs"] = {
            "type": "name",
            "properties": {"name": "urn:x-mongodb:crs:strictwinding:EPSG:4326"},
        }

    geo_qry = {"loc": {"$geoWithin": {"$geometry": geometry}}}
    cursor = con_grid.find(filter=geo_qry, projection={"_id": 0, "id_grid": 1, "loc": 1})
    return pd.DataFrame(list(cursor))

# Grid cells north of latitude 20 and north of latitude -20
grid_df_20N = queryGrids(aboveLat=20)
grid_df_20S = queryGrids(aboveLat=-20)

In [47]:
def queryScores(this_variable, this_grid_df, n_components=3, random_state=0):
    """Compute principal-component scores of one anomaly variable.

    Anomalies of ``this_variable`` are read for the months September to
    February and the given grid cells, arranged as a date x grid-cell
    table, and reduced with PCA.

    Parameters
    ----------
    this_variable : str
        Name of the field to read from the anomalies collection.
    this_grid_df : pandas.DataFrame
        Grid cells to include; only its ``id_grid`` column is used.
    n_components : optional
        Passed to ``PCA``; defaults to 3, the value previously hard-coded.
    random_state : optional
        Passed to ``PCA`` so repeated runs give the same scores when a
        randomized solver is selected.

    Returns
    -------
    pandas.DataFrame
        Indexed by date, with one column per component named
        ``PC<k>_<this_variable>``.

    Raises
    ------
    ValueError
        If the query returns no records.
    """
    # Query data anomalies
    grid_ids = this_grid_df.id_grid.values
    res = con_anom.aggregate(pipeline=[
    {"$project": {"id_grid": 1, "date": 1, this_variable: 1, "month": {"$month": "$date"}}},
    {"$match": {"month": {"$in": [9, 10, 11, 12, 1, 2]},
                "id_grid": {"$in": grid_ids.tolist()} }},
    {"$project": {"_id": 0, "id_grid": 1, "date": 1, this_variable: 1}} ])
    anom_df = pd.DataFrame(list(res))
    if anom_df.empty:
        # Without this check the pivot below fails with an unhelpful KeyError
        raise ValueError("No '%s' anomalies found for the requested grid cells"
                         % this_variable)

    # Get Principal Component Scores
    X_df = anom_df.pivot(index='date', columns='id_grid', values=this_variable)
    pca = PCA(n_components=n_components, random_state=random_state)
    scores = pca.fit_transform(X_df)
    # Name columns from the number of components actually returned
    pc_names = ['PC%d_%s' % (k + 1, this_variable) for k in range(scores.shape[1])]
    df_scores = pd.DataFrame(scores, columns=pc_names, index=X_df.index)
    return df_scores

# PC scores of z70 and ci over the grid cells north of latitude 20,
# and of sst over the grid cells north of latitude -20
scores_z70 = queryScores(this_variable='z70', this_grid_df=grid_df_20N)
scores_ci = queryScores(this_variable='ci', this_grid_df=grid_df_20N)
scores_sst = queryScores(this_variable='sst', this_grid_df=grid_df_20S)

In [58]:
# Combine the PC scores of the three variables, aligned on their date index
scores_df = (
    scores_z70
    .merge(scores_ci, left_index=True, right_index=True)
    .merge(scores_sst, left_index=True, right_index=True)
)
scores_df.head()

                 PC1_z70       PC2_z70       PC3_z70    PC1_ci    PC2_ci  \
date                                                                       
1979-01-01  47305.944044 -21384.693316 -20280.483751 -0.069347  0.698034   
1979-02-01  27377.090406  -5627.629663 -24174.825459 -0.176903  0.691111   
1979-09-01  18023.465139   2794.736512  -2092.985110  2.963272  1.071387   
1979-10-01  14187.255820  -8848.225741  -3218.127655  2.152409  0.879816   
1979-11-01  27520.923158 -14059.658557  -5650.416198  2.057799 -0.180313   

              PC3_ci   PC1_sst   PC2_sst   PC3_sst  
date                                                
1979-01-01  1.563129 -3.052552  3.452704 -5.781001  
1979-02-01  1.377256 -6.587935  8.314202 -7.698400  
1979-09-01  1.375115  5.315733  5.144320 -0.100937  
1979-10-01 -0.169666  6.340731  3.405841  3.129901  
1979-11-01 -0.983080  3.580924  3.571717  7.026330  