# Station Prediction

In [2]:
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, addcyclic, shiftgrid
import pymongo
from pprint import pprint
from datetime import datetime, timedelta, date
import pandas as pd
from sklearn.decomposition import PCA
import sklearn.linear_model as skl_lm
import gdal as gdl
import matplotlib.mlab as ml
import cartopy.crs as ccrs
import plotly.graph_objs as go
import plotly.offline as py
# Initialise plotly's offline notebook mode (original note: "for live plot").
py.init_notebook_mode(connected=True)
# Turn off the HTML representation of pandas objects in notebook output,
# so DataFrames are shown as plain text.
pd.set_option('display.notebook_repr_html', False)
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Apply the 'seaborn-white' matplotlib style sheet to all following figures.
# NOTE(review): newer Matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-white') - confirm against the installed version.
plt.style.use('seaborn-white')

In [3]:
# Connection string of the MongoDB server (localhost, port 27017).
mongo_host_local = 'mongodb://localhost:27017/'
# Client handle through which the following cells reach the database.
mg = pymongo.MongoClient(mongo_host_local)

In [4]:
# Select the 'ECMWF' database on the connected client.
db = mg.ECMWF
# Display the collections available in that database.
# NOTE(review): collection_names() is deprecated in newer PyMongo releases in
# favour of list_collection_names() - confirm against the installed driver.
db.collection_names()

['system.indexes',
 'ERAINT_grid',
 'ERAINT_lores_grid',
 'ERAINT_lores_monthly_anom',
 'ERAINT_monthly',
 'ERAINT_lores_monthly']

In [5]:
# Select which version of the ERA data the notebook works with.
# The choice fixes the collection names (monthly data, monthly anomalies, grid)
# and the grid resolution (0.25 or 2.5 - presumably degrees; TODO confirm).
ERA_vers = 'lores'
if ERA_vers == 'hires':
    col_dat = 'ERAINT_monthly'
    # NOTE(review): the collection listing shown earlier in this notebook does
    # not include 'ERAINT_monthly_anom' - confirm it exists before using 'hires'.
    col_anom = 'ERAINT_monthly_anom'
    col_grid = 'ERAINT_grid'
    resolution = 0.25
elif ERA_vers == 'lores':
    col_dat = 'ERAINT_lores_monthly'
    col_anom = 'ERAINT_lores_monthly_anom'
    col_grid = 'ERAINT_lores_grid'
    resolution = 2.5
else:
    # Fail here with a clear message instead of a NameError on col_grid below.
    raise ValueError("Unknown ERA_vers %r; expected 'hires' or 'lores'" % (ERA_vers,))

# Collection handles used by the rest of the notebook.
con_grid = db[col_grid]
con_anom = db[col_anom]

## Names of variables:

* 'ci':  Sea-ice cover [0-1]
* 'sst': Sea surface temperature [K]
* 'istl1': Ice temp layer1 [K]
* 'sp': Surface pressure [Pa]
* 'stl1': Soil temp lev1 [K]
* 'msl': Mean SLP [Pa]
* 'u10': wind-u [m/s]
* 'v10': wind-v [m/s]
* 't2m': 2m temp [K]
* 'd2m': 2m dewpoint temp.[K]
* 'al': Surface albedo [0-1]
* 'lcc': Low cloud cover [0-1]
* 'mcc': Medium cloud cover [0-1]
* 'hcc': High cloud cover [0-1]
* 'si10': 10m wind speed [m/s]
* 'skt': Skin temperature [K]
* 'blh': Boundary layer hgt [m]
* 'ishf': Instantaneous surface sensible heat flux [W/m2]
* 'ie': Instantaneous moisture flux [kg*m^-2*s^-1]
* 'z70': Geopot. height @70hPa [m]

In [6]:
# Names of candidate variables:
# Fetch a single document from the anomaly collection with the bookkeeping
# fields (_id, year, month, date, id_grid) excluded; the remaining fields are
# taken to be the variables.
# NOTE(review): assumes every document carries the same set of variable fields
# as the one document sampled here - confirm.
fo0 = con_anom.find({}, {'_id': 0, 'year': 0, 'month': 0, 'date': 0, 'id_grid': 0}).limit(1)
fo_df = pd.DataFrame(list(fo0))
# list() of a DataFrame yields its column labels, i.e. the variable names.
all_varnames = list(fo_df)
all_varnames

['al',
 'blh',
 'ci',
 'd2m',
 'hcc',
 'ie',
 'ishf',
 'istl1',
 'lcc',
 'mcc',
 'msl',
 'si10',
 'skt',
 'sp',
 'sst',
 'stl1',
 't2m',
 'u10',
 'v10',
 'z70']

# Get Target Variables

### GHCN Monthly Data

* ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/v3/
* "QCA" files represent the quality controlled adjusted data.

# Get Predictors

In [7]:
# Query the anomalies of one variable for each of the input grid cells
def queryAnom(this_variable, this_grid_df, months=(9, 10, 11, 12, 1, 2), collection=None):
    """Fetch the anomalies of one variable for a set of grid cells.

    Parameters
    ----------
    this_variable : str
        Name of the document field to retrieve (e.g. 't2m').
    this_grid_df : pandas.DataFrame
        Frame exposing an ``id_grid`` column; its values select the grid cells.
    months : iterable of int, optional
        Calendar months (1-12) to keep, compared with the month extracted from
        each document's ``date`` field. Defaults to September through February,
        the selection that used to be hard-coded.
    collection : optional
        Object providing ``aggregate(pipeline=...)``. When omitted, the
        module-level ``con_anom`` collection is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per matching document, built from the projected fields
        ``id_grid``, ``date`` and ``this_variable``. When nothing matches, an
        empty frame with exactly those three columns is returned.
    """
    if collection is None:
        # Resolved at call time so the notebook-level default keeps working.
        collection = con_anom

    # tolist() turns numpy scalars into plain Python values for the query.
    grid_ids = this_grid_df.id_grid.values.tolist()

    pipeline = [
        # Derive the calendar month from the date so it can be filtered on.
        {"$project": {"id_grid": 1, "date": 1, this_variable: 1,
                      "month": {"$month": "$date"}}},
        {"$match": {"month": {"$in": list(months)},
                    "id_grid": {"$in": grid_ids}}},
        # Drop the helper month field and the Mongo _id from the result.
        {"$project": {"_id": 0, "id_grid": 1, "date": 1, this_variable: 1}},
    ]
    records = list(collection.aggregate(pipeline=pipeline))

    if not records:
        # Keep the expected columns so downstream code can still address them.
        return pd.DataFrame(columns=["id_grid", "date", this_variable])
    return pd.DataFrame(records)