# Exploration of the ERA-int monthly data collection

[Era-int hires Database Exploration](#dbhires)

[Indexes](#indexes)

[SLP plot](#slpplot)

[Era-int Database Exploration](#db)

In [1]:
from netCDF4 import Dataset
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, addcyclic, shiftgrid
import pymongo
from pprint import pprint
from datetime import datetime, timedelta, date
import pandas as pd

In [2]:
mongo_host_local = 'mongodb://localhost:27017/'
mg = pymongo.MongoClient(mongo_host_local)

In [3]:
pprint(mg.database_names())

['ECMWF', 'local']


In [4]:
db = mg.ECMWF
db.collection_names()

['ERAINT_grid',
 'ERAINT_lores_grid',
 'ERAINT_lores_monthly',
 'ERAINT_monthly',
 'system.indexes']

## ERA-int Database exploration <a id='db'></a>

In [5]:
ERA_vers = 'lores'

if (ERA_vers == 'hires'):
    col_dat = 'ERAINT_monthly'
    col_grid = 'ERAINT_grid'
    resolution = 0.25
elif (ERA_vers == 'lores'):
    col_dat = 'ERAINT_lores_monthly'
    col_grid = 'ERAINT_lores_grid'
    resolution = 2.5

In [7]:
con_grid = db[col_grid]
fo = con_grid.find_one()

In [8]:
pprint(fo)

{'_id': ObjectId('5a1882228cb6b815bce38f92'),
 'id_grid': 1,
 'loc': {'coordinates': [-180.0, 90.0], 'type': 'Point'}}


In [9]:
con_grid.count()

10512

In [10]:
con_data = db[col_dat]
fo = con_data.find_one()
pprint(fo)

{'_id': ObjectId('5a1884c08cb6b817b38309c4'),
 'al': 0.07,
 'ci': 1.0,
 'd2m': 239.53,
 'date': datetime.datetime(1979, 1, 1, 0, 0),
 'hcc': 0.2,
 'id_grid': 1,
 'istl1': 242.55,
 'lcc': 0.51,
 'mcc': 0.15,
 'msl': 102793.31,
 'si10': 5.65,
 'skt': 242.63,
 'sp': 102792.57,
 'sst': 271.46,
 'stl1': 242.63,
 't2m': 242.7,
 'u10': 0.95,
 'v10': 0.28,
 'year': 1979}


In [11]:
this_day = datetime(1995,1 ,1)
for doc in con_data.find({'date': this_day}).limit(2):
    pprint(doc)

{'_id': ObjectId('5a1885d48cb6b817b58d4e04'),
 'al': 0.07,
 'ci': 1.0,
 'd2m': 239.33,
 'date': datetime.datetime(1995, 1, 1, 0, 0),
 'hcc': 0.29,
 'id_grid': 1,
 'istl1': 242.21,
 'lcc': 0.52,
 'mcc': 0.19,
 'msl': 102342.05,
 'si10': 4.96,
 'skt': 242.22,
 'sp': 102342.02,
 'sst': 271.46,
 'stl1': 242.24,
 't2m': 242.46,
 'u10': -0.5,
 'v10': 2.28,
 'year': 1995}
{'_id': ObjectId('5a1885d48cb6b817b58d4e05'),
 'al': 0.07,
 'ci': 1.0,
 'd2m': 239.33,
 'date': datetime.datetime(1995, 1, 1, 0, 0),
 'hcc': 0.29,
 'id_grid': 2,
 'istl1': 242.21,
 'lcc': 0.52,
 'mcc': 0.19,
 'msl': 102342.05,
 'si10': 4.96,
 'skt': 242.22,
 'sp': 102342.02,
 'sst': 271.46,
 'stl1': 242.24,
 't2m': 242.46,
 'u10': -0.39,
 'v10': 2.26,
 'year': 1995}


In [12]:
# How many documents are in the daily data collection
con_data.count()

4877568

In [13]:
alldays = con_data.distinct(key='date')
alldays

[datetime.datetime(1979, 1, 1, 0, 0),
 datetime.datetime(1979, 2, 1, 0, 0),
 datetime.datetime(1979, 3, 1, 0, 0),
 datetime.datetime(1979, 4, 1, 0, 0),
 datetime.datetime(1979, 5, 1, 0, 0),
 datetime.datetime(1979, 6, 1, 0, 0),
 datetime.datetime(1979, 7, 1, 0, 0),
 datetime.datetime(1979, 8, 1, 0, 0),
 datetime.datetime(1979, 9, 1, 0, 0),
 datetime.datetime(1979, 10, 1, 0, 0),
 datetime.datetime(1979, 11, 1, 0, 0),
 datetime.datetime(1979, 12, 1, 0, 0),
 datetime.datetime(1980, 1, 1, 0, 0),
 datetime.datetime(1980, 2, 1, 0, 0),
 datetime.datetime(1980, 3, 1, 0, 0),
 datetime.datetime(1980, 4, 1, 0, 0),
 datetime.datetime(1980, 5, 1, 0, 0),
 datetime.datetime(1980, 6, 1, 0, 0),
 datetime.datetime(1980, 7, 1, 0, 0),
 datetime.datetime(1980, 8, 1, 0, 0),
 datetime.datetime(1980, 9, 1, 0, 0),
 datetime.datetime(1980, 10, 1, 0, 0),
 datetime.datetime(1980, 11, 1, 0, 0),
 datetime.datetime(1980, 12, 1, 0, 0),
 datetime.datetime(1981, 1, 1, 0, 0),
 datetime.datetime(1981, 2, 1, 0, 0),
 datet

## Data collection indexes <a id='indexes'></a>

In [14]:
ind = con_grid.index_information()
pprint(ind)

{'_id_': {'key': [('_id', 1)], 'ns': 'ECMWF.ERAINT_lores_grid', 'v': 1},
 'loc_2dsphere_id_grid_1': {'2dsphereIndexVersion': 2,
                            'key': [('loc', '2dsphere'), ('id_grid', 1)],
                            'ns': 'ECMWF.ERAINT_lores_grid',
                            'v': 1}}


In [16]:
con_data.index_information()

{'_id_': {'key': [('_id', 1)], 'ns': 'ECMWF.ERAINT_lores_monthly', 'v': 1},
 'date_-1': {'key': [('date', -1)],
  'ns': 'ECMWF.ERAINT_lores_monthly',
  'v': 1},
 'id_grid_1_date_-1': {'key': [('id_grid', 1), ('date', -1)],
  'ns': 'ECMWF.ERAINT_lores_monthly',
  'v': 1},
 'year_1_id_grid_1': {'key': [('year', 1), ('id_grid', 1)],
  'ns': 'ECMWF.ERAINT_lores_monthly',
  'v': 1}}

# Plot SLP <a id='slpplot'></a>

In [17]:
con_grid.find_one()

{'_id': ObjectId('5a1882228cb6b815bce38f92'),
 'id_grid': 1,
 'loc': {'coordinates': [-180.0, 90.0], 'type': 'Point'}}

- The domain north of 20°N is chosen for EOF decomposition for Z70 hPa and SIC to focus on the extra-tropical variability 
- The area north of 20°S is chosen for SST to also include key regions of tropical SST variability

In [18]:
slp_poly = [[180,20], [180,90],[-180,90], [-180,20],[180,20]] #[[-180,20],[-180,0],[180,0],[-180,0],[-180,20]]

In [19]:
this_polygon = slp_poly
geo_qry = {"loc": 
           {"$geoWithin": {
               "$geometry": {
                   "type": "Polygon","coordinates": [this_polygon]}}}}
con_grid.count(filter = geo_qry)

172

In [20]:
res = con_grid.find(filter = geo_qry, projection = {"_id":0, "id_grid": 1, "loc": 1})
grid_df = pd.DataFrame(list(res))
grid_ids = grid_df.id_grid.values
grid_ids[:10]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

In [21]:
# Query SLP in the data collection for a given day for the grid_ids
res = con_data.find(filter = {
    "date": datetime(1988, 12, 1, 0, 0),
    "id_grid": {"$in": grid_ids.tolist()}},
                       projection = {"_id":0, "id_grid": 1, "msl": 1})
slp_df = pd.DataFrame(list(res))
slp_df.head()

Unnamed: 0,id_grid,msl
0,1,99672.65
1,2,99672.65
2,3,99672.65
3,4,99672.65
4,5,99672.65


In [22]:
grid_df.head()

Unnamed: 0,id_grid,loc
0,1,"{'type': 'Point', 'coordinates': [-180.0, 90.0]}"
1,2,"{'type': 'Point', 'coordinates': [-177.5, 90.0]}"
2,3,"{'type': 'Point', 'coordinates': [-175.0, 90.0]}"
3,4,"{'type': 'Point', 'coordinates': [-172.5, 90.0]}"
4,5,"{'type': 'Point', 'coordinates': [-170.0, 90.0]}"


In [23]:
# Merge slp_df with grid_df:
slp_dat = pd.merge(left=slp_df, right=grid_df, on = "id_grid")
slp_dat.head()

Unnamed: 0,id_grid,msl,loc
0,1,99672.65,"{'type': 'Point', 'coordinates': [-180.0, 90.0]}"
1,2,99672.65,"{'type': 'Point', 'coordinates': [-177.5, 90.0]}"
2,3,99672.65,"{'type': 'Point', 'coordinates': [-175.0, 90.0]}"
3,4,99672.65,"{'type': 'Point', 'coordinates': [-172.5, 90.0]}"
4,5,99672.65,"{'type': 'Point', 'coordinates': [-170.0, 90.0]}"


In [24]:
con_grid.find_one()

{'_id': ObjectId('5a1882228cb6b815bce38f92'),
 'id_grid': 1,
 'loc': {'coordinates': [-180.0, 90.0], 'type': 'Point'}}

In [25]:
import GDAL as gdl

ModuleNotFoundError: No module named 'GDAL'