In [1]:
from netCDF4 import Dataset, netcdftime, num2date, date2num, date2index
from datetime import datetime, timedelta, date
import pytz
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap, addcyclic, shiftgrid
import pymongo
from pymongo import IndexModel, ASCENDING, DESCENDING
from pprint import pprint
from os import listdir
import os
import pandas as pd
import fnmatch
import logging
from joblib import Parallel, delayed
import multiprocessing
from functools import partial

In [2]:
# Stamp the start of the job in the log.
startTime = datetime.now()
logging.info("%s %s:%s Job started",
             startTime.date(), startTime.hour, startTime.minute)

# Collect the NetCDF files matching the '*multivarm1*.nc' pattern, in name order.
downloadDir = '/home/dmasson/data/era-interim/'
files00 = listdir(downloadDir)
files = sorted(fnmatch.filter(files00, '*multivarm1*.nc'))

# What dates are already ingested in MongoDB ?
# MongoDB:
import sys
sys.path.insert(0, '/home/production/dev/')

mongo_host_local = 'mongodb://localhost:27017/'
mg = pymongo.MongoClient(mongo_host_local)

# Subscript access is equivalent to attribute access on pymongo clients/databases.
db = mg['ECMWF']
con_data = db['ERAINT_monthly']
# The lookup of already-ingested dates is switched off: the container stays
# empty, so nothing is excluded downstream.
datesInMongo = {}#con_data.distinct('date')
files

['era-int_multivarm1_1979-01-01_to_2017-08-31.nc']

In [3]:
def getDatesDF(nc_file):
    """Return the time steps of a NetCDF file that are not yet ingested.

    Reads the ``time`` variable of ``nc_file`` (looked up inside the
    module-level ``downloadDir``), decodes it with the variable's ``units``
    attribute and builds one row per time step.

    Parameters
    ----------
    nc_file : str
        File name relative to ``downloadDir``.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` (decoded time step), ``date`` (calendar date of
        that time step) and ``ndoc`` (number of time steps sharing the
        date). Rows whose date is found in the module-level
        ``datesInMongo`` are dropped.
    """
    logging.info("Inserting %s" % (nc_file))
    # os.path.join does not depend on downloadDir ending with a separator.
    nc_file00 = os.path.join(downloadDir, nc_file)
    fh = Dataset(nc_file00, mode='r')
    try:
        time_var = fh.variables['time']
        nctime = time_var[:]
        t_unit = time_var.units
    finally:
        # Release the handle even if the time variable or its units are missing.
        fh.close()
    time = num2date(nctime, units=t_unit)
    # Create a data frame
    df = pd.DataFrame({'time': time})
    df = df.assign(date=df.time.dt.date)
    # Count the time steps per date and attach the count to every row.
    gdf = df.groupby('date').size().rename('ndoc').reset_index()
    df2 = pd.merge(left=df, right=gdf, on="date")
    # exclude datesInMongo (data already ingested)
    DF = df2[~pd.to_datetime(df2.date).isin(datesInMongo)]
    return DF

In [6]:
# Take the first matching file, build its date table and preview the top rows.
this_file = files[0]
DF = getDatesDF(nc_file=this_file)
DF.head(n=5)

Unnamed: 0,time,date,ndoc
0,1979-01-01,1979-01-01,1
1,1979-02-01,1979-02-01,1
2,1979-03-01,1979-03-01,1
3,1979-04-01,1979-04-01,1
4,1979-05-01,1979-05-01,1


In [10]:
# One entry per distinct date in the table; position 45 is an arbitrary pick.
ncfile = this_file
days = DF['date'].drop_duplicates()
this_day = days.iloc[45]
this_day

datetime.date(1982, 10, 1)

In [11]:
# Prototype of what looks like a planned insertOneDay(this_day, ncfile, DF):
# extract every field for one arbitrary day out of the nc file.
logging.info(this_day)
ncfile00 = '%s%s' % (downloadDir, ncfile)
fh = Dataset(ncfile00, mode='r')
try:
    lons = fh.variables['longitude'][:]
    lats = fh.variables['latitude'][:]
    # Extract the data for this day out of the nc file
    times = DF[DF.date == this_day].time
    ind = date2index(dates=times.tolist(), nctime=fh.variables['time'])

    # NOTE: `vars` shadows the builtin of the same name; it is kept because
    # later cells read it under this name.
    vars = {name: fh.variables[name][ind] for name in (
        'ci',     # Sea-ice cover [0-1]
        'sst',    # Sea surface temperature [K]
        'istl1',  # Ice temp layer1 [K]
        'sp',     # Surface pressure [Pa]
        'stl1',   # Soil temp lev1 [K]
        'msl',    # Mean SLP [Pa]
        'u10',    # wind-u [m/s]
        'v10',
        't2m',    # 2m temp [K]
        'd2m',    # 2 metre dewpoint temperature[K]
        'al',     # Surface albedo [0-1]
        'lcc',    # Low cloud cover [0-1]
        'mcc',    # Medium cloud cover [0-1]
        'hcc',    # High cloud cover [0-1]
        'si10',   # 10m wind speed [m/s]
        'skt',    # Skin temperature [K]
    )}
finally:
    # Close the file even when a variable is missing or the date lookup fails,
    # so a failed run does not leave the handle open in the kernel.
    fh.close()

# Carry the coordinate axes and the selected day along with the fields.
vars.update(lons=lons, lats=lats, this_day=this_day)

In [16]:
# Sanity check on the container built above (the name shadows the builtin `vars`).
type(vars)

dict

### Limit the arrays to a specific region 

In [18]:
# Pull the selected day and the coordinate axes back out of the `vars` dict.
this_day, lons, lats = (vars[key] for key in ('this_day', 'lons', 'lats'))

In [None]:
# Stack the per-variable arrays into one array; the position of a name in
# the tuple below is its index along the first axis of DAT.
# NOTE(review): if the inputs are masked arrays, np.array presumably drops
# their masks - confirm this is intended.
DAT = np.array([vars[field] for field in (
    'ci',     # 0
    'sst',    # 1
    'istl1',  # 2
    'sp',     # 3
    'stl1',   # 4
    'msl',    # 5
    'u10',    # 6
    'v10',    # 7
    't2m',    # 8
    'd2m',    # 9
    'al',     # 10
    'lcc',    # 11
    'mcc',    # 12
    'hcc',    # 13
    'si10',   # 14
    'skt',    # 15
)])

# Shift the grid so lons go from -180 to 180 instead of 0 to 360.
DAT_shift, lons_shift = shiftgrid(
    lon0=180., datain=DAT, lonsin=lons, start=False)
# 2-D coordinate grids matching the shifted longitudes.
lon, lat = np.meshgrid(lons_shift, lats)
# Midnight datetime for this_day, built directly from its fields rather than
# by formatting a string and parsing it back (same value, no "%Y" parsing
# limits on the year).
this_dayhh = datetime(this_day.year, this_day.month, this_day.day)
this_year = this_dayhh.year