<a name="top"></a>
<div style="width:1000px">

<div style="float:right; width:98px; height:98px;">
<img src="https://cdn.miami.edu/_assets-common/images/system/um-logo-gray-bg.png" alt="Miami Logo" style="height: 98px;">
</div>

<h1>Lunch Byte 4/19/2019</h1>
By Kayla Besong
    
<br>
<br>
<br>Introduction to Xarray and Dask: loading and processing data from NOAA for use in ProcessData_XR.ipynb
<br>Use this notebook for comparison with GettingData_XR.ipynb



<div style="clear:both"></div>
</div>

<hr style="height:2px;">

In [1]:
import xarray as xr
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import netCDF4 as nc
from mpl_toolkits.basemap import Basemap
from datetime import datetime
from dask.diagnostics import ProgressBar


In [2]:
%matplotlib inline

from dask.distributed import Client
import xarray as xr

### Let's Import Some Data through NOAA 

In [3]:
%%time

# Build one remote file URL per year for the height (hgt) and temperature (air) files.
date_range = np.arange(1995, 2001, 1)   # np.arange excludes its stop value, so this yields the years 1995 through 2000

# '%s' marks the spot where each year from date_range is substituted into the URL
hgt_template = 'https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/hgt.%s.nc'
air_template = 'https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/air.%s.nc'

heights = [hgt_template % year for year in date_range]   # one height URL per year
temps = [air_template % year for year in date_range]     # one temperature URL per year

# Echo each pair so the generated URLs can be checked by eye
for url_h, url_t in zip(heights, temps):
    print(url_h, url_t)

('https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/hgt.1995.nc', 'https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/air.1995.nc')
('https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/hgt.1996.nc', 'https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/air.1996.nc')
('https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/hgt.1997.nc', 'https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/air.1997.nc')
('https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/hgt.1998.nc', 'https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/air.1998.nc')
('https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/hgt.1999.nc', 'https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/ncep.reanalysis2/pressure/air.1999.nc')
('https://www.esrl.noaa.gov/psd/thredds/dodsC/Datasets/

### Turn the list of URLs into one large, combined (concatenated) dataset along the time dimension

In [4]:
%%time 
# Open every yearly height file and combine them into a single dataset;
# lat, lon and level are aligned across files and the years are joined along the time dimension.
concat_h = xr.open_mfdataset(paths=heights)


  stack_char_dim=stack_char_dim)


CPU times: user 226 ms, sys: 22.9 ms, total: 249 ms
Wall time: 2.98 s


In [5]:
%%time 
# Open every yearly temperature file and combine them into a single dataset,
# joined along the time dimension.
concat_t = xr.open_mfdataset(paths=temps)

  stack_char_dim=stack_char_dim)


CPU times: user 221 ms, sys: 20 ms, total: 241 ms
Wall time: 2.94 s


#### Take a peek to ensure everything was read successfully and to understand the dataset that you have

In [6]:
# Display both datasets (as a tuple) to check their dimensions, coordinates and data variables.
concat_h, concat_t


(<xarray.Dataset>
 Dimensions:  (lat: 73, level: 17, lon: 144, time: 8768)
 Coordinates:
   * level    (level) float32 1000.0 925.0 850.0 700.0 ... 50.0 30.0 20.0 10.0
   * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... -82.5 -85.0 -87.5 -90.0
   * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
   * time     (time) datetime64[ns] 1995-01-01 ... 2000-12-31T18:00:00
 Data variables:
     hgt      (time, level, lat, lon) float32 dask.array<shape=(8768, 17, 73, 144), chunksize=(1460, 17, 73, 144)>
 Attributes:
     Conventions:                     CF-1.0
     title:                           4x daily NCEP/DOE Reanalysis 2
     history:                         created 2002/03/15 by RHS (netCDF2.3)
     comments:                        Data is from \nNCEP/DOE AMIP-II Reanalys...
     platform:                        Model
     source:                          NCEP/DOE AMIP-II Reanalysis (Reanalysis-...
     institution:                     National Centers for E

In [7]:
%%time
# Step 1: keep only latitudes from 90 down to 0 at the single level 500.
hgt_subset = concat_h.sel(lat=slice(90, 0), level=500)
# Step 2: group the time axis into 24-hour bins and average within each bin.
# NOTE(review): concat_h is overwritten with the reduced result, so re-running this
# cell on its own output will likely fail -- re-open the data first.
concat_h = hgt_subset.resample(time='24H').mean(dim='time')

CPU times: user 10.4 s, sys: 277 ms, total: 10.7 s
Wall time: 10.5 s


In [8]:
%%time
# Step 1: keep only latitudes from 90 down to 0 at the single level 925.
air_subset = concat_t.sel(lat=slice(90, 0), level=925)
# Step 2: group the time axis into 24-hour bins and average within each bin.
# NOTE(review): concat_t is overwritten with the reduced result, so re-running this
# cell on its own output will likely fail -- re-open the data first.
concat_t = air_subset.resample(time='24H').mean(dim='time')

CPU times: user 10.7 s, sys: 163 ms, total: 10.9 s
Wall time: 10.8 s


#### Take another peek

In [9]:
# Display both reduced datasets (as a tuple) to confirm the subsetting and resampling results.
concat_h, concat_t

(<xarray.Dataset>
 Dimensions:  (lat: 37, lon: 144, time: 2192)
 Coordinates:
   * time     (time) datetime64[ns] 1995-01-01 1995-01-02 ... 2000-12-31
     level    float32 500.0
   * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... 10.0 7.5 5.0 2.5 0.0
   * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
 Data variables:
     hgt      (time, lat, lon) float32 dask.array<shape=(2192, 37, 144), chunksize=(1, 37, 144)>,
 <xarray.Dataset>
 Dimensions:  (lat: 37, lon: 144, time: 2192)
 Coordinates:
   * time     (time) datetime64[ns] 1995-01-01 1995-01-02 ... 2000-12-31
     level    float32 925.0
   * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... 10.0 7.5 5.0 2.5 0.0
   * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
 Data variables:
     air      (time, lat, lon) float32 dask.array<shape=(2192, 37, 144), chunksize=(1, 37, 144)>)

#### Write out data for processing 

In [10]:
%%time
# Save the reduced height dataset to a local netCDF file in the working directory.
concat_h.to_netcdf(path='heights_9520.nc')

CPU times: user 8.61 s, sys: 3.7 s, total: 12.3 s
Wall time: 2min 47s


In [11]:
%%time
# Save the reduced temperature dataset to a local netCDF file in the working directory.
concat_t.to_netcdf(path='temps_9520.nc')

CPU times: user 8.79 s, sys: 3.74 s, total: 12.5 s
Wall time: 2min 42s
