In [11]:
# Python built-in library imports
import os
from collections import OrderedDict

# pip installed library imports
import h5py
import matplotlib.cm as cm
import matplotlib.colors
import matplotlib.pyplot as plt
import netCDF4
import numpy as np
import pandas as pd
import scipy as sci

import mpl_toolkits
from mpl_toolkits.basemap import Basemap

### Utility tools

In [28]:
utils = {
    'map_title': 'Monthly Averaged Methane Column Mixing Ratio (20{})',
    'data_dir': './data/SWIRL3CH4/all_data/SWIRL3CH4/',  
    'data_loc': '/Users/mlimb/sio/methane/gosat/data/SWIRL3CH4/all_data/SWIRL3CH4/{}',
}


### Intialize `aliso_trend` dictionary

Notes

- OrderedDict has year [int] as keys from 2009 to 2017
- aliso_trend[year] has an list of 12 elements
- each element is initialized to null value of **-1** 

#### **<span style="color:orange; background-color: navy">Question</span>**

- Is **-1** a good choice of null value?

In [303]:
all_years = range(2009, 2017+1)

# initialize OrderedDict with year [int] mapping to empty list []
# using -1 as init
aliso_trend = OrderedDict((year,[-1]*12) for year in all_years)

# manual chek
# for k,v in aliso_trend.items():
#     print k # should year year [int] 2009 to 2017
#     print v # should be []

In [29]:
data_dir = utils['data_dir']
data_loc = utils['data_loc']
len(os.listdir(data_dir))

89

In [24]:
files = [f for f in os.listdir(data_dir) if f.endswith('.h5')]
len(files)

89

In [344]:
# dictionary that maps month number to names (Long version, 3-letter abbreviated version,
# month number as int).
# all in Strings
month_names = {
    '01': ('January', 'JAN', 1),
    '02': ('February', 'FEB', 2),
    '03': ('March', 'MAR', 3),
    '04': ('April', 'APR', 4),
    '05': ('May', 'MAY', 5),
    '06': ('June', 'JUN', 6),
    '07': ('July', 'JUL', 7),
    '08': ('August', 'AUG', 8),
    '09': ('September', 'SEP', 9),
    '10': ('October', 'OCT', 10),
    '11': ('November', 'NOV', 11),
    '12': ('December', 'DEC', 12)
}



I manually confirmed all filenames.

**All available data**<br />
2009 June to 2017 February EXCEPT:

Known missing data<br />
- 2014 June and December 
- 2015 January and September 

Q. Why is there no data after 2017 FEB?

## Write out what to do

I have a list of files that have global coverage, 
one monthly average per file

From each of the 89 matrix, I only need a single value,
the one nearest to the Aliso Canyon Oil Field

    this is at: 
        [lon, lat] = -118.75, 33.75

    which is same as location as:
        ???
    
Given:
- 2.5 degrees lat/lon spatial resolution
- starting values hidden in the h5 file

`get_aliso_value` (function): given the point, get value at that exact point for all years

<hr >
**Tasks:**

1) Open files **<span style="color:black; background-color: yellow">DONE</span>**

- Loop through all files, open all, save netcdf of all **<span style="color:black; background-color: yellow">DONE</span>**
- Mask the files **<span style="color:black; background-color: yellow">DONE</span>**

2) Determine the location for the Aliso lon/lat combo **<span style="color:red; background-color: black">Don't Panic</span>**

- What is the lon/lat value nearest to the Aliso Canyon Oil Field? **<span style="color:black; background-color: yellow">DONE</span>**

        34.3122, -118.5645 (from USGS data)

- What is the matrix [x,y] of the [lon, lat]? **<span style="color:black; background-color: yellow">DONE</span>**

        Latitude: lats[22][0] (33.75)
        Longitude: lons[0][24] (-118.75)


- Write a function (lambda) that retrieves the value at [x,y] for each matrix **<span style="color:black; background-color: lightgreen">In Progress</span>**

3) Write `get_aliso_value` function: 

- Add 1 value to aliso_trend as looping over** 
**<span style="color:black; background-color: lightgreen">In Progress</span>**


Either 
- given the point, get value at that exact point for all years
- OR given the global monthly data and one point, return the methane value at that point

4) Create an OrderedDictionary of years to store all data per month by year

5) Manual confirmation that point is accurate**

missing value (invalid) is `-9999.99`
```
aliso_trend = 
{
    2009: [] # 12 values per yer
    2010: [] 
    2011: []
    2012
    2013
    2014
    2015
    2016
    2017: []
}
```

Need to be 100% correct:
- correct matrix location for lon/lat pair

Status Indicators:
**<span style="color:black; background-color: yellow">DONE</span>**
**<span style="color:black; background-color: lightgreen">In Progress</span>**
**<span style="color:red; background-color: black">Don't Panic</span>**

### Function to retrieve month-year from file name

In [246]:
test = files[0]
year_month = test[9:15]

month = test[13:15]
year = test[9:13]
print month, year

get_year_month = lambda fname: fname[9:15]
get_year_month_tup = lambda year_month_str: (int(year_month_str[0:  4]), month_names.get(year_month_str[4:]))

int(get_year_month(test))

06 2009


200906

In [32]:
# check file string names
files[1:3]

['GOSATTFTS2009070120090731_03C02SV0221.h5',
 'GOSATTFTS2009080120090831_03C02SV0221.h5']

### Store h5 data as netCDF in `ncs` OrderedDict

string year-month (ex. `200906`) is used a key

### **<span style="color:yellow; background-color: darkblue">Added empty string for 4 missing months here</span>**

In [269]:
# initialize empty OrderedDict
ncs = OrderedDict()

for fname in files:
    year_month = get_year_month(fname)
    ncs[year_month] = netCDF4.Dataset(data_loc.format(fname))
    # manual insertion of missing data as empty arrays (Better here than later due to use
    # or OrderedDicts that doesn't allow inserting element at specified locations)
    if year_month == '201405':
        ncs['201406'] = []
    if year_month == '201411': #consecutive
        ncs['201412'] = []
        ncs['201501'] = []
    if year_month == '201508':
        ncs['201509'] = []

### Check that 4 missing date-month pairs were corrected

In [266]:
print (len(ncs.keys()) - len(files)) is 4

True


In [268]:
# ncs.keys() # manual check

In [55]:
# check keys output
# ncs.keys()

In [56]:
ncs['200910']

<type 'netCDF4._netCDF4.Dataset'>
root group (NETCDF4 data model, file format HDF5):
    dimensions(sizes): 
    variables(dimensions): 
    groups: Attribute, Data, Global

In [353]:
def get_ch4_masked_data(nc):
    """Return the *masked* monthly methane average column mixing ratio values
    
    *This is what should be used to generate figures as invalid values will be masked
    and automatically NOT Plotted
    """
    if nc == []:
        return []
    ch4 = nc.groups['Data']['mixingRatio']['XCH4']
    masked_data = np.ma.masked_equal(ch4, value=ch4.invalidValue)
#   print masked_data
    return masked_data

In [270]:
# ncs

In [354]:
# init emtpy ordered dicts
data = OrderedDict()

def make_data_dict(_ncs, _data=OrderedDict()):  
    """Populate the given OrderedDict with given hdf5 data
    Note: If no OrderedDict is given, create an empty one
    
    Result:
        OrderedDict will have these keys: `masked_data` and `flat_data`
        
    * Currently, *masked_data* is used to generate all the maps
    * flat_data is 1D version of 2D arrays and is used to generate histograms
    """
    for key, _nc in _ncs.items():
        _data[key] = {}
        _data[key]['masked_data'] = get_ch4_masked_data(_nc) # used to make maps
#         _data[key]['flat_data'] = _data[key]['data'].flatten() # used to make histograms
    #return _data # commenting this line prevents output

In [355]:
make_data_dict(ncs, data)

In [69]:
# a = data['200906']['masked_data']
# np.amax(a.flatten())

1.8196319

## don't forget!

data is at `data['200906']['masked_data']`

or more generally `data['yyyymm']['masked_data']

In [70]:
len(data)

89

### GET lat and lon matrices (in 'geolocation') 

In [76]:
geo = ncs['200910']['Data']['geolocation']
geo.variables.keys()
lat =geo.variables.get('latitude')
lon =geo.variables.get('longitude')
lat.shape

(72, 144)

#### Searching for numLatitude and intervalLengthOfLatitude (+ longitude equivalents) in nc

In [100]:
nc = ncs['200910']
ncattrs = nc.groups.get('Attribute')
ncattrs.variables.keys()

[u'intervalLengthOfLatitude',
 u'intevalLengthOfLongitude',
 u'numLatitude',
 u'numLongitude']

In [122]:
aa = [v for k, v in ncattrs.variables.iteritems()]
v = aa[0]
v[0]

aa[3][0]

144

In [138]:
ll = nc.groups.get('Data')['geolocation']['latitude']
ll.ncattrs()

[u'unit', u'validRange', u'longName', u'invalidValue']

#### Covert lats and lons to numpy arrays (right now netCDF.Variable)

In [79]:
lats = np.array(lat)
lons = np.array(lon)

In [140]:
lats

array([[ 88.75,  88.75,  88.75, ...,  88.75,  88.75,  88.75],
       [ 86.25,  86.25,  86.25, ...,  86.25,  86.25,  86.25],
       [ 83.75,  83.75,  83.75, ...,  83.75,  83.75,  83.75],
       ..., 
       [-83.75, -83.75, -83.75, ..., -83.75, -83.75, -83.75],
       [-86.25, -86.25, -86.25, ..., -86.25, -86.25, -86.25],
       [-88.75, -88.75, -88.75, ..., -88.75, -88.75, -88.75]], dtype=float32)

In [141]:
lons

array([[-178.75, -176.25, -173.75, ...,  173.75,  176.25,  178.75],
       [-178.75, -176.25, -173.75, ...,  173.75,  176.25,  178.75],
       [-178.75, -176.25, -173.75, ...,  173.75,  176.25,  178.75],
       ..., 
       [-178.75, -176.25, -173.75, ...,  173.75,  176.25,  178.75],
       [-178.75, -176.25, -173.75, ...,  173.75,  176.25,  178.75],
       [-178.75, -176.25, -173.75, ...,  173.75,  176.25,  178.75]], dtype=float32)

### Find the indices for latitude closest to 34.3

In [197]:
lats[0,:] # -> 88.75 (top value) NH
lats[71,:] # -88.75 (bottom value) SH

lats[22][0] # closest
lats[21][0], lats[23][0]

lats[22][0] # choose row first (latitude) then all values in that row are equivalent, so pick first one

33.75

### Find the indices for longitude closet to -118.6

In [198]:
lons[:, 0][0]# -> -178.75
lons[: -1][0] 
np.where(lons[: -1][0] == -118.75)

lons[0][24] # choose any row (picked first), and all lon values should be different across
# pick the right oen located at i=24

-118.75

### Write a function (lambda to retrieve the aliso_data from each data matrix

In [379]:
def get_aliso_data(_month_data):
    # return -1 if masked_data is [] <-- these are missing months (4)
    if month_data['masked_data'] == []:
        return -1
    else:
        return np.ma.getdata(month_data['masked_data'][22][24]).item() or -1
    # or statement exists so that -1 is returned in place of masked value of 0.0

## This step of retrieving Aliso specific data needs validation***

In [376]:
d1 = data['201506']
d1['masked_data'].shape
v = d1['masked_data'][22][24]
print v
d = np.ma.getdata(v)
print 0.0 or -1
# d.item()
v

--
-1


masked

## some manual validation

In [367]:
ms = month_names.keys()

#print ms # not ordered 
m2015 = sorted(['2015' + m for m in ms]) # sorted
print m2015

for m in m2015:
    val = data.get(m)['masked_data']
    if val == []:
        print 'null'
    else:
        print val[22][24]
    
#     print data[m]['masked_data'][22][24]

['201501', '201502', '201503', '201504', '201505', '201506', '201507', '201508', '201509', '201510', '201511', '201512']
null
1.81371
1.81371
1.81363
1.81237
--
1.803
--
null
--
--
--


In [271]:
# data

In [378]:
for key, _month_data in data.iteritems():
#     aliso_trend[]
    (year, month_info) = get_year_month_tup(key)
    (m_name, m_abbr, m_num) = month_info
    print _month_data
    aliso_monthly_ch4 = get_aliso_data(_month_data)
    # aliso_trend[year] is initialized to [-1, ... -1] that has twelve -1's
    # append methane concentration
    # the index location corresponds to month #
    index = m_num - 1 # since Python is 0 indexed
    aliso_trend[year][index] = aliso_monthly_ch4

# for k,v in aliso_trend.items():
#     print v
#     print ' '
    
aliso_trend[2015]

{'masked_data': masked_array(data =
 [[-- -- -- ..., -- -- --]
 [-- -- -- ..., -- -- --]
 [-- -- -- ..., -- -- --]
 ..., 
 [-- -- -- ..., -- -- --]
 [-- -- -- ..., -- -- --]
 [-- -- -- ..., -- -- --]],
             mask =
 [[ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 ..., 
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]],
       fill_value = -9999.0)
}
{'masked_data': masked_array(data =
 [[-- -- -- ..., -- -- --]
 [-- -- -- ..., -- -- --]
 [-- -- -- ..., -- -- --]
 ..., 
 [-- -- -- ..., -- -- --]
 [-- -- -- ..., -- -- --]
 [-- -- -- ..., -- -- --]],
             mask =
 [[ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 ..., 
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...,  True  True  True]
 [ True  True  True ...

IndexError: list index out of range