This notebook describes how I assigned each NOMAD observation to a Longhurst Region. Each region belongs to one of four biomes (Coastal, Polar, Trades, Westerlies), so the data also receive a biome assignment in the process. The goal is to facilitate the development of hierarchical models.

For reference, here is the Longhurst map. The province numbers shown on the map are offset by +1 relative to the 0-based `ProvCat` codes in the province table (`dl`) displayed further down.
<img src="./figJar/lhurstbiogeog.jpg">

In [6]:
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon

In [2]:
# Load the NOMAD observations frame from the project's local pickle directory.
# NOTE(review): pickles can execute code when loaded; only read files produced by this project.
d_aop = pd.read_pickle('./pickleJar/dfNomad_AOP.pkl')

In [4]:
# Load the Longhurst province table: one row per province with its biome,
# province code/category and boundary coordinates (see dl.head() below).
dl = pd.read_pickle('./pickleJar/d_longhurst.pkl')

In [4]:
# Transposed preview of the first five observations (one column per observation).
d_aop.head().T

Unnamed: 0,0,1,2,3,4
id,1565,1566,1567,1568,1559
datetime,2003-04-15 15:15:00,2003-04-15 16:50:00,2003-04-15 17:50:00,2003-04-17 18:15:00,2003-07-21 18:27:00
doy,105.875,106.5,106.542,108,203.2
lat,38.4279,38.368,38.3074,38.6367,38.3047
lon,-76.61,-76.5,-76.44,-76.32,-76.44
etopo2,0,0,1,3,1
bathymetry,-3.044,-10.44,-8.956,-10.008,-9.118
oisst,3.7,3.7,3.7,3.7,22.03
wtemp,,,,,
woa_sst_mean,,,,,


In [5]:
# Preview the first rows of the Longhurst province table.
dl.head()

Unnamed: 0,Biome,BiomCat,Province,ProvCode,ProvCat,coords
0,Coastal,0,Alaska Downwelling Coastal Province,ALSK,0,"[(-127.4999999999999, 50.58680189157428), (-12..."
1,Coastal,0,Australia-Indonesia Coastal Province,AUSW,1,"[(125.66080820729292, -14.61166092766527), (12..."
2,Coastal,0,Benguela Current Coastal Province,BENG,2,"[(11.716972846451625, -17.49999999999993), (11..."
3,Coastal,0,Brazil Current Coastal Province,BRAZ,3,"[(-33.49999999999994, -10.499999999999986), (-..."
4,Coastal,0,California Upwelling Coastal Province,CCAL,4,"[(-110.4999999999999, 24.500000000000142), (-1..."


In [6]:
# Column dtypes and non-null counts of the observations frame as loaded.
d_aop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 24 columns):
id              4459 non-null object
datetime        4459 non-null datetime64[ns]
doy             4459 non-null float64
lat             4459 non-null float64
lon             4459 non-null float64
etopo2          4459 non-null float64
bathymetry      4459 non-null float64
oisst           4459 non-null float64
wtemp           1235 non-null float64
woa_sst_mean    2516 non-null float64
woa_sst_se      2516 non-null float64
sal             672 non-null float64
woa_sal_mean    2511 non-null float64
woa_sal_se      2511 non-null float64
rrs411          4293 non-null float64
rrs443          4456 non-null float64
rrs489          4422 non-null float64
rrs510          4207 non-null float64
rrs555          4453 non-null float64
rrs670          3905 non-null float64
fluo_chl        3392 non-null float64
hplc_chl        1381 non-null float64
is_hplc         4127 non-null float64
chl           

In [7]:
# Column dtypes and non-null counts of the province table.
dl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 6 columns):
Biome       54 non-null object
BiomCat     54 non-null int8
Province    54 non-null object
ProvCode    54 non-null object
ProvCat     54 non-null int64
coords      54 non-null object
dtypes: int64(1), int8(1), object(4)
memory usage: 2.2+ KB


In [8]:
# Display the full province table (54 rows according to dl.info()).
dl

Unnamed: 0,Biome,BiomCat,Province,ProvCode,ProvCat,coords
0,Coastal,0,Alaska Downwelling Coastal Province,ALSK,0,"[(-127.4999999999999, 50.58680189157428), (-12..."
1,Coastal,0,Australia-Indonesia Coastal Province,AUSW,1,"[(125.66080820729292, -14.61166092766527), (12..."
2,Coastal,0,Benguela Current Coastal Province,BENG,2,"[(11.716972846451625, -17.49999999999993), (11..."
3,Coastal,0,Brazil Current Coastal Province,BRAZ,3,"[(-33.49999999999994, -10.499999999999986), (-..."
4,Coastal,0,California Upwelling Coastal Province,CCAL,4,"[(-110.4999999999999, 24.500000000000142), (-1..."
5,Coastal,0,Canary Coastal Province (Eacb),CNRY,5,"[(-16.339471208351284, 11.50000000000017), (-1..."
6,Coastal,0,Central American Coastal Province,CAMR,6,"[(-80.44239450415253, -0.4999999999998437), (-..."
7,Coastal,0,Chile-Peru Current Coastal Province,CHIL,7,"[(-80.49999999999991, -0.37113815817150453), (..."
8,Coastal,0,China Sea Coastal Province,CHIN,8,"[(128.50000000000017, 35.12007487044124), (128..."
9,Coastal,0,E. Africa Coastal Province,EAFR,9,"[(48.50315282529476, -13.499999999999929), (48..."


In [7]:
def is_in_polygon(row, polygons=None):
    """Return the Longhurst ``ProvCat`` for one observation.

    The observation's (lon, lat) point is tested against the province
    polygons in order; the first polygon the point lies within wins. If the
    point is within none of them, the ``ProvCat`` of the polygon at the
    smallest distance from the point is returned instead.

    Parameters
    ----------
    row : object with ``lon`` and ``lat`` attributes
        Typically a row handed over by ``DataFrame.apply(..., axis=1)``.
    polygons : iterable of (ProvCat, shapely Polygon) pairs, optional
        Pre-built province polygons. Building them once as a list and
        passing them in, e.g.
        ``d_aop.apply(is_in_polygon, axis=1, polygons=province_polygons)``,
        avoids re-creating the polygons for every row. It must be
        re-iterable (a list, not a generator) when reused across rows.
        When omitted, polygons are built on the fly from the global ``dl``
        table (columns ``ProvCat`` and ``coords``), as before.

    Returns
    -------
    The ``ProvCat`` value of the containing, or else nearest, province.

    Raises
    ------
    ValueError
        If there are no province polygons to test against.
    """
    if polygons is None:
        # Generator: polygons are only constructed as far as the search gets.
        polygons = ((pcat, Polygon(coords))
                    for pcat, coords in zip(dl.ProvCat, dl.coords))
    pnt = Point(row.lon, row.lat)
    dist_dict = {}
    for pcat, poly in polygons:
        if pnt.within(poly):
            return pcat
        # Remember how far away this province is in case nothing contains the point.
        dist_dict[pcat] = poly.distance(pnt)
    if not dist_dict:
        raise ValueError('no province polygons to test against')
    return min(dist_dict, key=dist_dict.get)

In [40]:
# run province number assignment based on lat & lon for each row
# 'ProvCat' may already be present: the pickle d_aop is loaded from is written
# back further down with this column included, and re-running this cell on the
# same frame has the same effect. Drop it first so the insert below does not
# fail on an existing column and the cell can be run repeatedly.
if 'ProvCat' in d_aop.columns:
    d_aop = d_aop.drop('ProvCat', axis=1)
d_aop.insert(5, 'ProvCat', d_aop.apply(is_in_polygon, axis=1))
d_aop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Data columns (total 25 columns):
id              4459 non-null object
datetime        4459 non-null datetime64[ns]
doy             4459 non-null float64
lat             4459 non-null float64
lon             4459 non-null float64
ProvCat         4459 non-null int64
etopo2          4459 non-null float64
bathymetry      4459 non-null float64
oisst           4459 non-null float64
wtemp           1235 non-null float64
woa_sst_mean    2516 non-null float64
woa_sst_se      2516 non-null float64
sal             672 non-null float64
woa_sal_mean    2511 non-null float64
woa_sal_se      2511 non-null float64
rrs411          4293 non-null float64
rrs443          4456 non-null float64
rrs489          4422 non-null float64
rrs510          4207 non-null float64
rrs555          4453 non-null float64
rrs670          3905 non-null float64
fluo_chl        3392 non-null float64
hplc_chl        1381 non-null float64
is_hplc         

In [101]:
# NOTE(review): this writes to the same path d_aop was read from at the top of
# the notebook, so the stored file now includes the ProvCat column. A fresh
# run that reloads it will try to insert a column that already exists unless
# the assignment cell guards against that.
d_aop.to_pickle('./pickleJar/dfNomad_AOP.pkl')

In [49]:
# Spot-check the province assignment on the first 20 rows ordered by id.
# NOTE: id has dtype object, and the output shows the ordering is lexicographic
# rather than numeric (10, 1002, 1003, ...).
d_aop.sort_values('id').head(20)

Unnamed: 0,id,datetime,doy,lat,lon,ProvCat,etopo2,bathymetry,oisst,wtemp,...,rrs411,rrs443,rrs489,rrs510,rrs555,rrs670,fluo_chl,hplc_chl,is_hplc,chl
414,10,1997-02-07 16:59:00,39.65,31.613,-64.221,45,4405.0,-4368.080078,20.6,,...,0.007387,0.006822,0.005316,0.00219,0.000916,2.8e-05,0.059,,0.0,0.059
918,1002,1997-02-02 17:42:00,34.408333,31.082,-122.663,4,3976.0,-4146.02002,15.86,15.71,...,0.00555,0.005493,0.004758,0.003211,0.001549,6.5e-05,0.14,,0.0,0.14
919,1003,1997-02-02 23:47:00,34.741667,31.397,-121.983,4,3900.0,-4119.060059,15.68,15.76,...,0.006443,0.006276,0.005318,0.003529,0.001769,6.1e-05,0.14,,0.0,0.14
920,1005,1997-02-04 17:39:00,36.358333,33.182,-118.393,4,1202.0,-1182.97998,14.9,15.34,...,0.00489,0.004952,0.004655,0.003516,0.001905,6.9e-05,0.286,,0.0,0.286
921,1006,1997-02-05 17:58:00,37.675,33.658,-118.977,4,701.0,-796.0,14.9,14.33,...,0.003867,0.003997,0.004283,0.003748,0.00252,0.00013,0.55,,0.0,0.55
922,1007,1997-02-05 22:25:00,37.333333,33.488,-119.318,4,1422.0,-1649.880005,14.45,14.62,...,0.003489,0.003442,0.003549,0.003082,0.002029,9.6e-05,1.327,,0.0,1.327
923,1008,1997-02-06 17:54:00,38.608333,32.642,-121.035,4,3718.0,-3826.659912,14.99,14.41,...,0.006051,0.005803,0.005418,0.004213,0.002364,0.000257,0.346,,0.0,0.346
924,1011,1997-02-09 17:47:00,41.491667,33.752,-120.405,4,1263.0,-1096.400024,13.94,13.03,...,0.002605,0.00255,0.002712,0.002479,0.001949,0.000161,2.282,,0.0,2.282
925,1012,1997-02-10 19:12:00,41.991667,34.318,-120.8,4,760.0,-762.919983,13.52,13.25,...,0.003015,0.003085,0.003104,0.002541,0.001563,7.5e-05,0.75,,0.0,0.75
926,1013,1997-02-10 22:40:00,42.583333,34.15,-121.152,4,2443.0,-2198.840088,13.42,13.58,...,0.004205,0.004141,0.004497,0.004122,0.003026,0.000149,0.974,,0.0,0.974


In [42]:
# Attach biome/province labels to every observation via its ProvCat.
# Note: dl is the left frame of an inner merge, so the result follows dl's key
# order rather than d_aop's original row order.
dl_cols_to_xtract = ['Biome', 'BiomCat', 'Province', 'ProvCode', 'ProvCat']
merge_inner = pd.merge(left=dl[dl_cols_to_xtract], right=d_aop, on='ProvCat')
# An inner join silently drops observations whose ProvCat has no match in dl
# and duplicates them if a ProvCat occurs more than once in dl; fail loudly if
# the number of observations changed.
assert len(merge_inner) == len(d_aop), (
    'merge changed the number of observations: %d -> %d'
    % (len(d_aop), len(merge_inner)))

In [50]:
# Check the merged frame: row count plus the label columns contributed by dl.
merge_inner.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4459 entries, 0 to 4458
Data columns (total 29 columns):
Biome           4459 non-null object
BiomCat         4459 non-null int8
Province        4459 non-null object
ProvCode        4459 non-null object
ProvCat         4459 non-null int64
id              4459 non-null object
datetime        4459 non-null datetime64[ns]
doy             4459 non-null float64
lat             4459 non-null float64
lon             4459 non-null float64
etopo2          4459 non-null float64
bathymetry      4459 non-null float64
oisst           4459 non-null float64
wtemp           1235 non-null float64
woa_sst_mean    2516 non-null float64
woa_sst_se      2516 non-null float64
sal             672 non-null float64
woa_sal_mean    2511 non-null float64
woa_sal_se      2511 non-null float64
rrs411          4293 non-null float64
rrs443          4456 non-null float64
rrs489          4422 non-null float64
rrs510          4207 non-null float64
rrs555          4453 n

Organize merged dataframe

In [52]:
# Select and reorder the merged columns, grouped as: identification and
# position, Longhurst labels, ancillary fields, rrs bands, chlorophyll.
d_ = merge_inner[
    ['id', 'datetime', 'doy', 'lat', 'lon']
    + ['Province', 'ProvCode', 'ProvCat', 'Biome', 'BiomCat']
    + ['etopo2', 'oisst', 'woa_sst_mean', 'woa_sst_se', 'woa_sal_mean', 'woa_sal_se']
    + ['rrs411', 'rrs443', 'rrs489', 'rrs510', 'rrs555', 'rrs670']
    + ['is_hplc', 'chl']
]

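Compute the "max blue" for each observation, i.e. the largest of the blue-band reflectances, in two variants: over three bands (`rrs443`, `rrs489`, `rrs510`) and over four bands (adding `rrs411`). For each variant, store the maximum value, the name of the band that supplied it, and an integer code for that band.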

In [59]:
# The six new columns go directly after 'rrs670', in this final order:
# mxBl, mxBlId, mxBlCat (three blue bands), mxBl2, mxBlId2, mxBlCat2 (four).
insert_loc = d_.columns.get_loc('rrs670') + 1

blues3 = d_[['rrs443', 'rrs489', 'rrs510']]
blues4 = d_[['rrs411', 'rrs443', 'rrs489', 'rrs510']]
# Name of the band holding the row-wise maximum, for each variant.
max_band3 = blues3.idxmax(axis=1)
max_band4 = blues4.idxmax(axis=1)

new_columns = [
    ('mxBl', blues3.max(axis=1)),
    ('mxBlId', max_band3),
    ('mxBlCat', max_band3.astype('category').cat.codes),
    ('mxBl2', blues4.max(axis=1)),
    ('mxBlId2', max_band4),
    ('mxBlCat2', max_band4.astype('category').cat.codes),
]
for offset, (col_name, col_values) in enumerate(new_columns):
    d_.insert(insert_loc + offset, col_name, col_values)

In [60]:
# Verify that the six max-blue columns were added after rrs670.
d_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4459 entries, 0 to 4458
Data columns (total 30 columns):
id              4459 non-null object
datetime        4459 non-null datetime64[ns]
doy             4459 non-null float64
lat             4459 non-null float64
lon             4459 non-null float64
Province        4459 non-null object
ProvCode        4459 non-null object
ProvCat         4459 non-null int64
Biome           4459 non-null object
BiomCat         4459 non-null int8
etopo2          4459 non-null float64
oisst           4459 non-null float64
woa_sst_mean    2516 non-null float64
woa_sst_se      2516 non-null float64
woa_sal_mean    2511 non-null float64
woa_sal_se      2511 non-null float64
rrs411          4293 non-null float64
rrs443          4456 non-null float64
rrs489          4422 non-null float64
rrs510          4207 non-null float64
rrs555          4453 non-null float64
rrs670          3905 non-null float64
mxBl            4459 non-null float64
mxBlId          4459 

In [61]:
# Save the frame with Longhurst province/biome labels and max-blue columns.
d_.to_pickle('./pickleJar/d_nomad_aop_with_longhurst.pkl')

The next step is to log-transform the data where appropriate.
Note that I do not log-transform temperature, since the data include sub-zero entries. I therefore leave it as is, with the intention of standardizing or binning it later. Note also that, according to Gelman (2008)$^*$, one might consider standardizing even after log transformation, where deemed appropriate.

In [124]:
# Empty frame that fixes the column order of the log-transformed data set;
# the columns themselves are filled in afterwards.
d_l = pd.DataFrame(columns=(
    ['id', 'datetime', 'lat', 'lon']
    + ['etopo2_l', 'oisst', 'sal_l']
    + ['rrs{}_l'.format(band) for band in (411, 443, 489, 510, 555, 670)]
    + ['mxBl_l', 'mxBlId', 'mxBlCat']
    + ['mxBl2_l', 'mxBl2Id', 'mxBl2Cat']
    + ['provCode', 'provCat', 'biome', 'biomCat']
    + ['is_hplc', 'chl_l']
))

In [125]:
# Fill d_l from d_. Columns are set with d_l[...] rather than attribute syntax
# (d_l.name = ...): attribute assignment only sets a column when the name
# already exists as one; otherwise it creates a plain attribute on the
# DataFrame object and no column.

# Columns copied unchanged: (name in d_l, name in d_).
for new_name, old_name in [('id', 'id'), ('datetime', 'datetime'),
                           ('lat', 'lat'), ('lon', 'lon'),
                           ('oisst', 'oisst'),  # temperature is not log-transformed
                           ('mxBlId', 'mxBlId'), ('mxBlCat', 'mxBlCat'),
                           ('mxBl2Id', 'mxBlId2'), ('mxBl2Cat', 'mxBlCat2'),
                           ('provCode', 'ProvCode'), ('provCat', 'ProvCat'),
                           ('biome', 'Biome'), ('biomCat', 'BiomCat'),
                           ('is_hplc', 'is_hplc')]:
    d_l[new_name] = d_[old_name]

# log10 after adding a small offset, which keeps the result finite where the
# value is exactly 0 (etopo2 contains zeros).
for new_name, old_name in [('etopo2_l', 'etopo2'), ('sal_l', 'woa_sal_mean')]:
    d_l[new_name] = np.log10(d_[old_name] + 1e-6)

# Plain log10; the result is stored under the source name plus '_l'.
# NOTE: log10 yields NaN for negative inputs, which is why rrs670_l ends up
# with fewer non-null entries than rrs670.
for name in ['rrs411', 'rrs443', 'rrs489', 'rrs510', 'rrs555', 'rrs670',
             'mxBl', 'mxBl2', 'chl']:
    d_l[name + '_l'] = np.log10(d_[name])

In [126]:
# Categorical label '<biome>-<mxBlId>' for every observation, built with
# vectorised string concatenation instead of a Python-level call per row.
# It is inserted at the position of the column just before 'chl_l'.
d_l.insert(d_l.columns.get_loc('chl_l')-1, 'blueBiome',
           (d_l['biome'] + '-' + d_l['mxBlId']).astype('category'))

In [127]:
# Categorical label '<provCode>-<mxBlId>' for every observation, built with
# vectorised string concatenation instead of a Python-level call per row.
# It is inserted immediately before 'blueBiome'.
d_l.insert(d_l.columns.get_loc('blueBiome'), 'blueProvince',
           (d_l['provCode'] + '-' + d_l['mxBlId']).astype('category'))

In [128]:
# Integer code of each blueBiome category, placed right after 'blueBiome'.
d_l.insert(
    loc=d_l.columns.get_loc('blueBiome') + 1,
    column='blBiomCat',
    value=d_l['blueBiome'].cat.codes,
)

In [129]:
# Integer code of each blueProvince category, placed right after 'blueProvince'.
d_l.insert(
    loc=d_l.columns.get_loc('blueProvince') + 1,
    column='blProvCat',
    value=d_l['blueProvince'].astype('category').cat.codes,
)

In [130]:
# Inspect the log-transformed frame, including the blueProvince/blueBiome label and code columns.
d_l.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4459 entries, 0 to 4458
Data columns (total 29 columns):
id              4459 non-null object
datetime        4459 non-null datetime64[ns]
lat             4459 non-null float64
lon             4459 non-null float64
etopo2_l        4459 non-null float64
oisst           4459 non-null float64
sal_l           2511 non-null float64
rrs411_l        4293 non-null float64
rrs443_l        4456 non-null float64
rrs489_l        4422 non-null float64
rrs510_l        4207 non-null float64
rrs555_l        4453 non-null float64
rrs670_l        3822 non-null float64
mxBl_l          4459 non-null float64
mxBlId          4459 non-null object
mxBlCat         4459 non-null int8
mxBl2_l         4459 non-null float64
mxBl2Id         4459 non-null object
mxBl2Cat        4459 non-null int8
provCode        4459 non-null object
provCat         4459 non-null int64
biome           4459 non-null object
biomCat         4459 non-null int8
blueProvince    4459 non-nul

In [None]:
# Keep the original Longhurst province number as 'provNum' and make 'provCat'
# the category code of provCode instead (codes run over the province codes
# that actually occur in d_l). 'provCat' sits right after 'provNum'.
# Guarded so that re-running the cell does not rename the new 'provCat' again
# and end up with two 'provNum' columns.
if 'provNum' not in d_l.columns:
    d_l = d_l.rename(columns={'provCat': 'provNum'})
    d_l.insert(d_l.columns.get_loc('provNum') + 1, 'provCat',
               d_l['provCode'].astype('category').cat.codes)

In [131]:
# Save the log-transformed frame.
# NOTE(review): the provNum/provCat cell just above has no execution count in
# the saved notebook; confirm whether the stored pickle is meant to include
# those columns.
d_l.to_pickle('./pickleJar/df_log.pkl')

$^*$Gelman, A. 2008. Scaling regression inputs by dividing by two standard deviations.  Statistics in Medicine 27:2865-73.