In [60]:
import pandas as pd
import pathlib
from xml.dom.minidom import *

In [2]:
provinces = {}
home = pathlib.Path.home()
fp = home / 'DEV/Longhurst-Province-Finder/longhurst.xml'
tree = parse(fp)

In [3]:
for node in tree.getElementsByTagName('MarineRegions:longhurst'):

	# 1. Get province code, name and bounding box from file
	provCode = node.getElementsByTagName('MarineRegions:provcode')[0].firstChild.data
	provName = node.getElementsByTagName('MarineRegions:provdescr')[0].firstChild.data
	fid = node.getAttribute("fid")
	b = node.getElementsByTagName('gml:coordinates')[0].firstChild.data

	# 2. Parse bounding box coordinates
	b = b.split(' ')
	x1,y1 = b[0].split(',')
	x2,y2 = b[1].split(',')
	x1 = float(x1)
	y1 = float(y1)
	x2 = float(x2)
	y2 = float(y2)

	provinces[fid] = {'provName': provName, 'provCode': provCode, 'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2}

In [15]:
d = pd.DataFrame(provinces).T

In [8]:
d.head()

Unnamed: 0,provCode,provName,x1,x2,y1,y2
longhurst.1,FKLD,Coastal - SW Atlantic Shelves Province,-70.5,-51.5,-55.5,-38.5
longhurst.10,BENG,Coastal - Benguela Current Coastal Province,7.5,18.3499,-35.5,-16.5
longhurst.11,ARCH,Trades - Archipelagic Deep Basins Province,92.5,174.5,-33.5,22.5
longhurst.12,SUND,Coastal - Sunda-Arafura Shelves Province,98.1819,147.147,-17.7192,21.9356
longhurst.13,GUIN,Coastal - Guinea Current Coastal Province,-17.5,13.8536,-17.5,11.6837


In [9]:
d.tail()

Unnamed: 0,provCode,provName,x1,x2,y1,y2
longhurst.54,BPLR,Polar - Boreal Polar Province (POLR),-179.999,179.99,50.735,89.8992
longhurst.6,EAFR,Coastal - E. Africa Coastal Province,15.5,52.5,-39.5,-6.5
longhurst.7,AUSW,Coastal - Australia-Indonesia Coastal Province,94.5,146.5,-39.5,7.5
longhurst.8,AUSE,Coastal - East Australian Coastal Province,144.5,156.5,-39.5,-13.5
longhurst.9,ISSG,Trades - Indian S. Subtropical Gyre Province,30.5,117.5,-37.5,-10.5


In [10]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Index: 54 entries, longhurst.1 to longhurst.9
Data columns (total 6 columns):
provCode    54 non-null object
provName    54 non-null object
x1          54 non-null object
x2          54 non-null object
y1          54 non-null object
y2          54 non-null object
dtypes: object(6)
memory usage: 3.0+ KB


In [29]:
d.rename(columns={'x1': 'lon_min', 'x2': 'lon_max', 'y1': 'lat_min', 'y2': 'lat_max'}, inplace=True)

In [13]:
d.set_index('provCode').head()

Unnamed: 0_level_0,provName,lon_min,lon_max,lat_min,lat_max
provCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
FKLD,Coastal - SW Atlantic Shelves Province,-70.5,-51.5,-55.5,-38.5
BENG,Coastal - Benguela Current Coastal Province,7.5,18.3499,-35.5,-16.5
ARCH,Trades - Archipelagic Deep Basins Province,92.5,174.5,-33.5,22.5
SUND,Coastal - Sunda-Arafura Shelves Province,98.1819,147.147,-17.7192,21.9356
GUIN,Coastal - Guinea Current Coastal Province,-17.5,13.8536,-17.5,11.6837


In [21]:
d['provType'] = d.provName.apply(lambda x: x.split(' - ')[0])

In [25]:
d.reset_index(inplace=True)

In [61]:
d.rename(columns={'index': 'provID'}, inplace=True)

In [62]:
d.head()

Unnamed: 0_level_0,provID,provCode,provType,provCat,provName,lon_min,lon_max,lat_min,lat_max
provNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,longhurst.1,FKLD,Coastal,0,Coastal - SW Atlantic Shelves Province,-70.5,-51.5,-55.5,-38.5
2,longhurst.2,CHIL,Coastal,0,Coastal - Chile-Peru Current Coastal Province,-86.5,-69.5,-55.5,0.5
3,longhurst.3,TASM,Westerlies,3,Westerlies - Tasman Sea Province,146.5,174.975,-43.5,-33.5
4,longhurst.4,BRAZ,Coastal,0,Coastal - Brazil Current Coastal Province,-58.5497,-33.5,-41.5,-10.5
5,longhurst.5,SATL,Trades,2,Trades - South Atlantic Gyral Province (SATG),-53.5,15.5,-41.5,-2.5


In [42]:
d['provNo'] = [no.split('.')[1] for no in d.provNum.tolist()]

In [43]:
d.head()

Unnamed: 0,provNum,provCode,provName,lon_min,lon_max,lat_min,lat_max,provType,provNo
0,longhurst.1,FKLD,Coastal - SW Atlantic Shelves Province,-70.5,-51.5,-55.5,-38.5,Coastal,1
1,longhurst.10,BENG,Coastal - Benguela Current Coastal Province,7.5,18.3499,-35.5,-16.5,Coastal,10
2,longhurst.11,ARCH,Trades - Archipelagic Deep Basins Province,92.5,174.5,-33.5,22.5,Trades,11
3,longhurst.12,SUND,Coastal - Sunda-Arafura Shelves Province,98.1819,147.147,-17.7192,21.9356,Coastal,12
4,longhurst.13,GUIN,Coastal - Guinea Current Coastal Province,-17.5,13.8536,-17.5,11.6837,Coastal,13


In [44]:
d.set_index('provNo', inplace=True)

In [66]:
d.index.name = 'provNum'

In [47]:
d.index = d.index.astype('int')

In [50]:
d.sort_index(inplace=True)

In [67]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54 entries, 1 to 54
Data columns (total 9 columns):
provID      54 non-null object
provCode    54 non-null object
provType    54 non-null object
provCat     54 non-null int8
provName    54 non-null object
lon_min     54 non-null object
lon_max     54 non-null object
lat_min     54 non-null object
lat_max     54 non-null object
dtypes: int8(1), object(8)
memory usage: 6.3+ KB


In [68]:
d.head()

Unnamed: 0_level_0,provID,provCode,provType,provCat,provName,lon_min,lon_max,lat_min,lat_max
provNum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,longhurst.1,FKLD,Coastal,0,Coastal - SW Atlantic Shelves Province,-70.5,-51.5,-55.5,-38.5
2,longhurst.2,CHIL,Coastal,0,Coastal - Chile-Peru Current Coastal Province,-86.5,-69.5,-55.5,0.5
3,longhurst.3,TASM,Westerlies,3,Westerlies - Tasman Sea Province,146.5,174.975,-43.5,-33.5
4,longhurst.4,BRAZ,Coastal,0,Coastal - Brazil Current Coastal Province,-58.5497,-33.5,-41.5,-10.5
5,longhurst.5,SATL,Trades,2,Trades - South Atlantic Gyral Province (SATG),-53.5,15.5,-41.5,-2.5


In [53]:
d['provCat'] = d.provType.astype('category').cat.codes

In [57]:
d.head()

Unnamed: 0_level_0,provNum,provCode,provName,lon_min,lon_max,lat_min,lat_max,provType,provCat
provNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,longhurst.1,FKLD,Coastal - SW Atlantic Shelves Province,-70.5,-51.5,-55.5,-38.5,Coastal,0
2,longhurst.2,CHIL,Coastal - Chile-Peru Current Coastal Province,-86.5,-69.5,-55.5,0.5,Coastal,0
3,longhurst.3,TASM,Westerlies - Tasman Sea Province,146.5,174.975,-43.5,-33.5,Westerlies,3
4,longhurst.4,BRAZ,Coastal - Brazil Current Coastal Province,-58.5497,-33.5,-41.5,-10.5,Coastal,0
5,longhurst.5,SATL,Trades - South Atlantic Gyral Province (SATG),-53.5,15.5,-41.5,-2.5,Trades,2


In [70]:
d = d[['provID', 'provCode', 'provType', 'provCat', 'provName', 'lon_min', 'lon_max', 'lat_min', 'lat_max']]

In [71]:
d.head()

Unnamed: 0_level_0,provID,provCode,provType,provCat,provName,lon_min,lon_max,lat_min,lat_max
provNum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,longhurst.1,FKLD,Coastal,0,Coastal - SW Atlantic Shelves Province,-70.5,-51.5,-55.5,-38.5
2,longhurst.2,CHIL,Coastal,0,Coastal - Chile-Peru Current Coastal Province,-86.5,-69.5,-55.5,0.5
3,longhurst.3,TASM,Westerlies,3,Westerlies - Tasman Sea Province,146.5,174.975,-43.5,-33.5
4,longhurst.4,BRAZ,Coastal,0,Coastal - Brazil Current Coastal Province,-58.5497,-33.5,-41.5,-10.5
5,longhurst.5,SATL,Trades,2,Trades - South Atlantic Gyral Province (SATG),-53.5,15.5,-41.5,-2.5


In [72]:
d.lon_min = d.lon_min.astype('float')
d.lon_max = d.lon_max.astype('float')
d.lat_min = d.lat_min.astype('float')
d.lat_max = d.lat_max.astype('float')

In [73]:
d.head()

Unnamed: 0_level_0,provID,provCode,provType,provCat,provName,lon_min,lon_max,lat_min,lat_max
provNum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,longhurst.1,FKLD,Coastal,0,Coastal - SW Atlantic Shelves Province,-70.5,-51.5,-55.5,-38.5
2,longhurst.2,CHIL,Coastal,0,Coastal - Chile-Peru Current Coastal Province,-86.5,-69.5,-55.5,0.5
3,longhurst.3,TASM,Westerlies,3,Westerlies - Tasman Sea Province,146.5,174.974973,-43.5,-33.5
4,longhurst.4,BRAZ,Coastal,0,Coastal - Brazil Current Coastal Province,-58.549728,-33.5,-41.5,-10.5
5,longhurst.5,SATL,Trades,2,Trades - South Atlantic Gyral Province (SATG),-53.5,15.5,-41.5,-2.5


In [75]:
d.provType.value_counts()

Coastal       22
Westerlies    14
Trades        12
Polar          6
Name: provType, dtype: int64

In [76]:
d.provCat.value_counts()

0    22
3    14
2    12
1     6
Name: provCat, dtype: int64

In [77]:
d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54 entries, 1 to 54
Data columns (total 9 columns):
provID      54 non-null object
provCode    54 non-null object
provType    54 non-null object
provCat     54 non-null int8
provName    54 non-null object
lon_min     54 non-null float64
lon_max     54 non-null float64
lat_min     54 non-null float64
lat_max     54 non-null float64
dtypes: float64(4), int8(1), object(4)
memory usage: 6.3+ KB


In [78]:
d.to_pickle('../Bayesian-Chlorophyll/pickleJar/d_longhurst.pkl')