In [1]:
import pandas as pd
import numpy as np
import geoenrich
from geoenrich import dataloader, enrichment, satellite, exports
import pathlib

# Let pandas render up to 200 rows before truncating displayed frames.
pd.options.display.max_rows = 200

In [2]:
# Read plankton_med.csv from the geoenrich biodiv folder into `df`.
# NOTE(review): a later cell rebinds `df` to a different CSV
# (./plankton_data/planktons_med.csv) — confirm which one is intended.
df = pd.read_csv('../plankton_geoenrich_data/biodiv/plankton_med.csv')

In [4]:
# Names of the environmental variables of interest
# (presumably geoenrich variable ids — TODO confirm against the geoenrich catalog).
variables = [
    'sst', 'nh4_med', 'no3_med', 'po4_med',
    'o2_med', 'chl_med', 'thetao_med', 'so_med',
]

In [5]:
# Display the dimensions of df as (n_rows, n_columns).
df.shape

(62275, 96)

In [3]:
# Read the plankton observations CSV into `df`
# (the displayed rows show one taxon measurement per row).
df = pd.read_csv('./plankton_data/planktons_med.csv')

In [4]:
# Convert the 'datetime' column (text after read_csv) to pandas datetimes,
# which the sorting/grouping cells below rely on for chronological order.
df['datetime'] = pd.to_datetime(df['datetime'])

In [5]:
# Order the observations by sampling time, then latitude, then longitude
# (ascending), so rows of the same sample end up next to each other.
df = df.sort_values(['datetime', 'lat', 'lon'], ascending=True)

In [8]:
# Taxa to extract. The position of a name in this list is the position of its
# count in every per-sample vector built further down.
taxons = [
    "Dinophysis acuminata", "Karenia mikimotoi", "Chaetoceros",
    "Dinophysis", "Alexandrium minutum", "Pseudo-nitzschia",
]

In [9]:
# Preview the first 30 rows of df.
df.head(30)

Unnamed: 0,index,taxon,lat,lon,value,unit,dataset,datetime,subset
33008,33008,Dinophysis acuminata,43.087319,5.906421,0.0,Nombre par litre,MED,1987-01-07 10:00:00,val
30414,30414,Chaetoceros,43.382816,4.879229,100.0,Nombre par litre,MED,1987-01-20 10:00:00,val
30415,30415,Dinophysis acuminata,43.382816,4.879229,0.0,Nombre par litre,MED,1987-01-20 10:00:00,val
35172,35172,Dinophysis acuminata,42.134665,9.540678,0.0,Nombre par litre,MED,1987-01-21 13:00:00,val
35173,35173,Karenia mikimotoi,42.134665,9.540678,100.0,Nombre par litre,MED,1987-01-21 13:00:00,val
36633,36633,Dinophysis acuminata,42.076882,9.795849,0.0,Nombre par litre,MED,1987-02-02 09:00:00,val
36634,36634,Karenia mikimotoi,42.076882,9.795849,100.0,Nombre par litre,MED,1987-02-02 09:00:00,val
35174,35174,Chaetoceros,42.134665,9.540678,100.0,Nombre par litre,MED,1987-02-02 14:00:00,val
35175,35175,Dinophysis acuminata,42.134665,9.540678,0.0,Nombre par litre,MED,1987-02-02 14:00:00,val
35176,35176,Karenia mikimotoi,42.134665,9.540678,500.0,Nombre par litre,MED,1987-02-02 14:00:00,val


In [10]:
def _taxon_totals(rows):
    """Sum ``value`` per entry of ``taxons`` for the rows of one sample.

    Parameters
    ----------
    rows : pandas.DataFrame
        All rows of ``df`` sharing one (datetime, lat, lon) key; must provide
        the ``taxon`` and ``value`` columns.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(taxons)``; element ``k`` is the sum of
        ``value`` over the rows whose taxon equals ``taxons[k]`` (0.0 when the
        taxon does not occur). Rows whose taxon is not listed contribute nothing.
    """
    names = rows["taxon"].to_numpy(dtype=object)
    counts = rows["value"].to_numpy()
    wanted = np.array(taxons, dtype=object)
    # (n_rows, n_taxons) matrix holding each row's value in the column of its
    # own taxon and 0.0 elsewhere. This is the same matrix the former per-row
    # iterrows() loop built, obtained with a single vectorised comparison.
    per_row = np.where(names[:, None] == wanted[None, :], counts[:, None], 0.0)
    return per_row.sum(axis=0)


# One count vector per sample: a Series keyed by (datetime, lat, lon) whose
# values are arrays ordered like `taxons`.
index = df.groupby(['datetime', 'lat', 'lon']).apply(_taxon_totals)

In [11]:
# df2: df reduced to one row per sample. Rows are ordered by the sample key and
# only the first row of every (datetime, lat, lon) combination is kept.
df2 = (
    df
    .sort_values(['datetime', 'lat', 'lon'])
    .drop_duplicates(['datetime', 'lat', 'lon'])
)

# Attach the per-sample count vectors by position: `index` comes from a groupby
# on the same three keys, so both are laid out in the same key order.
df2['taxons'] = index.to_numpy().tolist()


In [12]:
# Spread each count vector into one column per taxon; a taxon's column takes
# the vector element found at that taxon's position in `taxons`.
for position, taxon in enumerate(taxons):
    df2[taxon] = df2['taxons'].apply(lambda counts, position=position: counts[position])

In [14]:
# Remove, in place, the long-format columns and the intermediate vector column
# now that every taxon has a column of its own.
df2.drop(['taxon', 'taxons', 'value'], axis='columns', inplace=True)

In [16]:
# Write df2 to CSV; index=False keeps the row index out of the file.
df2.to_csv('./plankton_data/planktons_med_filtered.csv', index=False)

In [35]:
# Presence/absence version of `index`: every count > 0 becomes 1, everything
# else (including NaN) becomes 0, keeping each array's dtype.
# A new array is built per sample. The previous `index_bool = index` only
# aliased the Series, so binarising element by element also overwrote the raw
# counts stored in `index`.
index_bool = index.apply(lambda counts: (counts > 0).astype(counts.dtype))

In [40]:
# Per sample, add up the entries of its index_bool vector.
index_sum = index_bool.map(lambda presence: presence.sum())

In [44]:
# Integer position (not the label) of the largest entry of index_sum.
index_sum.argmax()

5998

In [52]:
# Look up the (datetime, lat, lon) key stored at position 5998 of the index;
# 5998 is the position index_sum.argmax() returned when this notebook was run.
index_bool.index[5998]
# Result at the time of writing:
# (Timestamp('2000-02-14 10:10:00'), 43.0873189621, 5.9064212393)

(Timestamp('2000-02-14 10:10:00'), 43.0873189621, 5.9064212393)

In [53]:
# Show every df row belonging to one sample, selected by exact match on its
# timestamp, latitude and longitude.
df[
    (df['datetime'] == '2000-02-14 10:10:00')
    & (df['lat'] == 43.0873189621)
    & (df['lon'] == 5.9064212393)
]

Unnamed: 0,index,taxon,lat,lon,value,unit,dataset,datetime,subset
34048,34048,Alexandrium minutum,43.087319,5.906421,700.0,Nombre par litre,MED,2000-02-14 10:10:00,val
34049,34049,Chaetoceros,43.087319,5.906421,64800.0,Nombre par litre,MED,2000-02-14 10:10:00,val
34050,34050,Dinophysis,43.087319,5.906421,100.0,Nombre par litre,MED,2000-02-14 10:10:00,val
34051,34051,Karenia mikimotoi,43.087319,5.906421,1900.0,Nombre par litre,MED,2000-02-14 10:10:00,val
34052,34052,Pseudo-nitzschia,43.087319,5.906421,12500.0,Nombre par litre,MED,2000-02-14 10:10:00,val


In [56]:
# Taxa list restated in this cell; its order gives the meaning of each
# position in the count vector displayed below.
taxons = [
    "Dinophysis acuminata", "Karenia mikimotoi", "Chaetoceros",
    "Dinophysis", "Alexandrium minutum", "Pseudo-nitzschia",
]

# Count vector of the sample at integer position 5998, ordered like `taxons`.
# `.iloc` makes the positional lookup explicit: `index` is keyed by
# (datetime, lat, lon), so a bare `index[5998]` only worked through pandas'
# deprecated "integer key falls back to position" behaviour.
index.iloc[5998]

array([    0.,  1900., 64800.,   100.,   700., 12500.])

In [57]:
# Display df (pandas renders a truncated view for a frame this long).
df

Unnamed: 0,index,taxon,lat,lon,value,unit,dataset,datetime,subset
33008,33008,Dinophysis acuminata,43.087319,5.906421,0.0,Nombre par litre,MED,1987-01-07 10:00:00,val
30414,30414,Chaetoceros,43.382816,4.879229,100.0,Nombre par litre,MED,1987-01-20 10:00:00,val
30415,30415,Dinophysis acuminata,43.382816,4.879229,0.0,Nombre par litre,MED,1987-01-20 10:00:00,val
35172,35172,Dinophysis acuminata,42.134665,9.540678,0.0,Nombre par litre,MED,1987-01-21 13:00:00,val
35173,35173,Karenia mikimotoi,42.134665,9.540678,100.0,Nombre par litre,MED,1987-01-21 13:00:00,val
...,...,...,...,...,...,...,...,...,...
44792,44792,Dinophysis,43.060336,3.075940,0.0,Nombre par litre,MED,2021-12-27 13:20:00,train
58387,58387,Dinophysis,43.348598,4.694981,0.0,Nombre par litre,MED,2021-12-27 13:20:00,val
59600,59600,Dinophysis,43.489290,5.122630,200.0,Nombre par litre,MED,2021-12-27 15:15:00,val
49612,49612,Dinophysis,43.556311,4.035876,100.0,Nombre par litre,MED,2021-12-28 09:10:00,test


In [8]:
# Split on the median longitude: rows with lon <= median are labelled 'train',
# every other row (lon > median, or lon missing) is labelled 'test'.
df['subset'] = np.where(df['lon'].le(df['lon'].median()), 'train', 'test')

# Save the relabelled table back to the plankton CSV, without the row index.
df.to_csv('./plankton_data/planktons_med.csv', index=False)

In [9]:
# Rows whose longitude lies strictly above the 80th percentile are relabelled
# 'val', overriding whatever subset they had.
perc_80 = df['lon'].quantile(q=0.8)

df.loc[df['lon'].gt(perc_80), 'subset'] = 'val'

In [10]:
# Write df back to the plankton CSV (same path it was read from), without the
# row index, so the current 'subset' labels are persisted.
df.to_csv('./plankton_data/planktons_med.csv', index=False)

In [5]:
# output = exports.retrieve_data("plankton", 3897, 'oxygen', shape = 'buffer')
# values = exports.export_to_array(output)

# exports.collate_npy(ds_ref = 'plankton_med', data_path = './npy', slice = (0, 10000))

In [11]:
# Load one saved array (193.npy) from the plankton_med .npy folder.
raster = np.load('./npy/plankton_med-npy/193.npy')

In [12]:
# Inspect the 2-D slice at position 0 of the third axis
# (presumably one layer per enrichment variable — TODO confirm which one this is).
raster[:,:,0]

array([[         nan,          nan,          nan, ..., 297.44924927,
        297.58990479, 297.63739014],
       [         nan,          nan,          nan, ..., 297.29650879,
        297.39498901, 297.40649414],
       [         nan,          nan,          nan, ..., 297.19326782,
        297.22668457, 297.18890381],
       ...,
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan]])

In [13]:
# Inspect the 2-D slice at position 6 of the third axis
# (presumably one layer per enrichment variable — TODO confirm which one this is).
raster[:,:,6]

array([[        nan,         nan,         nan, ..., 23.96222496,
        24.19167137, 24.26071167],
       [        nan,         nan,         nan, ..., 23.64068794,
        23.89012718, 23.97191238],
       [        nan,         nan,         nan, ..., 23.58297157,
        23.71512794, 23.69724846],
       ...,
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]])

In [3]:
import numpy as np
# Load a single .npy array from the GBIF export directory.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# /mounts/Datasets4 is mounted; consider a configurable data directory.
arr = np.load('/mounts/Datasets4/DeepOcean/npy-norm/gbif_10779164-norm-npy/1020055146.npy')