### Imports
Import necessary libraries for data analysis.

In [1]:
import pandas as pd
import numpy as np

### Data Loading
Read the CSV files into pandas DataFrames and parse their date columns into datetime objects.

In [20]:
# Load the three plankton tables used in this notebook.
# NOTE: df3 is read from the same path that later cells write with
# df2.to_csv, so that file must already exist when this cell runs.
df1 = pd.read_csv('../plankton_geoenrich_data/biodiv/plankton_med.csv')
df2 = pd.read_csv('./plankton_data/planktons_med.csv')
df3 = pd.read_csv('./plankton_data/planktons_med_filtered.csv')

# Give every table a parsed 'datetime' column; df1 derives it from 'eventDate'
df1['datetime'] = pd.to_datetime(df1['eventDate'])
for frame in (df2, df3):
    frame['datetime'] = pd.to_datetime(frame['datetime'])

# Report the (rows, columns) shape of each table
for label, frame in (
    ("Shape of df1:", df1),
    ("Shape of df2:", df2),
    ("Shape of df3:", df3),
):
    print(label, frame.shape)

Shape of df1: (62275, 97)
Shape of df2: (62275, 9)
Shape of df3: (23408, 14)


In [14]:
# Taxon names tracked in this notebook. Order is significant: the per-taxon
# count columns are created by iterating over this list.
taxons = [
    "Dinophysis acuminata", "Karenia mikimotoi", "Chaetoceros",
    "Dinophysis", "Alexandrium minutum", "Pseudo-nitzschia",
]

In [21]:
# Preview the first five rows of the table loaded from planktons_med_filtered.csv
df3.head(n=5)

Unnamed: 0,index,lat,lon,unit,dataset,datetime,subset,Dinophysis acuminata,Karenia mikimotoi,Chaetoceros,Dinophysis,Alexandrium minutum,Pseudo-nitzschia,total plankton
0,33008,43.087319,5.906421,Nombre par litre,MED,1987-01-07 10:00:00,val,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30414,43.382816,4.879229,Nombre par litre,MED,1987-01-20 10:00:00,test,0.0,0.0,100.0,0.0,0.0,0.0,100.0
2,35172,42.134665,9.540678,Nombre par litre,MED,1987-01-21 13:00:00,val,0.0,100.0,0.0,0.0,0.0,0.0,100.0
3,36633,42.076882,9.795849,Nombre par litre,MED,1987-02-02 09:00:00,val,0.0,100.0,0.0,0.0,0.0,0.0,100.0
4,35174,42.134665,9.540678,Nombre par litre,MED,1987-02-02 14:00:00,val,0.0,500.0,100.0,0.0,0.0,0.0,600.0


### Data Processing
Perform data grouping, sorting, and filtering operations.

In [15]:
# Collapse df2 from one row per (sample, taxon) observation to one row per
# sample, where a sample is identified by its (datetime, lat, lon) triple.
# NOTE: this cell consumes df2's 'taxon' and 'value' columns and rebinds df2,
# so re-run the data-loading cell before running it a second time.
sample_keys = ['datetime', 'lat', 'lon']

df2 = df2.sort_values(by=sample_keys)

# One column per tracked taxon: the row's count where its taxon matches the
# name exactly, 0.0 otherwise. Casting to float keeps the totals floating-point.
observed = df2['value'].astype(float)
per_taxon = pd.DataFrame(
    {taxon: observed.where(df2['taxon'] == taxon, 0.0) for taxon in taxons},
    index=df2.index,
)

# Sum within each sample and broadcast the totals back onto every row.
# transform() preserves df2's index, so the column assignments below align by
# label instead of relying on two separately ordered results lining up by
# position. pandas' grouped sum skips missing values.
sample_totals = per_taxon.groupby([df2[key] for key in sample_keys]).transform('sum')

# Keep one row per sample and replace the long-format columns with the totals
df2 = df2.drop_duplicates(subset=sample_keys).drop(columns=['taxon', 'value'])
for taxon in taxons:
    df2[taxon] = sample_totals[taxon]
df2['total plankton'] = df2[taxons].sum(axis=1)

# Displaying the first few rows
df2.head()

Unnamed: 0,index,lat,lon,unit,dataset,datetime,subset,Dinophysis acuminata,Karenia mikimotoi,Chaetoceros,Dinophysis,Alexandrium minutum,Pseudo-nitzschia,total plankton
33008,33008,43.087319,5.906421,Nombre par litre,MED,1987-01-07 10:00:00,val,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30414,30414,43.382816,4.879229,Nombre par litre,MED,1987-01-20 10:00:00,val,0.0,0.0,100.0,0.0,0.0,0.0,100.0
35172,35172,42.134665,9.540678,Nombre par litre,MED,1987-01-21 13:00:00,val,0.0,100.0,0.0,0.0,0.0,0.0,100.0
36633,36633,42.076882,9.795849,Nombre par litre,MED,1987-02-02 09:00:00,val,0.0,100.0,0.0,0.0,0.0,0.0,100.0
35174,35174,42.134665,9.540678,Nombre par litre,MED,1987-02-02 14:00:00,val,0.0,500.0,100.0,0.0,0.0,0.0,600.0


In [16]:
# Save the per-sample table; index=False leaves the row labels out of the CSV
df2.to_csv(path_or_buf='./plankton_data/planktons_med_filtered.csv', index=False)

### Data Analysis
Perform additional data analysis.

### Numpy Array Operations
Load and manipulate numpy arrays.

In [None]:
# Load the array stored in 193.npy
raster = np.load('./npy/plankton_med-npy/193.npy')

# Take indices 0 and 6 along the third axis
layer_0 = raster[:, :, 0]
layer_6 = raster[:, :, 6]

# Report the shape of each extracted slice
for label, layer in (("Layer 0 shape:", layer_0), ("Layer 6 shape:", layer_6)):
    print(label, layer.shape)

### Perform subset division

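Assign each sample to a subset by longitude: `train` at or below the median, `test` above the median, and `val` at or above the 80th percentile (which overrides the earlier label).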

In [17]:
# Split samples by longitude. Labels are applied in order, so a later mask
# overrides an earlier one: rows at or above the 80th percentile end up as
# 'val' whatever they were marked before. Rows matching no mask keep their
# current 'subset' value.
lon = df2['lon']
median = lon.median()
perc_80 = lon.quantile(0.8)

for mask, label in (
    (lon <= median, 'train'),
    (lon > median, 'test'),
    (lon >= perc_80, 'val'),
):
    df2.loc[mask, 'subset'] = label

# Write the table, now carrying the new subset labels, to the filtered CSV
df2.to_csv('./plankton_data/planktons_med_filtered.csv', index=False)