### Imports
Import necessary libraries for data analysis.

In [32]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

import math
from plotly.subplots import make_subplots

### Data Loading
Read the CSV files into pandas DataFrames and preprocess the data.

In [33]:
# Load the three plankton tables; paths are relative to the notebook's directory.
# NOTE: planktons_med_filtered.csv is also written by later cells of this notebook.
df1 = pd.read_csv('../plankton_geoenrich_data/biodiv/plankton_med.csv')
df2 = pd.read_csv('./plankton_data/planktons_med.csv')
df3 = pd.read_csv('./plankton_data/planktons_med_filtered.csv')

# Give every frame a parsed 'datetime' column (df1 keeps its raw timestamps under 'eventDate')
for frame, source_column in ((df1, 'eventDate'), (df2, 'datetime'), (df3, 'datetime')):
    frame['datetime'] = pd.to_datetime(frame[source_column])

# Report (rows, columns) for each frame
for label, frame in (('df1', df1), ('df2', df2), ('df3', df3)):
    print(f"Shape of {label}:", frame.shape)

Shape of df1: (62275, 97)
Shape of df2: (62275, 9)
Shape of df3: (23408, 14)


In [3]:
# Taxa tracked in this notebook. The order matters: per-taxon columns and
# subplot titles are generated by iterating this list.
taxons = [
    "Dinophysis acuminata",
    "Karenia mikimotoi",
    "Chaetoceros",
    "Dinophysis",
    "Alexandrium minutum",
    "Pseudo-nitzschia",
]

In [4]:
# Preview the first five rows of df3
df3.head(n=5)

Unnamed: 0,index,lat,lon,unit,dataset,datetime,subset,Dinophysis acuminata,Karenia mikimotoi,Chaetoceros,Dinophysis,Alexandrium minutum,Pseudo-nitzschia,total plankton
0,33008,43.087319,5.906421,Nombre par litre,MED,1987-01-07 10:00:00,val,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,30414,43.382816,4.879229,Nombre par litre,MED,1987-01-20 10:00:00,val,0.0,0.0,100.0,0.0,0.0,0.0,100.0
2,35172,42.134665,9.540678,Nombre par litre,MED,1987-01-21 13:00:00,val,0.0,100.0,0.0,0.0,0.0,0.0,100.0
3,36633,42.076882,9.795849,Nombre par litre,MED,1987-02-02 09:00:00,val,0.0,100.0,0.0,0.0,0.0,0.0,100.0
4,35174,42.134665,9.540678,Nombre par litre,MED,1987-02-02 14:00:00,val,0.0,500.0,100.0,0.0,0.0,0.0,600.0


### Data Processing
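Sort the measurements in `df2`, group them by sample (`datetime`, `lat`, `lon`), and reduce them to one row per sample with one summed column per taxon plus a `total plankton` column.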

In [5]:
# Collapse df2 (rows carrying a `taxon` label and a `value`) into one row per
# (datetime, lat, lon) key, with one summed column per entry of `taxons`.
key_columns = ['datetime', 'lat', 'lon']

# Sorting by the keys makes all rows that share a key contiguous.
samples_sorted = df2.sort_values(by=key_columns)

# (n_rows, n_taxons) matrix: each row holds its `value` in the column of its own
# taxon and 0.0 elsewhere; rows whose taxon is not listed in `taxons` are all zeros.
row_values = samples_sorted['value'].to_numpy(dtype=float)
per_taxon = np.column_stack([
    np.where((samples_sorted['taxon'] == taxon).to_numpy(), row_values, 0.0)
    for taxon in taxons
])

# Mark the first row of every distinct key. Because the frame is sorted, each
# marked position is also the start of that key's contiguous run of rows.
is_first = ~samples_sorted.duplicated(subset=key_columns).to_numpy()
group_starts = np.flatnonzero(is_first)

# Sum every contiguous run in a single vectorized pass. The sums and the kept
# rows are derived from the same mask, so they always line up one-to-one (the
# previous groupby/drop_duplicates pairing was positional and broke on NaN keys).
taxon_sums = np.add.reduceat(per_taxon, group_starts, axis=0)

# Keep the first row of each key for the descriptive columns, then attach the sums.
df3 = samples_sorted.loc[is_first].drop(columns=['taxon', 'value'])
for position, taxon in enumerate(taxons):
    df3[taxon] = taxon_sums[:, position]
df3['total plankton'] = taxon_sums.sum(axis=1)

# Displaying the first few rows
df3.head()

Unnamed: 0,index,lat,lon,unit,dataset,datetime,subset,Dinophysis acuminata,Karenia mikimotoi,Chaetoceros,Dinophysis,Alexandrium minutum,Pseudo-nitzschia,total plankton
33008,33008,43.087319,5.906421,Nombre par litre,MED,1987-01-07 10:00:00,val,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30414,30414,43.382816,4.879229,Nombre par litre,MED,1987-01-20 10:00:00,val,0.0,0.0,100.0,0.0,0.0,0.0,100.0
35172,35172,42.134665,9.540678,Nombre par litre,MED,1987-01-21 13:00:00,val,0.0,100.0,0.0,0.0,0.0,0.0,100.0
36633,36633,42.076882,9.795849,Nombre par litre,MED,1987-02-02 09:00:00,val,0.0,100.0,0.0,0.0,0.0,0.0,100.0
35174,35174,42.134665,9.540678,Nombre par litre,MED,1987-02-02 14:00:00,val,0.0,500.0,100.0,0.0,0.0,0.0,600.0


In [16]:
# Write df3 to the filtered CSV, omitting the DataFrame index
df3.to_csv(path_or_buf='./plankton_data/planktons_med_filtered.csv', index=False)

### Data Analysis
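Summarize the per-taxon values (means, medians, and distributions of the strictly positive values) and compute the error of a constant mean predictor.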

In [34]:
# Mean of each taxon column over every row of df3 (rows with a zero count included)
for taxon in taxons:
    taxon_mean = df3[taxon].mean()
    print(taxon, taxon_mean)

Dinophysis acuminata 65.4339114832536
Karenia mikimotoi 344.9667634996582
Chaetoceros 41555.37653793575
Dinophysis 36.44066131237184
Alexandrium minutum 9363.476161995899
Pseudo-nitzschia 33925.44002050581


In [8]:
# For each taxon (same order as `taxons`), collect only the strictly positive values
taxon_values = [df3.loc[df3[taxon] > 0, taxon].tolist() for taxon in taxons]
    

In [12]:
# Two-column grid of histograms, one per taxon, of the strictly positive values in taxon_values
n_cols = 2
n_rows = math.ceil(len(taxon_values) / n_cols)
fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=taxons)

for position, values in enumerate(taxon_values):
    # divmod yields zero-based (row, col); the subplot grid is addressed from 1
    grid_row, grid_col = divmod(position, n_cols)
    histogram = go.Histogram(x=values, name=taxons[position])
    fig.add_trace(histogram, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=1000, width=1000, title_text="Distribution of plankton values")
fig.show()

In [16]:
# Mean and median of the strictly positive values, per taxon
means = [np.mean(values) for values in taxon_values]
for taxon, values, mean_value in zip(taxons, taxon_values, means):
    print("Mean for", taxon, ":", mean_value)
    print("Median for", taxon, ":", np.median(values))
    print("")


Mean for Dinophysis acuminata : 530.7266112266112
Median for Dinophysis acuminata : 200.0

Mean for Karenia mikimotoi : 6591.822040816326
Median for Karenia mikimotoi : 300.0

Mean for Chaetoceros : 157603.41121192483
Median for Chaetoceros : 10950.0

Mean for Dinophysis : 492.4959584295612
Median for Dinophysis : 200.0

Mean for Alexandrium minutum : 231202.79535864978
Median for Alexandrium minutum : 300.0

Mean for Pseudo-nitzschia : 96374.5995145631
Median for Pseudo-nitzschia : 4300.0



In [22]:
# Baseline ("bad predictor"): predict each taxon's overall mean for every row.
# `error` accumulates the mean squared error of that constant prediction, summed over taxa.
bad_predictor = df3.loc[:, list(taxons)]
error = 0.

for taxon in taxons:
    prediction_column = taxon + " Pred"
    bad_predictor[prediction_column] = bad_predictor[taxon].mean()
    residuals = bad_predictor[taxon] - bad_predictor[prediction_column]
    error += np.mean(residuals ** 2)

print("Error for bad predictor:", error)

# Values noted alongside this cell (presumably errors from earlier runs -- TODO confirm):
# 1843201262012.8
# 0793232343040.0


Error for bad predictor: 1843201262012.7842


In [31]:
# Range and histogram of 'total plankton', restricted to rows with a strictly positive total
positive_totals = df3.loc[df3['total plankton'] > 0, 'total plankton']

print("Min:", positive_totals.min())
print("Max:", positive_totals.max())

fig = go.Figure(data=[go.Histogram(x=positive_totals)])
fig.update_layout(title_text="Distribution of total plankton values")
fig.show()

Min: 1.0
Max: 180000000.0


### Numpy Array Operations
Load and manipulate numpy arrays.

In [None]:
# Load one saved array from disk
raster = np.load('./npy/plankton_med-npy/193.npy')

# Take index 0 and index 6 along the third axis
layer_0 = raster[:, :, 0]
layer_6 = raster[:, :, 6]

# Report the shape of each extracted layer
for layer_index, layer in ((0, layer_0), (6, layer_6)):
    print(f"Layer {layer_index} shape:", layer.shape)

### Perform subset division

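Assign each row to the `train`, `test`, or `val` subset based on its longitude (`lon`), using quantiles of `lon` as the split points.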

In [26]:
# Longitude-based split: 'train' up to the 0.7 quantile of lon, 'test' above it,
# then everything at or beyond the 0.9 quantile is relabelled 'val'.
lon = df3['lon']
train_test_split = lon.quantile(0.7)
val_split = lon.quantile(0.9)

df3.loc[lon <= train_test_split, 'subset'] = 'train'
df3.loc[lon > train_test_split, 'subset'] = 'test'
# Applied last so that it overrides the 'test' label on the easternmost rows
df3.loc[lon >= val_split, 'subset'] = 'val'

df3.to_csv(path_or_buf='./plankton_data/planktons_med_filtered.csv', index=False)