In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

## Get Data

In [6]:
# Load the filtered plankton table; the CSV's 'index' column becomes the row index.
csv_path = '../plankton_data/planktons_med_filtered.csv'
df = pd.read_csv(csv_path, index_col='index')

### Aggregating data

In [14]:
# Build the f0..f{k-1} feature columns from the per-sample .npy arrays.
# For every row label in df.index the matching array is loaded and reduced to:
#   * the mean over the first two axes of the whole array, and
#   * the mean over the first two axes of the central 2x2 window [15:17, 15:17].
# The original note says each array has shape (32, 32, 8), which gives
# 8 + 8 = 16 features per sample.
#
# Rows are collected first and written column-by-column afterwards: assigning
# one scalar at a time through df.loc is very slow and fragments the frame.
# The per-sample vector is deliberately NOT called `features`, so re-running
# this cell cannot clobber the list of column names defined further down.
feature_rows = []
for index in df.index:
    npy = np.load(f'../npy/plankton_med-npy-norm/{index}.npy')
    mean = np.mean(npy, axis=(0, 1))
    center_mean = np.mean(npy[15:17, 15:17], axis=(0, 1))
    feature_rows.append(np.concatenate([mean, center_mean]))

if feature_rows:  # an empty frame gets no feature columns, as before
    # float64 matches the columns produced by the former cell-wise assignment.
    feature_matrix = np.vstack(feature_rows).astype(np.float64)
    # Plain arrays are assigned positionally, so row order == df.index order.
    for col_pos in range(feature_matrix.shape[1]):
        df[f'f{col_pos}'] = feature_matrix[:, col_pos]
    

In [17]:
# Names of the 16 aggregated feature columns: 'f0' ... 'f15'.
features = ['f' + str(k) for k in range(16)]

In [38]:
# Taxon names of interest. 'Pseudo-nitzschia' is the one used as the regression
# target in this notebook; presumably the others are also columns of df (TODO confirm).
taxons = [
    "Dinophysis acuminata", "Karenia mikimotoi",
    "Chaetoceros", "Dinophysis",
    "Alexandrium minutum", "Pseudo-nitzschia",
]

In [39]:
# Preview the first rows: target column first, followed by the feature columns.
preview_columns = ['Pseudo-nitzschia', *features]
df[preview_columns].head()

Unnamed: 0_level_0,Pseudo-nitzschia,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
33008,0.0,0.097599,0.186053,0.153239,0.087633,0.467114,0.11845,0.1108,0.835807,0.114929,0.186053,0.153239,0.087633,0.467114,0.11845,0.10646,0.841798
30414,0.0,0.058595,0.186053,0.153239,0.087633,0.467114,0.11845,0.080034,0.851737,0.041533,0.186053,0.153239,0.087633,0.467114,0.11845,0.064796,0.831095
35172,0.0,0.108287,0.186053,0.153239,0.087633,0.467114,0.11845,0.117305,0.82259,0.110903,0.186053,0.153239,0.087633,0.467114,0.11845,0.107477,0.815026
36633,0.0,0.10658,0.186053,0.153239,0.087633,0.467114,0.11845,0.119321,0.813532,0.105801,0.186053,0.153239,0.087633,0.467114,0.11845,0.119617,0.77701
35174,0.0,0.103183,0.186053,0.153239,0.087633,0.467114,0.11845,0.113857,0.829036,0.113048,0.186053,0.153239,0.087633,0.467114,0.11845,0.107242,0.83136


### Splitting data

In [45]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X, y = df[features], df['Pseudo-nitzschia']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Linear model

In [46]:
# Ordinary least-squares baseline.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
# Last expression of the cell: the model's score() on the held-out split.
lr.score(X_test, y_test)

0.014880306594328485

In [47]:
def mle(y_true, y_pred):
    """Mean log error: mean of ``log(max(1, |y_true - y_pred|))``.

    Absolute errors below 1 are clipped to 1, so they contribute ``log(1) = 0``
    rather than a negative (or ``-inf``) term.

    Both inputs are converted to float arrays first, so the comparison is
    strictly positional. Without this, two pandas Series carrying different
    indexes would be aligned by label and silently produce NaN, and plain
    Python lists would not support the subtraction at all.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        The mean clipped log error (``nan`` for empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_diff = np.abs(y_true - y_pred)
    # Clip before the log so sub-unit errors do not go negative / to -inf.
    clipped = np.maximum(1, abs_diff)
    return np.mean(np.log(clipped))

In [48]:
# Mean squared error of the linear model on the test split.
# y_pred is kept as a notebook-level variable because the next cell reads it.
y_pred = lr.predict(X_test)
lr_mse = mean_squared_error(y_test, y_pred)
print(lr_mse)

62948941676.85719


In [49]:
# Mean log error on the test split, using whatever y_pred currently holds
# (the linear-model predictions when the cells are run in order).
lr_mle = mle(y_test, y_pred)
print(lr_mle)

10.137739911012691


In [50]:
# Print the first few test-set predictions next to the observed values.
# A single predict() call on a DataFrame slice replaces ten one-row calls on a
# list holding a Series, which discarded the column names the model was fitted
# with. Slicing, unlike range(10), also tolerates a test set shorter than 10 rows.
n_preview = 10
preview_X = X_test.iloc[:n_preview]
preview_y = y_test.iloc[:n_preview]
for pred, real in zip(lr.predict(preview_X), preview_y):
    print(f'prediction : {pred}, real : {real}')

prediction : 107332.84204941755, real : 29700.0
prediction : 43152.70474918005, real : 0.0
prediction : 22503.702243103253, real : 0.0
prediction : 26608.858891433214, real : 0.0
prediction : 15263.866066669361, real : 10500.0
prediction : 44035.514039297705, real : 0.0
prediction : 38698.306999302, real : 0.0
prediction : -32725.51093679377, real : 0.0
prediction : 24270.26930359722, real : 5000.0
prediction : -37410.829246320485, real : 0.0




## Random forest

In [32]:
from sklearn.ensemble import RandomForestRegressor

In [51]:
# Random forest with 100 trees, depth capped at 10, and a fixed seed.
# fit() returns the estimator itself, so construction and fitting can be chained.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train, y_train)
# Last expression of the cell: the model's score() on the held-out split.
rf.score(X_test, y_test)

-0.5623497937831046

In [52]:
# Mean squared error of the random forest on the test split.
# y_pred is overwritten here with the forest's predictions; the next cell reads it.
y_pred = rf.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred)
print(rf_mse)

99833823956.66185


In [53]:
# Mean log error on the test split, using whatever y_pred currently holds
# (the random-forest predictions when the cells are run in order).
rf_mle = mle(y_test, y_pred)
print(rf_mle)

9.347150558766778
