In [1]:
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
from sklearn import preprocessing
from matplotlib import pyplot as plt
from pingouin import multivariate_normality

In [2]:
# Load the dataset from 'leg234_data.csv' (path is relative to the notebook's
# working directory) into a DataFrame.
df = pd.read_csv('leg234_data.csv')

In [3]:
# Coerce every column to float so the scaler receives a homogeneous numeric
# frame, and show the resulting dtypes as a sanity check.
df = df.astype(float)
print(df.dtypes)

# Keep the column labels: the scaler returns a bare ndarray without them.
names = df.columns

# Standardise every column with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on the whole frame -- the 'TOTAL' target
# included -- before any train/validation split, so held-out rows influence
# the scaling statistics and all later metrics are in standardised units.
# Confirm this is intended.
scaler = preprocessing.StandardScaler()
scaled_df = scaler.fit(df).transform(df)

# Rebuild a DataFrame (with a fresh default index) around the scaled values.
df = pd.DataFrame(data=scaled_df, columns=names)

LATITUDE                        float64
LONGITUD                        float64
PDMEAN                          float64
TOTAL                           float64
sf_depth                        float64
PCO2.H2O_mmm                    float64
PCO2.CO2_umm                    float64
NMEA.Humidity                   float64
NMEA.Trykk                      float64
FerryBox.SBE45_Salinity         float64
FerryBox.Optode_Saturation      float64
FerryBox.C3_Turbidity           float64
FerryBox.C3_CHLAFluorescence    float64
FerryBox.C3_Temperature         float64
FerryBox.C3_CDOMFluorescence    float64
day                             float64
month                           float64
time                            float64
dtype: object


In [4]:
def smape(A, F):
    """Symmetric mean absolute percentage error (sMAPE) of a forecast.

    Computes ``mean(2 * |F - A| / (|A| + |F|))`` over all elements, which lies
    in ``[0, 2]`` for finite inputs.

    Parameters
    ----------
    A : array-like
        Actual (observed) values. Any shape; flattened before scoring, so a
        single-column DataFrame, an ``(n, 1)`` array and an ``(n,)`` array are
        all treated the same.
    F : array-like
        Forecast values. Must contain the same number of elements as ``A``.

    Returns
    -------
    float
        The sMAPE as a plain Python float.

    Raises
    ------
    ValueError
        If the inputs are empty or do not contain the same number of elements.
    """
    # Flatten both inputs so (n,) vs (n, 1) can never broadcast to (n, n) and
    # so pandas containers do not turn the result into a Series.
    actual = np.asarray(A, dtype=float).ravel()
    forecast = np.asarray(F, dtype=float).ravel()

    if actual.size != forecast.size:
        raise ValueError(
            "A and F must contain the same number of elements "
            "(got {} and {})".format(actual.size, forecast.size)
        )
    if actual.size == 0:
        raise ValueError("smape is undefined for empty input")

    numerator = 2.0 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # Where actual and forecast are both exactly zero the term is 0/0; the
    # forecast is perfect there, so it contributes 0 instead of NaN.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return float(terms.mean())

In [5]:
def split_data(df, target='TOTAL', random_state=123):
    """Split a frame into training and validation features/targets.

    The rows are split 80/20 into a training part and a hold-out part; the
    hold-out part is then split 50/50 into a test half and a validation half.
    Only the training and validation parts are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    target : str, default 'TOTAL'
        Name of the target column. Every other column is used as a feature.
    random_state : int, default 123
        Seed passed to both ``train_test_split`` calls so the partition is
        reproducible.

    Returns
    -------
    X_train, X_val, y_train, y_val : pandas.DataFrame
        Features and single-column target frames for the training (80 %) and
        validation (10 %) partitions.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    is_target = df.columns == target
    if not is_target.any():
        # Without this check y would silently be a frame with zero columns.
        raise KeyError("target column {!r} not found in DataFrame".format(target))

    # Boolean column masks keep y two-dimensional (a one-column DataFrame).
    y = df.iloc[:, is_target]
    X = df.iloc[:, ~is_target]

    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=random_state)

    # The test half is not returned by this function (presumably kept out of
    # model selection as an untouched hold-out -- confirm). It can be rebuilt
    # by repeating the same two seeded splits.
    _X_test, X_val, _y_test, y_val = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=random_state)

    return X_train, X_val, y_train, y_val


# Establish a baseline
### A very simple model that predicts, for each depth, the average biomass observed at that depth in the training data.

In [6]:
# Partition the scaled frame into training and validation features/targets.
X_train, X_val, y_train, y_val = split_data(df)

# Put each target column back next to its features so the baseline can group
# and look up rows in a single frame per partition.
df_train = pd.concat((X_train, y_train), axis='columns')
df_val = pd.concat((X_val, y_val), axis='columns')

In [7]:
# Baseline lookup table: mean training 'TOTAL' for every distinct 'PDMEAN'
# value, returned as a two-column frame (PDMEAN, TOTAL) with a default index.
df_mean = (
    df_train
    .groupby(['PDMEAN'])['TOTAL']
    .mean()
    .reset_index()
)

In [8]:
# Display the per-PDMEAN mean table built from the training partition.
df_mean

Unnamed: 0,PDMEAN,TOTAL
0,-1.672024,-0.478082
1,-1.635900,-0.259161
2,-1.587735,0.100898
3,-1.539570,0.380112
4,-1.491405,0.420310
...,...,...
68,1.591175,0.418352
69,1.639340,0.478976
70,1.687506,0.503190
71,1.735671,0.365988


In [9]:
def predict(avg, val, default=None):
    """Predict 'TOTAL' for each row of ``val`` by looking up its 'PDMEAN'.

    Parameters
    ----------
    avg : pandas.DataFrame
        Lookup table with columns 'PDMEAN' and 'TOTAL'. If a 'PDMEAN' value
        occurs more than once, the first occurrence is used.
    val : pandas.DataFrame
        Rows to predict for; must contain a 'PDMEAN' column unless it is empty.
    default : float, optional
        Prediction to use for rows whose 'PDMEAN' does not occur in ``avg``.
        When left as ``None`` such rows raise ``IndexError``.

    Returns
    -------
    list
        One prediction per row of ``val``, in row order.

    Raises
    ------
    IndexError
        If ``default`` is ``None`` and some 'PDMEAN' value of ``val`` is
        missing from ``avg``.
    """
    if len(val) == 0:
        return []

    # One lookup Series keyed by PDMEAN replaces a boolean-mask scan of `avg`
    # per row. keep='first' mirrors taking the first matching row.
    lookup = (
        avg.drop_duplicates(subset='PDMEAN', keep='first')
        .set_index('PDMEAN')['TOTAL']
    )

    depths = val['PDMEAN']
    # NaN never equals anything under ==, so it is treated as unknown even if
    # the lookup index happens to contain NaN.
    known = (depths.notna() & depths.isin(lookup.index)).to_numpy()

    if default is None and not known.all():
        unseen = depths[~known].unique()
        raise IndexError(
            "{} distinct PDMEAN value(s) in `val` have no entry in `avg` "
            "(e.g. {}); pass `default=` to fall back to a fixed prediction"
            .format(len(unseen), list(unseen[:5]))
        )

    y_pred = depths.map(lookup)
    if default is not None:
        y_pred = y_pred.where(known, default)

    return y_pred.tolist()

In [10]:
# Baseline predictions for the validation rows: each row gets the training
# mean 'TOTAL' stored in df_mean for its 'PDMEAN' value.
y_pred = predict(df_mean, df_val)

In [11]:
# Score the baseline on the validation partition. df was standardised before
# the split, so these values are in standardised units.
print('MAE score: ', mean_absolute_error(y_val, y_pred))
print('R2 score: ', r2_score(y_val, y_pred))
# reshape(-1, 1) turns the prediction list into a column matching y_val's
# (n, 1) layout for any number of rows, instead of a hard-coded row count.
print('Smape score: ', smape(y_val, np.array(y_pred).reshape(-1, 1)))

MAE score:  0.49828110139033344
R2 score:  0.07436671448936549
Smape score:  TOTAL    1.213488
dtype: float64
