In [2]:
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np
from sklearn import preprocessing
from matplotlib import pyplot as plt

In [87]:
df = pd.read_csv('leg234_data.csv')
#Flip wrong datapoints
df['LATITUDE'] = np.where((df['LATITUDE']>=10) & (df['date'] >= '2022-02-13'), -df['LATITUDE'], df['LATITUDE'])
df = df.drop(columns=['NMEA.Wave_Height', 'PCO2.H2O_mmm', 'PCO2.atm_cond', 'NMEA.Wind_Speed', 'NMEA.Wind_Angle', 'date'])
df = df.dropna(subset = ['TOTAL'])


In [88]:
def smape(A, F):
    return 1/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [89]:
def split_data(df):
    y = df.iloc[:, df.columns == 'TOTAL']
    X = df.iloc[:, df.columns != 'TOTAL']

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)

    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, 
        test_size=0.25, random_state=123) 
    

    return X_train, X_val, y_train, y_val


# Establish a baseline
### Very simple model simply returning the average of the biomass at a depth based on the training data. 

In [90]:
X_train, X_val, y_train, y_val = split_data(df)
df_train = pd.concat([X_train, y_train], axis=1)
df_val = pd.concat([X_val, y_val], axis=1)

In [91]:
df_mean = df_train.groupby(['PDMEAN'])['TOTAL'].mean()
df_mean = df_mean.reset_index()

In [92]:
df_mean

Unnamed: 0,PDMEAN,TOTAL
0,7.5,0.000000
1,15.0,21.259855
2,25.0,56.064154
3,35.0,84.623683
4,45.0,87.818252
...,...,...
68,685.0,87.942734
69,695.0,96.413419
70,705.0,92.205449
71,715.0,80.999640


In [93]:
total=df_mean.loc[df['PDMEAN'] == 25, 'TOTAL'].iloc[0]
total

56.064154052882614

In [95]:
def predict(avg, val):
    y_pred = []
    for index, row in val.iterrows():
        pred = avg.loc[(avg['PDMEAN'] == row['PDMEAN']), 'TOTAL'].iloc[0]
        y_pred.append(pred)
                   
    return y_pred
        
        

In [96]:
y_pred = predict(df_mean, df_val)

In [105]:
print('MAE score: ', mean_absolute_error(y_val, y_pred))
print('R2 score: ', r2_score(y_val, y_pred))
print('Smape score: ', smape(y_val, np.array(y_pred).reshape(53358,1)))

MAE score:  48.23614013074081
R2 score:  0.07401743644772274
Smape score:  TOTAL    1.01848
dtype: float64
