In [10]:
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn import preprocessing
from ipynb.fs.full.helpers import *

In [8]:
# Read the raw dataset from the CSV file that sits next to this notebook.
df = pd.read_csv(filepath_or_buffer='leg234_data.csv')

In [None]:
# Standardise every column of `df`; the scaled frame replaces `df` in place.
# Cast all columns to float before handing the frame to the scaler.
df = df.astype(float)
print(df.dtypes)
# Remember the column labels so they can be re-attached after scaling.
names = df.columns
# Create the Scaler object
scaler = preprocessing.StandardScaler()
# Fit the scaler on the data and transform it in a single step.
# NOTE(review): the scaler is fitted on the whole frame -- including the
# 'TOTAL' column that split_data() later uses as the target -- and before the
# train/validation split. That looks like it leaks validation statistics into
# training and puts the reported metrics in standardised units; confirm this
# is intended.
scaled_df = scaler.fit_transform(df)
# Re-wrap the scaler output in a DataFrame carrying the original column labels.
df = pd.DataFrame(scaled_df, columns=names)

In [11]:
def split_data(df):
    """Split ``df`` into training and validation features/targets.

    Columns named ``'TOTAL'`` form the target; every other column is a
    feature. A first split holds out 20 % of the rows; that hold-out is
    then halved into a test part and a validation part. Only the
    validation part is returned -- the test part is dropped here.

    Both splits use ``random_state=123`` so the partition is repeatable.

    Returns:
        The tuple ``(X_train, X_val, y_train, y_val)``. The targets are
        selected as a column subset of ``df``, not as a single Series.
    """
    is_target = df.columns == 'TOTAL'
    features = df.loc[:, ~is_target]
    target = df.loc[:, is_target]

    # First cut: training rows versus a 20 % hold-out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, target, test_size=0.2, random_state=123
    )

    # Second cut: halve the hold-out; the first half (test) is not used.
    _, X_val, _, y_val = train_test_split(
        X_holdout, y_holdout, test_size=0.5, random_state=123
    )

    return X_train, X_val, y_train, y_val

def smape(A, F):
    """Symmetric mean absolute percentage error of forecasts ``F`` against ``A``.

    Computes ``mean(2 * |F - A| / (|A| + |F|))`` over all elements.

    Both inputs are flattened to 1-D float arrays first, so a list, a 1-D
    array, an ``(n, 1)`` array, a Series or a single-column DataFrame can be
    mixed freely without accidental broadcasting or index alignment.

    Args:
        A: actual values (array-like).
        F: forecast values (array-like) with as many elements as ``A``.

    Returns:
        The SMAPE as a Python float; between 0 and 2 for finite inputs.

    Raises:
        ValueError: if the inputs are empty or differ in element count.
    """
    actual = np.asarray(A, dtype=float).ravel()
    forecast = np.asarray(F, dtype=float).ravel()

    if actual.size != forecast.size:
        raise ValueError(
            "smape: A and F must have the same number of elements "
            "(got %d and %d)" % (actual.size, forecast.size)
        )
    if actual.size == 0:
        raise ValueError("smape: inputs must not be empty")

    numerator = 2.0 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # The denominator is zero only when actual and forecast are both zero,
    # i.e. a perfect prediction: count that term as 0 instead of 0/0 = NaN.
    terms = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return float(terms.mean())

# Establish a baseline
### A very simple model that predicts, for each depth, the mean biomass observed at that depth in the training data.

In [12]:
# Split once, then glue each feature frame back to its target so the
# baseline can group and look up rows by column name.
X_train, X_val, y_train, y_val = split_data(df)
df_train = pd.concat((X_train, y_train), axis='columns')
df_val = pd.concat((X_val, y_val), axis='columns')

In [None]:
# Mean TOTAL per distinct PDMEAN value in the training frame, flattened back
# into a two-column table (PDMEAN, TOTAL).
df_mean = (
    df_train
    .groupby(['PDMEAN'])['TOTAL']
    .mean()
    .reset_index()
)

In [None]:
def predict(avg, val):
    """Predict ``TOTAL`` for each row of ``val`` from per-``PDMEAN`` averages.

    Args:
        avg: DataFrame with columns ``'PDMEAN'`` and ``'TOTAL'`` holding the
            average ``TOTAL`` for each ``PDMEAN`` value. If a ``PDMEAN``
            value occurs more than once, its first row is used.
        val: DataFrame with a ``'PDMEAN'`` column to predict for.

    Returns:
        A list with one prediction per row of ``val``, in row order.
        Rows whose ``PDMEAN`` does not occur in ``avg`` get the mean of
        ``avg['TOTAL']`` instead of raising ``IndexError``.
    """
    # One lookup table instead of re-filtering `avg` for every row.
    lookup = avg.drop_duplicates(subset='PDMEAN').set_index('PDMEAN')['TOTAL']
    preds = val['PDMEAN'].map(lookup)

    # Values never seen in `avg` have no average of their own; fall back to
    # the mean of the per-PDMEAN averages for those rows.
    unseen = ~val['PDMEAN'].isin(lookup.index)
    if unseen.any():
        preds = preds.mask(unseen, avg['TOTAL'].mean())

    return preds.tolist()

In [None]:
# Baseline predictions for the validation rows, looked up from the
# per-PDMEAN training averages.
y_pred = predict(avg=df_mean, val=df_val)

In [None]:
# Score the baseline on the validation split.
# NOTE(review): if the scaling cell above has been run, TOTAL is standardised,
# so these scores are in standardised units -- confirm that is intended.
print('MAE score: ', mean_absolute_error(y_val, y_pred))
print('R2 score: ', r2_score(y_val, y_pred))
# reshape(-1, 1) lets numpy infer the row count, so this keeps working when
# the validation set size changes; both sides are passed as (n, 1) arrays so
# the printed score is a plain number.
print('Smape score: ', smape(y_val.to_numpy(), np.asarray(y_pred).reshape(-1, 1)))