# 4. Model Development

## Pre-processing

In [1]:
# Libraries imported for this notebook.

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
#import matplotlib.pyplot as plt
#from sklearn.model_selection import TimeSeriesSplit, cross_val_score
#from statsmodels.tsa.arima.model import ARIMA
#from sklearn.metrics import r2_score

In [2]:
# Read Lake_Fictitious_AquaRealTime_Data_Pre_Process_Train into a dataframe, formatted dates, and indexed dates.

df = pd.read_excel('Lake_Fictitious_AquaRealTime_Data_Cleaned_Pre_Process_Train.xlsx', parse_dates=True, index_col=[0])

In [3]:
# Looked at shape

df.shape

(53829, 135)

In [4]:
# Defined Xs and ys.

# Original features 
X_orig = df[[col for col in df.columns if '_' not in col and col != 'phycocyanin']].copy()

# 1st order differenced versions of metrics
X_diff = df.loc[:, df.columns.str.contains('1st_') & ~df.columns.str.contains('_lag_')]

# 1st order differenced and lag 1 versions of metrics
X_diff_lag_1 = df.loc[:, df.columns.str.contains('1st_') & (~df.columns.str.contains('_lag_') | df.columns.str.contains('_lag_1'))] 

# 1st order differenced and lag 1-3 versions of metrics
X_diff_lag_3 = df.loc[:, df.columns.str.contains('1st_') & (~df.columns.str.contains('_lag_') | df.columns.str.contains('_lag_1') | df.columns.str.contains('_lag_2') | df.columns.str.contains('_lag_3'))]

# 1st order differenced and lag 1-5 versions of metrics
X_diff_lag_5 = df.loc[:, df.columns.str.contains('_')]

# Target variable
y = df['phycocyanin']

In [5]:
# Created an empty list to store RMSE values for all models.

errors = []

In [7]:
# Created function to choose X and model.

def fit_model(X, y, model, model_name):
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Calculate RMSE for the model
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Root Mean Squared Error (RMSE) for {model_name}: {rmse}")

    # Append RMSE to 'errors' list
    errors.append((model_name, rmse))

## Baseline Model 

In [6]:
# Created a baseline model.

# Calculated the predicted phycocyanin values for each half-hour interval
predicted_phycocyanin = np.array(df['1st_phycocyanin_lag_1'])

# Shifted the actual phycocyanin values by one interval to align with predictions
actual_phycocyanin = np.array(df['1st_phycocyanin'])

# Calculated RMSE for the baseline model
rmse_base = np.sqrt(mean_squared_error(actual_phycocyanin, predicted_phycocyanin))
print("Root Mean Squared Error (RMSE) for the baseline model:", rmse_base)

# Appended RMSE to 'errors' lists
errors.append(("Base", rmse_base))

Root Mean Squared Error (RMSE) for the baseline model: 48.37207406889623


## Linear Regression

In [9]:
fit_model(X_diff, y, LinearRegression(), "Lin Reg Lag 0")

Root Mean Squared Error (RMSE) for Lin Reg Lag 0: 129.6250152422449


In [10]:
fit_model(X_diff_lag_1, y, LinearRegression(), "Lin Reg Lag 1")

Root Mean Squared Error (RMSE) for Lin Reg Lag 1: 128.63595707419233


In [11]:
fit_model(X_diff_lag_3, y, LinearRegression(), "Lin Reg Lag 3")

Root Mean Squared Error (RMSE) for Lin Reg Lag 3: 127.9857230388386


In [12]:
fit_model(X_diff_lag_5, y, LinearRegression(), "Lin Reg Lag 5")

Root Mean Squared Error (RMSE) for Lin Reg Lag 5: 127.5509724272718


The error gets better as I add lags but still not as good as the baseline model!

## Random Forest

In [13]:
fit_model(X_diff, y, RandomForestRegressor(), "Lin Reg Lag 0")

Root Mean Squared Error (RMSE) for Lin Reg Lag 0: 87.62194097035005


In [14]:
fit_model(X_diff_lag_1, y, RandomForestRegressor(), "Lin Reg Lag 1")

Root Mean Squared Error (RMSE) for Lin Reg Lag 1: 77.57797893692965


In [15]:
fit_model(X_diff_lag_3, y, RandomForestRegressor(), "Lin Reg Lag 3")

Root Mean Squared Error (RMSE) for Lin Reg Lag 3: 77.13867158706833


In [16]:
fit_model(X_diff_lag_5, y, RandomForestRegressor(), "Lin Reg Lag 5")

Root Mean Squared Error (RMSE) for Lin Reg Lag 5: 77.26525827108155


The errors are better than linear regression but still not as good as the baseline model!

## LightGBM

In [17]:
fit_model(X_diff, y, lgb.LGBMRegressor(), "Lin Reg Lag 0")

NameError: name 'lgb' is not defined