In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
import xgboost as xgb
from lightgbm import LGBMRegressor, early_stopping
from xgboost import XGBRegressor
from scipy import stats

In [2]:
df = pd.read_table('CAF357.txt', sep = '\t', na_values= 'NA')
first_valid_index = df.iloc[:, 2:].notna().any(axis=1).idxmax()
df_trimmed_top = df.iloc[first_valid_index:].reset_index(drop=True)
df_trimmed_top.head()

Unnamed: 0,Location,Date,VW_30cm,VW_60cm,VW_90cm,VW_120cm,VW_150cm,T_30cm,T_60cm,T_90cm,T_120cm,T_150cm
0,CAF357,05/19/2009,0.299,0.321,0.359,0.331,0.314,11.87,10.1,8.7,7.61,7.09
1,CAF357,05/20/2009,0.301,0.325,0.363,0.335,0.32,11.22,10.09,8.75,7.61,7.0
2,CAF357,05/21/2009,,,,,,,,,,
3,CAF357,05/22/2009,,0.328,0.368,0.342,0.327,12.1,10.11,9.06,7.94,7.22
4,CAF357,05/23/2009,,0.329,0.369,0.343,0.327,12.25,10.4,9.16,8.03,7.32


In [4]:
last_valid_index = df_trimmed_top.iloc[:, 2:].notna().any(axis=1)[::-1].idxmax()
df_final_trimmed = df_trimmed_top.iloc[:last_valid_index + 1].reset_index(drop=True)
df_final_trimmed.to_csv('CAF357_trimmed_test.csv', index=False)

In [5]:
df_final_trimmed.isnull().sum()

Location      0
Date          0
VW_30cm     487
VW_60cm     267
VW_90cm     367
VW_120cm    319
VW_150cm    217
T_30cm      474
T_60cm      267
T_90cm      367
T_120cm     319
T_150cm     217
dtype: int64

In [6]:
df_final_trimmed.isnull().sum().sum()

3301

In [7]:
num_rows_left = df_final_trimmed.shape[0]
print(f"Number of rows left after cleaning: {num_rows_left}")

Number of rows left after cleaning: 2586


In [8]:
df_final_trimmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2586 entries, 0 to 2585
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Location  2586 non-null   object 
 1   Date      2586 non-null   object 
 2   VW_30cm   2099 non-null   float64
 3   VW_60cm   2319 non-null   float64
 4   VW_90cm   2219 non-null   float64
 5   VW_120cm  2267 non-null   float64
 6   VW_150cm  2369 non-null   float64
 7   T_30cm    2112 non-null   float64
 8   T_60cm    2319 non-null   float64
 9   T_90cm    2219 non-null   float64
 10  T_120cm   2267 non-null   float64
 11  T_150cm   2369 non-null   float64
dtypes: float64(10), object(2)
memory usage: 242.6+ KB


In [9]:
data = pd.read_csv('CAF357_trimmed_test.txt', sep='\t')

data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data = data.drop(columns=['Date'])

label_encoder = LabelEncoder()
data['Location'] = label_encoder.fit_transform(data['Location'])

columns_to_fill = data.columns.difference(['VW_30cm'])

for column in data.columns:
    if data[column].isna().any():
        columns_to_fill_temp = data.columns.difference([column])
        data[columns_to_fill_temp] = data[columns_to_fill_temp].fillna(data[columns_to_fill_temp].median())
        
        data_missing = data[data[column].isna()]
        data_not_missing = data[~data[column].isna()]
        
        features = data.columns.difference([column])
        X = data_not_missing[features]
        y = data_not_missing[column]
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        
        model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, max_depth=5)
        model.fit(X_train, y_train)
        
        X_missing = data_missing[features]
        predicted_values = model.predict(X_missing)
        data.loc[data[column].isna(), column] = predicted_values

data.to_csv('CAF357_cleaned_xgb_complete.csv', index=False)

In [10]:
df=pd.read_csv('CAF357_cleaned_xgb_complete.csv')
df.head()

Unnamed: 0,Location,VW_30cm,VW_60cm,VW_90cm,VW_120cm,VW_150cm,T_30cm,T_60cm,T_90cm,T_120cm,T_150cm,Day,Month,Year
0,0,0.299,0.321,0.359,0.331,0.314,11.87,10.1,8.7,7.61,7.09,19,5,2009
1,0,0.301,0.325,0.363,0.335,0.32,11.22,10.09,8.75,7.61,7.0,20,5,2009
2,0,0.25392,0.282,0.362,0.277,0.307,9.3255,7.75,9.62,8.9,8.8,21,5,2009
3,0,0.27745,0.328,0.368,0.342,0.327,12.1,10.11,9.06,7.94,7.22,22,5,2009
4,0,0.271642,0.329,0.369,0.343,0.327,12.25,10.4,9.16,8.03,7.32,23,5,2009


In [10]:
df.isnull().sum()

Location    0
VW_30cm     0
VW_60cm     0
VW_90cm     0
VW_120cm    0
VW_150cm    0
T_30cm      0
T_60cm      0
T_90cm      0
T_120cm     0
T_150cm     0
Day         0
Month       0
Year        0
dtype: int64

In [11]:
data = pd.read_csv('CAF357_cleaned_xgb_complete.csv')
data['Location'] = label_encoder.inverse_transform(data['Location'])

data['Date'] = pd.to_datetime(data[['Year', 'Month', 'Day']]).dt.strftime('%m/%d/%Y')

data = data.drop(columns=['Day', 'Month', 'Year'])

correct_column_order = ['Location', 'Date', 'VW_30cm', 'VW_60cm', 'VW_90cm', 'VW_120cm', 'VW_150cm', 
                        'T_30cm', 'T_60cm', 'T_90cm', 'T_120cm', 'T_150cm']

data = data[correct_column_order]

data.to_csv('CAF357_final_cleaned_xgb_complete.csv', index=False)

data.head()

Unnamed: 0,Location,Date,VW_30cm,VW_60cm,VW_90cm,VW_120cm,VW_150cm,T_30cm,T_60cm,T_90cm,T_120cm,T_150cm
0,CAF357,05/19/2009,0.299,0.321,0.359,0.331,0.314,11.87,10.1,8.7,7.61,7.09
1,CAF357,05/20/2009,0.301,0.325,0.363,0.335,0.32,11.22,10.09,8.75,7.61,7.0
2,CAF357,05/21/2009,0.25392,0.282,0.362,0.277,0.307,9.3255,7.75,9.62,8.9,8.8
3,CAF357,05/22/2009,0.27745,0.328,0.368,0.342,0.327,12.1,10.11,9.06,7.94,7.22
4,CAF357,05/23/2009,0.271642,0.329,0.369,0.343,0.327,12.25,10.4,9.16,8.03,7.32


# Temperature

Preprocessing v2.0 w/ MinMax Scaler 

In [4]:
data = pd.read_csv('CAF357_final_cleaned_xgb_complete.csv')

data = data.dropna()

features = data.drop(columns=['Date', 'Location', 'T_90cm'])
targets = data['T_90cm']

X_train, X_temp, y_train, y_temp = train_test_split(features, targets, test_size=0.5, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Testing set size: {len(X_test)}")

Train: (1293, 9), Validation: (646, 9), Test: (647, 9)
Training set size: 1293
Validation set size: 646
Testing set size: 647


KNN

In [6]:
# Required imports
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from scipy import stats

# Define and fit the model with GridSearchCV
knn = KNeighborsRegressor()

param_grid_knn = {
    'n_neighbors': [2, 3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  
}

grid_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='neg_mean_squared_error')
grid_knn.fit(X_train, y_train)

print(f"Best parameters for KNN: {grid_knn.best_params_}")

# Validation predictions
y_val_pred_knn = grid_knn.predict(X_val)

# Calculate metrics for validation set
mse_knn = mean_squared_error(y_val, y_val_pred_knn)
rmse_knn = np.sqrt(mse_knn)  # Root Mean Squared Error
mae_knn = mean_absolute_error(y_val, y_val_pred_knn)  # Mean Absolute Error
r2_knn = r2_score(y_val, y_val_pred_knn)
fsd_knn = rmse_knn / np.std(y_val)  # Fraction of Standard Deviation
similarity_knn = np.dot(y_val, y_val_pred_knn) / (np.sqrt(np.dot(y_val, y_val)) * np.sqrt(np.dot(y_val_pred_knn, y_val_pred_knn)))  # Cosine similarity

print(f"KNN - Validation Metrics:")
print(f"MSE: {mse_knn}")
print(f"RMSE: {rmse_knn}")
print(f"MAE: {mae_knn}")
print(f"R2: {r2_knn}")
print(f"FSD: {fsd_knn}")
print(f"Similarity: {similarity_knn}")

# Test predictions
y_test_pred_knn = grid_knn.predict(X_test)

# Calculate metrics for test set
mse_knn_test = mean_squared_error(y_test, y_test_pred_knn)
rmse_knn_test = np.sqrt(mse_knn_test)
mae_knn_test = mean_absolute_error(y_test, y_test_pred_knn)
r2_knn_test = r2_score(y_test, y_test_pred_knn)
fsd_knn_test = rmse_knn_test / np.std(y_test)
similarity_knn_test = np.dot(y_test, y_test_pred_knn) / (np.sqrt(np.dot(y_test, y_test)) * np.sqrt(np.dot(y_test_pred_knn, y_test_pred_knn)))

print(f"\nKNN - Test Metrics:")
print(f"MSE: {mse_knn_test}")
print(f"RMSE: {rmse_knn_test}")
print(f"MAE: {mae_knn_test}")
print(f"R2: {r2_knn_test}")
print(f"FSD: {fsd_knn_test}")
print(f"Similarity: {similarity_knn_test}")

Best parameters for KNN: {'n_neighbors': 2, 'p': 1, 'weights': 'distance'}
KNN - Validation Metrics:
MSE: 0.0248744978309027
RMSE: 0.15771651096477723
MAE: 0.08950204287366521
R2: 0.998531222485488
FSD: 0.03832463325998032
Similarity: 0.9998886742622205

KNN - Test Metrics:
MSE: 0.019284507492682255
RMSE: 0.13886866994640026
MAE: 0.07971438930244491
R2: 0.9989169734828409
FSD: 0.032909368227893684
Similarity: 0.9999093637244467


Linear Regression

In [5]:
# Required imports
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from scipy import stats

# Fit the model (assuming this part remains the same)
lr = LinearRegression()
lr.fit(X_train, y_train)

# Validation predictions
y_val_pred_lr = lr.predict(X_val)

# Calculate metrics for validation set
mse_lr = mean_squared_error(y_val, y_val_pred_lr)
rmse_lr = np.sqrt(mse_lr)  # Root Mean Squared Error
mae_lr = mean_absolute_error(y_val, y_val_pred_lr)  # Mean Absolute Error
r2_lr = r2_score(y_val, y_val_pred_lr)

# FSD (Fraction of Standard Deviation) - ratio of RMSE to standard deviation of actual values
fsd_lr = rmse_lr / np.std(y_val)

# Similarity (cosine similarity between actual and predicted values)
similarity_lr = np.dot(y_val, y_val_pred_lr) / (np.sqrt(np.dot(y_val, y_val)) * np.sqrt(np.dot(y_val_pred_lr, y_val_pred_lr)))

print(f"Linear Regression - Validation Metrics:")
print(f"MSE: {mse_lr}")
print(f"RMSE: {rmse_lr}")
print(f"MAE: {mae_lr}")
print(f"R2: {r2_lr}")
print(f"FSD: {fsd_lr}")
print(f"Similarity: {similarity_lr}")

# Test predictions
y_test_pred_lr = lr.predict(X_test)

# Calculate metrics for test set
mse_lr_test = mean_squared_error(y_test, y_test_pred_lr)
rmse_lr_test = np.sqrt(mse_lr_test)
mae_lr_test = mean_absolute_error(y_test, y_test_pred_lr)
r2_lr_test = r2_score(y_test, y_test_pred_lr)
fsd_lr_test = rmse_lr_test / np.std(y_test)
similarity_lr_test = np.dot(y_test, y_test_pred_lr) / (np.sqrt(np.dot(y_test, y_test)) * np.sqrt(np.dot(y_test_pred_lr, y_test_pred_lr)))

print(f"\nLinear Regression - Test Metrics:")
print(f"MSE: {mse_lr_test}")
print(f"RMSE: {rmse_lr_test}")
print(f"MAE: {mae_lr_test}")
print(f"R2: {r2_lr_test}")
print(f"FSD: {fsd_lr_test}")
print(f"Similarity: {similarity_lr_test}")

Linear Regression - Validation Metrics:
MSE: 1.0189625858169293
RMSE: 1.0094367666262851
MAE: 0.5950706992279273
R2: 0.9398327819781122
FSD: 0.24529006914648596
Similarity: 0.9954218211549597

Linear Regression - Test Metrics:
MSE: 0.8985179816001905
RMSE: 0.9479018839522318
MAE: 0.5999679448748754
R2: 0.9495388305567805
FSD: 0.22463563707306014
Similarity: 0.9957726685619407


SVM /w GridSearch

In [7]:
# Required imports
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from scipy import stats

# Define and fit the model with GridSearchCV
svr = SVR()

param_grid_svr = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto']
}

grid_svr = GridSearchCV(svr, param_grid_svr, cv=5, scoring='neg_mean_squared_error')
grid_svr.fit(X_train, y_train)

print(f"Best parameters for SVM: {grid_svr.best_params_}")

# Validation predictions
y_val_pred_svr = grid_svr.predict(X_val)

# Calculate metrics for validation set
mse_svr = mean_squared_error(y_val, y_val_pred_svr)
rmse_svr = np.sqrt(mse_svr)  # Root Mean Squared Error
mae_svr = mean_absolute_error(y_val, y_val_pred_svr)  # Mean Absolute Error
r2_svr = r2_score(y_val, y_val_pred_svr)
fsd_svr = rmse_svr / np.std(y_val)  # Fraction of Standard Deviation
similarity_svr = np.dot(y_val, y_val_pred_svr) / (np.sqrt(np.dot(y_val, y_val)) * np.sqrt(np.dot(y_val_pred_svr, y_val_pred_svr)))  # Cosine similarity

print(f"SVM - Validation Metrics:")
print(f"MSE: {mse_svr}")
print(f"RMSE: {rmse_svr}")
print(f"MAE: {mae_svr}")
print(f"R2: {r2_svr}")
print(f"FSD: {fsd_svr}")
print(f"Similarity: {similarity_svr}")

# Test predictions
y_test_pred_svr = grid_svr.predict(X_test)

# Calculate metrics for test set
mse_svr_test = mean_squared_error(y_test, y_test_pred_svr)
rmse_svr_test = np.sqrt(mse_svr_test)
mae_svr_test = mean_absolute_error(y_test, y_test_pred_svr)
r2_svr_test = r2_score(y_test, y_test_pred_svr)
fsd_svr_test = rmse_svr_test / np.std(y_test)
similarity_svr_test = np.dot(y_test, y_test_pred_svr) / (np.sqrt(np.dot(y_test, y_test)) * np.sqrt(np.dot(y_test_pred_svr, y_test_pred_svr)))

print(f"\nSVM - Test Metrics:")
print(f"MSE: {mse_svr_test}")
print(f"RMSE: {rmse_svr_test}")
print(f"MAE: {mae_svr_test}")
print(f"R2: {r2_svr_test}")
print(f"FSD: {fsd_svr_test}")
print(f"Similarity: {similarity_svr_test}")

Best parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVM - Validation Metrics:
MSE: 0.15092768861128875
RMSE: 0.3884941294425036
MAE: 0.16417650790402144
R2: 0.9910880936428744
FSD: 0.09440289379635342
Similarity: 0.999323430002695

SVM - Test Metrics:
MSE: 0.1679796614307954
RMSE: 0.4098532193734672
MAE: 0.1777571461960565
R2: 0.990566187508704
FSD: 0.09712781522970694
Similarity: 0.9992108727746722


RandomForest /w GridSearch

In [None]:
rf = RandomForestRegressor(random_state=42)

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error')
grid_rf.fit(X_train, y_train)

print(f"Best parameters for Random Forest: {grid_rf.best_params_}")

y_val_pred_rf = grid_rf.predict(X_val)

mse_rf = mean_squared_error(y_val, y_val_pred_rf)
r2_rf = r2_score(y_val, y_val_pred_rf)
print(f"Random Forest - Validation MSE: {mse_rf}, R2: {r2_rf}")

y_test_pred_rf = grid_rf.predict(X_test)

mse_rf_test = mean_squared_error(y_test, y_test_pred_rf)
r2_rf_test = r2_score(y_test, y_test_pred_rf)
print(f"Random Forest - Test MSE: {mse_rf_test}, R2: {r2_rf_test}")

Best parameters for Random Forest: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}
Random Forest - Validation MSE: 0.06070232561087914, R2: 0.9964156779549102
Random Forest - Test MSE: 0.0333161724878445, R2: 0.9981289489364313


XGBRegressor w/ GridSearch

In [8]:
# Required imports
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from scipy import stats

# Define and fit the model with GridSearchCV
xgb = XGBRegressor(random_state=42)

param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2]
}

grid_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
grid_xgb.fit(X_train, y_train)

print(f"Best parameters for XGBoost: {grid_xgb.best_params_}")

# Validation predictions
y_val_pred_xgb = grid_xgb.predict(X_val)

# Calculate metrics for validation set
mse_xgb = mean_squared_error(y_val, y_val_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)  # Root Mean Squared Error
mae_xgb = mean_absolute_error(y_val, y_val_pred_xgb)  # Mean Absolute Error
r2_xgb = r2_score(y_val, y_val_pred_xgb)
fsd_xgb = rmse_xgb / np.std(y_val)  # Fraction of Standard Deviation
similarity_xgb = np.dot(y_val, y_val_pred_xgb) / (np.sqrt(np.dot(y_val, y_val)) * np.sqrt(np.dot(y_val_pred_xgb, y_val_pred_xgb)))  # Cosine similarity

print(f"XGBoost - Validation Metrics:")
print(f"MSE: {mse_xgb}")
print(f"RMSE: {rmse_xgb}")
print(f"MAE: {mae_xgb}")
print(f"R2: {r2_xgb}")
print(f"FSD: {fsd_xgb}")
print(f"Similarity: {similarity_xgb}")

# Test predictions
y_test_pred_xgb = grid_xgb.predict(X_test)

# Calculate metrics for test set
mse_xgb_test = mean_squared_error(y_test, y_test_pred_xgb)
rmse_xgb_test = np.sqrt(mse_xgb_test)
mae_xgb_test = mean_absolute_error(y_test, y_test_pred_xgb)
r2_xgb_test = r2_score(y_test, y_test_pred_xgb)
fsd_xgb_test = rmse_xgb_test / np.std(y_test)
similarity_xgb_test = np.dot(y_test, y_test_pred_xgb) / (np.sqrt(np.dot(y_test, y_test)) * np.sqrt(np.dot(y_test_pred_xgb, y_test_pred_xgb)))

print(f"\nXGBoost - Test Metrics:")
print(f"MSE: {mse_xgb_test}")
print(f"RMSE: {rmse_xgb_test}")
print(f"MAE: {mae_xgb_test}")
print(f"R2: {r2_xgb_test}")
print(f"FSD: {fsd_xgb_test}")
print(f"Similarity: {similarity_xgb_test}")

Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200}
XGBoost - Validation Metrics:
MSE: 0.06689785105413473
RMSE: 0.25864618894183367
MAE: 0.12059782110279191
R2: 0.9960498475158998
FSD: 0.06285023853654176
Similarity: 0.9996999977390341

XGBoost - Test Metrics:
MSE: 0.0737005952284224
RMSE: 0.2714785354837881
MAE: 0.11735448414538709
R2: 0.9958609417952168
FSD: 0.06433551278091487
Similarity: 0.9996540585659551


LightGBM w/ GridSearch

In [9]:
# Required imports
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from scipy import stats

# Define and fit the model with GridSearchCV
lgbm = LGBMRegressor(random_state=42, verbosity=-1)

param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_samples': [10, 20, 30],
    'num_leaves': [31, 50, 100]
}

grid_lgbm = GridSearchCV(lgbm, param_grid_lgbm, cv=5, scoring='neg_mean_squared_error', verbose=0)
grid_lgbm.fit(X_train, y_train)

print(f"Best parameters for LightGBM: {grid_lgbm.best_params_}")

# Validation predictions
y_val_pred_lgbm = grid_lgbm.predict(X_val)

# Calculate metrics for validation set
mse_lgbm = mean_squared_error(y_val, y_val_pred_lgbm)
rmse_lgbm = np.sqrt(mse_lgbm)  # Root Mean Squared Error
mae_lgbm = mean_absolute_error(y_val, y_val_pred_lgbm)  # Mean Absolute Error
r2_lgbm = r2_score(y_val, y_val_pred_lgbm)
fsd_lgbm = rmse_lgbm / np.std(y_val)  # Fraction of Standard Deviation
similarity_lgbm = np.dot(y_val, y_val_pred_lgbm) / (np.sqrt(np.dot(y_val, y_val)) * np.sqrt(np.dot(y_val_pred_lgbm, y_val_pred_lgbm)))  # Cosine similarity

print(f"LightGBM - Validation Metrics:")
print(f"MSE: {mse_lgbm}")
print(f"RMSE: {rmse_lgbm}")
print(f"MAE: {mae_lgbm}")
print(f"R2: {r2_lgbm}")
print(f"FSD: {fsd_lgbm}")
print(f"Similarity: {similarity_lgbm}")

# Test predictions
y_test_pred_lgbm = grid_lgbm.predict(X_test)

# Calculate metrics for test set
mse_lgbm_test = mean_squared_error(y_test, y_test_pred_lgbm)
rmse_lgbm_test = np.sqrt(mse_lgbm_test)
mae_lgbm_test = mean_absolute_error(y_test, y_test_pred_lgbm)
r2_lgbm_test = r2_score(y_test, y_test_pred_lgbm)
fsd_lgbm_test = rmse_lgbm_test / np.std(y_test)
similarity_lgbm_test = np.dot(y_test, y_test_pred_lgbm) / (np.sqrt(np.dot(y_test, y_test)) * np.sqrt(np.dot(y_test_pred_lgbm, y_test_pred_lgbm)))

print(f"\nLightGBM - Test Metrics:")
print(f"MSE: {mse_lgbm_test}")
print(f"RMSE: {rmse_lgbm_test}")
print(f"MAE: {mae_lgbm_test}")
print(f"R2: {r2_lgbm_test}")
print(f"FSD: {fsd_lgbm_test}")
print(f"Similarity: {similarity_lgbm_test}")

Best parameters for LightGBM: {'learning_rate': 0.1, 'max_depth': 9, 'min_child_samples': 20, 'n_estimators': 300, 'num_leaves': 31}
LightGBM - Validation Metrics:
MSE: 0.058260417917071504
RMSE: 0.24137194931696496
MAE: 0.1225680874896391
R2: 0.9965598665587388
FSD: 0.05865265076073835
Similarity: 0.9997388007043301

LightGBM - Test Metrics:
MSE: 0.04755354632493866
RMSE: 0.21806775627070285
MAE: 0.11865524474482415
R2: 0.9973293716899742
FSD: 0.051678122160405696
Similarity: 0.9997762551256572


# Water Volume

Preprocessing 

In [2]:
data = pd.read_csv('CAF357_final_cleaned_xgb_complete.csv')

data = data.dropna()

features = data.drop(columns=['Date', 'Location', 'VW_90cm'])
targets = data['VW_90cm']

X_train, X_temp, y_train, y_temp = train_test_split(features, targets, test_size=0.5, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

print(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Testing set size: {len(X_test)}")

Train: (1293, 9), Validation: (646, 9), Test: (647, 9)
Training set size: 1293
Validation set size: 646
Testing set size: 647


Linear Regression

In [17]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_val_pred_lr = lr.predict(X_val)

mse_lr = mean_squared_error(y_val, y_val_pred_lr)
r2_lr = r2_score(y_val, y_val_pred_lr)
print(f"Linear Regression - Validation MSE: {mse_lr}, R2: {r2_lr}")

y_test_pred_lr = lr.predict(X_test)

mse_lr_test = mean_squared_error(y_test, y_test_pred_lr)
r2_lr_test = r2_score(y_test, y_test_pred_lr)
print(f"Linear Regression - Test MSE: {mse_lr_test}, R2: {r2_lr_test}")

Linear Regression - Validation MSE: 0.0010955143156734088, R2: 0.7954319190204371
Linear Regression - Test MSE: 0.0010850961059727065, R2: 0.7982046601561356


KNN

In [44]:
knn = KNeighborsRegressor()

param_grid_knn = {
    'n_neighbors': [1, 2, 3, 5, 7, 9],
    'weights': [ 'distance'],
    'p': [1, 2]  
}

grid_knn = GridSearchCV(knn, param_grid_knn, cv=5, scoring='neg_mean_squared_error')
grid_knn.fit(X_train, y_train)

print(f"Best parameters for KNN: {grid_knn.best_params_}")

y_val_pred_knn = grid_knn.predict(X_val)

mse_knn = mean_squared_error(y_val, y_val_pred_knn)
r2_knn = r2_score(y_val, y_val_pred_knn)
print(f"KNN - Validation MSE: {mse_knn}, R2: {r2_knn}")

y_test_pred_knn = grid_knn.predict(X_test)

mse_knn_test = mean_squared_error(y_test, y_test_pred_knn)
r2_knn_test = r2_score(y_test, y_test_pred_knn)
print(f"KNN - Test MSE: {mse_knn_test}, R2: {r2_knn_test}")

Best parameters for KNN: {'n_neighbors': 1, 'p': 1, 'weights': 'distance'}
KNN - Validation MSE: 3.618266253869968e-05, R2: 0.9932435224858536
KNN - Test MSE: 7.056414219474498e-05, R2: 0.9868771853694797


SVR w/ GridSearch

In [18]:
svr = SVR()
model = make_pipeline(
    PolynomialFeatures(degree=2),
    SVR()
)

param_grid_svr = {
    'kernel': ['rbf', 'poly'],
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 'scale', 'auto'],
    'degree': [2, 3, 4]  # For 'poly' kernel
}

# Perform grid search
grid_svr = GridSearchCV(svr, param_grid_svr, cv=5, scoring='neg_mean_squared_error')
grid_svr.fit(X_train, y_train)

# Print best parameters
print(f"Best parameters for SVM: {grid_svr.best_params_}")

# Evaluate on validation set
y_val_pred_svr = grid_svr.predict(X_val)
mse_svr = mean_squared_error(y_val, y_val_pred_svr)
r2_svr = r2_score(y_val, y_val_pred_svr)
print(f"SVM - Validation MSE: {mse_svr}, R2: {r2_svr}")

# Evaluate on test set
y_test_pred_svr = grid_svr.predict(X_test)
mse_svr_test = mean_squared_error(y_test, y_test_pred_svr)
r2_svr_test = r2_score(y_test, y_test_pred_svr)
print(f"SVM - Test MSE: {mse_svr_test}, R2: {r2_svr_test}")

Best parameters for SVM: {'C': 0.01, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf'}
SVM - Validation MSE: 0.005903801083591329, R2: -0.10243128809590685
SVM - Test MSE: 0.005862129443585779, R2: -0.09018030455163273


Random Forest w/ GridSearch

In [3]:
rf = RandomForestRegressor(random_state=42)

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error')
grid_rf.fit(X_train, y_train)

print(f"Best parameters for Random Forest: {grid_rf.best_params_}")

# Validation predictions
y_val_pred_rf = grid_rf.predict(X_val)

# Calculate metrics for validation set
mse_rf = mean_squared_error(y_val, y_val_pred_rf)
rmse_rf = np.sqrt(mse_rf)  # Root Mean Squared Error
mae_rf = mean_absolute_error(y_val, y_val_pred_rf)  # Mean Absolute Error
r2_rf = r2_score(y_val, y_val_pred_rf)
fsd_rf = rmse_rf / np.std(y_val)  # Fraction of Standard Deviation
similarity_rf = np.dot(y_val, y_val_pred_rf) / (np.sqrt(np.dot(y_val, y_val)) * np.sqrt(np.dot(y_val_pred_rf, y_val_pred_rf)))  # Cosine similarity

print(f"Random Forest - Validation Metrics:")
print(f"MSE: {mse_rf}")
print(f"RMSE: {rmse_rf}")
print(f"MAE: {mae_rf}")
print(f"R2: {r2_rf}")
print(f"FSD: {fsd_rf}")
print(f"Similarity: {similarity_rf}")

# Test predictions
y_test_pred_rf = grid_rf.predict(X_test)

# Calculate metrics for test set
mse_rf_test = mean_squared_error(y_test, y_test_pred_rf)
rmse_rf_test = np.sqrt(mse_rf_test)
mae_rf_test = mean_absolute_error(y_test, y_test_pred_rf)
r2_rf_test = r2_score(y_test, y_test_pred_rf)
fsd_rf_test = rmse_rf_test / np.std(y_test)
similarity_rf_test = np.dot(y_test, y_test_pred_rf) / (np.sqrt(np.dot(y_test, y_test)) * np.sqrt(np.dot(y_test_pred_rf, y_test_pred_rf)))

print(f"\nRandom Forest - Test Metrics:")
print(f"MSE: {mse_rf_test}")
print(f"RMSE: {rmse_rf_test}")
print(f"MAE: {mae_rf_test}")
print(f"R2: {r2_rf_test}")
print(f"FSD: {fsd_rf_test}")
print(f"Similarity: {similarity_rf_test}")

Best parameters for Random Forest: {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Random Forest - Validation Metrics:
MSE: 5.5656463576593234e-05
RMSE: 0.007460325969861722
MAE: 0.0026274604818573044
R2: 0.9896071317507394
FSD: 0.10194541799051388
Similarity: 0.9997395204629812

Random Forest - Test Metrics:
MSE: 5.6394427790783885e-05
RMSE: 0.007509622346748463
MAE: 0.002419297829790419
R2: 0.9895123273793891
FSD: 0.10240933854200476
Similarity: 0.9997312420119646


XGB Regressor w/ GridSearch

In [29]:
xgb = XGBRegressor(random_state=42)

param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2, 0.5, 0.8]
}

grid_xgb = GridSearchCV(xgb, param_grid_xgb, cv=5, scoring='neg_mean_squared_error')
grid_xgb.fit(X_train, y_train)

print(f"Best parameters for XGBoost: {grid_xgb.best_params_}")

y_val_pred_xgb = grid_xgb.predict(X_val)

mse_xgb = mean_squared_error(y_val, y_val_pred_xgb)
r2_xgb = r2_score(y_val, y_val_pred_xgb)
print(f"XGBoost - Validation MSE: {mse_xgb}, R2: {r2_xgb}")

y_test_pred_xgb = grid_xgb.predict(X_test)

mse_xgb_test = mean_squared_error(y_test, y_test_pred_xgb)
r2_xgb_test = r2_score(y_test, y_test_pred_xgb)
print(f"XGBoost - Test MSE: {mse_xgb_test}, R2: {r2_xgb_test}")

Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 100}
XGBoost - Validation MSE: 0.0001432306256974724, R2: 0.9732541931974452
XGBoost - Test MSE: 8.529929374520264e-05, R2: 0.9841368890045692


LightBGM w/ GridSearch

In [51]:
lgbm = LGBMRegressor(random_state=42, verbosity=-1)


param_grid_lgbm = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_samples': [10, 20, 30],
    'num_leaves': [31, 50, 100]
}

grid_lgbm = GridSearchCV(lgbm, param_grid_lgbm, cv=5, scoring='neg_mean_squared_error', verbose = 0)
grid_lgbm.fit(X_train, y_train)

print(f"Best parameters for LightGBM: {grid_lgbm.best_params_}")

y_val_pred_lgbm = grid_lgbm.predict(X_val)

mse_lgbm = mean_squared_error(y_val, y_val_pred_lgbm)
r2_lgbm = r2_score(y_val, y_val_pred_lgbm)
print(f"LightGBM - Validation MSE: {mse_lgbm}, R2: {r2_lgbm}")

y_test_pred_lgbm = grid_lgbm.predict(X_test)

mse_lgbm_test = mean_squared_error(y_test, y_test_pred_lgbm)
r2_lgbm_test = r2_score(y_test, y_test_pred_lgbm)
print(f"LightGBM - Test MSE: {mse_lgbm_test}, R2: {r2_lgbm_test}")

Best parameters for LightGBM: {'learning_rate': 0.1, 'max_depth': 9, 'min_child_samples': 20, 'n_estimators': 300, 'num_leaves': 31}
LightGBM - Validation MSE: 0.00010641384844701092, R2: 0.9801290805103959
LightGBM - Test MSE: 9.150246362440958e-05, R2: 0.982983285404857
