In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.1-py3-none-any.whl.metadata (7.2 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.1-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.8/231.8 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.1 colorlog-6.9.0 optuna-4.2.1


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import optuna

In [None]:
# Data Loading/Preprocessing

# Every target, observation and calibration-patch column must be present for a row to be usable
required_columns = [
    f"{group} {channel}"
    for group in ("True", "Observed", "Red", "Green", "Blue")
    for channel in ("R", "G", "B")
]
# Load the raw measurements and drop rows with a missing value in any required column
color_data = pd.read_csv("color_calibration_formatted_data.csv").dropna(subset=required_columns)

# Each sample carries a Red, Green and Blue calibration patch, each measured as R, G, B.
# A patch is only valid when its own channel is strictly the largest of its three values
# (e.g. Red R must exceed both Red G and Red B); samples failing any patch are discarded.
red_condition = color_data[["Red G", "Red B"]].lt(color_data["Red R"], axis=0).all(axis=1)
green_condition = color_data[["Green R", "Green B"]].lt(color_data["Green G"], axis=0).all(axis=1)
blue_condition = color_data[["Blue R", "Blue G"]].lt(color_data["Blue B"], axis=0).all(axis=1)
color_data = color_data.loc[red_condition & green_condition & blue_condition]
color_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2844 entries, 0 to 3207
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Sample Name    52 non-null     object 
 1   Sample Number  2786 non-null   object 
 2   File Name      2760 non-null   object 
 3   True R         2844 non-null   int64  
 4   True G         2844 non-null   int64  
 5   True B         2844 non-null   int64  
 6   Observed R     2844 non-null   float64
 7   Observed G     2844 non-null   float64
 8   Observed B     2844 non-null   float64
 9   Red R          2844 non-null   float64
 10  Red G          2844 non-null   float64
 11  Red B          2844 non-null   float64
 12  Green R        2844 non-null   float64
 13  Green G        2844 non-null   float64
 14  Green B        2844 non-null   float64
 15  Blue R         2844 non-null   float64
 16  Blue G         2844 non-null   float64
 17  Blue B         2844 non-null   float64
dtypes: float64(12

In [None]:
# Split dataset into training/testing sets

# Input values are Observed R,G,B, Red R,G,B, Green R,G,B, Blue R,G,B (12 values).
# Columns are selected by name instead of by position so that an added or
# reordered CSV column cannot silently change which values feed the models.
feature_columns = ["Observed R", "Observed G", "Observed B",
                   "Red R", "Red G", "Red B",
                   "Green R", "Green G", "Green B",
                   "Blue R", "Blue G", "Blue B"]
# Output values are True R,G,B
target_columns = ["True R", "True G", "True B"]
X = color_data[feature_columns].values
y = color_data[target_columns].values
# Get list of unique colors (each color is a list of [R,G,B]) which will be used for classification of predictions
color_list = np.unique(y, axis=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train a variety of Regression models

# The tree-based models are randomised; a fixed random_state makes the metrics
# reported further down repeatable across kernel restarts.
model_list = [LinearRegression(),
              Ridge(alpha=0.5),
              DecisionTreeRegressor(max_depth=50, random_state=42),
              RandomForestRegressor(n_estimators=100, random_state=42)]
# "Neural Network" has no entry in model_list: its predictions are added to
# prediction_list by the neural-network cell that follows.
model_names = ["Linear Regression", "Ridge Regression", "Decision Tree", "Random Forest", "Neural Network"]
# Store the predictions of each model (on X_test) in prediction_list
prediction_list = []
for model in model_list:
  model.fit(X_train, y_train)
  prediction_list.append(model.predict(X_test))

In [None]:
# Train a vanilla Neural Network with structure 12->128->64->32->3

# Standardise the inputs using statistics learned from the training split only,
# then apply the same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert data to float32 PyTorch tensors (targets are left unscaled)
X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor = (
    torch.tensor(array, dtype=torch.float32)
    for array in (X_train_scaled, y_train, X_test_scaled, y_test)
)

# Define the neural network
class NeuralNetwork(nn.Module):
    """Fully connected regression network with layout 12 -> 128 -> 64 -> 32 -> 3.

    ReLU follows each of the three hidden layers; the 3-unit output layer is
    left linear because the network is used for regression.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(12, 128)  # input (12)   -> hidden (128)
        self.fc2 = nn.Linear(128, 64)  # hidden (128) -> hidden (64)
        self.fc3 = nn.Linear(64, 32)   # hidden (64)  -> hidden (32)
        self.fc4 = nn.Linear(32, 3)    # hidden (32)  -> output (3)
        self.relu = nn.ReLU()          # shared activation for the hidden layers

    def forward(self, x):
        """Return the 3-value prediction for each row of ``x``."""
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            x = self.relu(hidden_layer(x))
        # No activation on the output layer (regression)
        return self.fc4(x)

# Initialize the model
# Seed PyTorch first: training below is full-batch, so the random weight
# initialisation is the only source of randomness and a fixed seed makes the
# run repeatable.
torch.manual_seed(42)
model = NeuralNetwork()

# Define loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error for regression
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model (one gradient step per epoch on the whole training set)
epochs = 1000
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:  # Print loss every 10 epochs
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# Make predictions on X_test
with torch.no_grad():
    y_pred_tensor = model(X_test_tensor)

# Store the predictions right after those of the scikit-learn models.
# Slice assignment (instead of append) keeps this cell idempotent: re-running it
# replaces the previous neural-network entry rather than adding a duplicate.
prediction_list[len(model_list):] = [y_pred_tensor.numpy()]

Epoch [10/1000], Loss: 32662.2266
Epoch [20/1000], Loss: 12129.2441
Epoch [30/1000], Loss: 5009.4712
Epoch [40/1000], Loss: 2935.2844
Epoch [50/1000], Loss: 1857.7839
Epoch [60/1000], Loss: 1406.2212
Epoch [70/1000], Loss: 1086.7097
Epoch [80/1000], Loss: 876.3218
Epoch [90/1000], Loss: 754.1486
Epoch [100/1000], Loss: 678.0168
Epoch [110/1000], Loss: 626.1211
Epoch [120/1000], Loss: 589.6579
Epoch [130/1000], Loss: 562.6954
Epoch [140/1000], Loss: 542.7375
Epoch [150/1000], Loss: 527.7520
Epoch [160/1000], Loss: 516.5370
Epoch [170/1000], Loss: 507.7448
Epoch [180/1000], Loss: 500.6455
Epoch [190/1000], Loss: 494.6831
Epoch [200/1000], Loss: 489.4276
Epoch [210/1000], Loss: 484.7586
Epoch [220/1000], Loss: 480.4951
Epoch [230/1000], Loss: 476.5366
Epoch [240/1000], Loss: 472.6610
Epoch [250/1000], Loss: 468.8661
Epoch [260/1000], Loss: 465.1012
Epoch [270/1000], Loss: 461.3703
Epoch [280/1000], Loss: 457.6392
Epoch [290/1000], Loss: 453.9196
Epoch [300/1000], Loss: 450.1752
Epoch [310

In [None]:
# Calculate Model Accuracy (number of correctly classified samples in X_test/len(X_test))
def calculate_accuracy(prediction_list, y_true=None, colors=None):
  """Return the classification accuracy of each model's predictions.

  Every predicted [R, G, B] vector is snapped to the nearest reference color
  (Euclidean distance); a sample counts as correct when the snapped color
  equals the true color exactly. When several reference colors are equally
  close, the one that appears first in ``colors`` is chosen.

  Args:
    prediction_list: iterable of arrays, one per model, each with one
      predicted [R, G, B] row per sample.
    y_true: true colors, one row per sample. Defaults to the notebook-level
      ``y_test``.
    colors: reference colors to snap to, one row per color. Defaults to the
      notebook-level ``color_list``.

  Returns:
    List with one accuracy (float in [0, 1]) per entry of ``prediction_list``.

  Raises:
    ValueError: if a prediction array does not have one row per true sample.
  """
  if y_true is None:
    y_true = y_test
  if colors is None:
    colors = color_list
  y_true = np.asarray(y_true)
  colors = np.asarray(colors)

  # Store the accuracy of each model in accuracy_list
  accuracy_list = []
  for predictions in prediction_list:
    predictions = np.asarray(predictions, dtype=float)
    if len(predictions) != len(y_true):
      raise ValueError(
          f"Expected {len(y_true)} predictions but got {len(predictions)}")

    # Broadcast (n_samples, 1, 3) against (1, n_colors, 3) to get every
    # prediction-to-color distance in one call: shape (n_samples, n_colors)
    distances = np.linalg.norm(predictions[:, None, :] - colors[None, :, :], axis=2)
    # argmin returns the first minimum, i.e. ties go to the earliest color
    corrected_preds = colors[np.argmin(distances, axis=1)]

    # A sample is correct only when all channels match the true color
    correct = int(np.all(corrected_preds == y_true, axis=1).sum())
    accuracy_list.append(correct / len(y_true))

  return accuracy_list

# Calculate Mean Squared Error (MSE) and Root Mean Squared Error (RMSE)
def calculate_mse(prediction_list):
  """Return ``[MSE, RMSE]`` for each model's predictions against ``y_test``.

  Args:
    prediction_list: iterable of prediction arrays, one per model.

  Returns:
    List with one ``[mse, rmse]`` pair per entry of ``prediction_list``,
    where ``rmse`` is the square root of ``mse``.
  """
  mse_per_model = (mean_squared_error(y_test, model_predictions)
                   for model_predictions in prediction_list)
  return [[mse, np.sqrt(mse)] for mse in mse_per_model]

In [None]:
# Score every model's test-set predictions
accuracy_list = calculate_accuracy(prediction_list)
mse_list = calculate_mse(prediction_list)

# Print out the accuracy and MSE for each model
for i, name in enumerate(model_names):
  mse, rmse = mse_list[i]
  print("Model:", name)
  print("Classification Accuracy:", accuracy_list[i])
  print("MSE:", mse)
  print("RMSE:", rmse)
  print()

Model: Linear Regression
Classification Accuracy: 0.16520210896309315
MSE: 329.9956512381501
RMSE: 18.165782428460112

Model: Ridge Regression
Classification Accuracy: 0.16520210896309315
MSE: 329.99569366486634
RMSE: 18.1657835962247

Model: Decision Tree
Classification Accuracy: 0.45694200351493847
MSE: 360.46221441124777
RMSE: 18.985842473044166

Model: Random Forest
Classification Accuracy: 0.43936731107205623
MSE: 213.23811476274173
RMSE: 14.602674918066954

Model: Neural Network
Classification Accuracy: 0.20035149384885764
MSE: 204.38182067871094
RMSE: 14.296217005862458



In [None]:
# Grid Search Hyperparameter Optimization on Random Forest Regressor

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [15, 30, 60],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
# random_state fixes the forest's internal randomness so that the selected
# parameters and the metrics below are repeatable from run to run.
# n_jobs=-1 evaluates the 81 candidates x 3 folds on all available CPU cores.
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=3,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
# best_estimator_ is the winning configuration refit on the whole training set
best_model = grid_search.best_estimator_
print("Best parameters for Random Forest Regressor:", best_params)

# Score the tuned model on the held-out test set
predictions = best_model.predict(X_test)
accuracy_list = calculate_accuracy([predictions])
mse_list = calculate_mse([predictions])

print("Accuracy of Best Random Forest Regressor:", accuracy_list[0])
print("MSE of Best Random Forest Regressor:", mse_list[0][0])
print("RMSE of Best Random Forest Regressor:", mse_list[0][1])

Best parameters for Random Forest Regressor: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Accuracy of Best Random Forest Regressor: 0.4270650263620387
MSE of Best Random Forest Regressor: 215.09604444522563
RMSE of Best Random Forest Regressor: 14.666153021335406


In [None]:
# Grid Search Hyperparameter Optimization on Decision Tree

param_grid = {
    'criterion': ['absolute_error', 'squared_error', 'poisson', 'friedman_mse'],
    'max_depth': [15, 30, 60, 100],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
# random_state fixes the tree's internal randomness so that the selected
# parameters and the metrics below are repeatable from run to run.
# n_jobs=-1 evaluates the 144 candidates x 3 folds on all available CPU cores.
grid_search = GridSearchCV(estimator=DecisionTreeRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=3,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
# best_estimator_ is the winning configuration refit on the whole training set
best_model = grid_search.best_estimator_
print("Best parameters for Decision Tree Regressor:", best_params)

# Score the tuned model on the held-out test set
predictions = best_model.predict(X_test)
accuracy_list = calculate_accuracy([predictions])
mse_list = calculate_mse([predictions])

print("Accuracy of Best Decision Tree Regressor:", accuracy_list[0])
print("MSE of Best Decision Tree Regressor:", mse_list[0][0])
print("RMSE of Best Decision Tree Regressor:", mse_list[0][1])

Best parameters for Decision Tree Regressor: {'criterion': 'poisson', 'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 5}
Accuracy of Best Decision Tree Regressor: 0.29876977152899825
MSE of Best Decision Tree Regressor: 344.03152742075844
RMSE of Best Decision Tree Regressor: 18.54808689382165


In [None]:
# Optuna Hyperparameter Optimization on Neural Network (sampled trials, not an exhaustive grid search)

# Define the neural network with tunable hyperparameters
class NeuralNetwork(nn.Module):
    """Two-hidden-layer regression network with configurable layer widths.

    Layout: input_size -> hidden_size1 -> hidden_size2 -> output_size, with
    ReLU after each hidden layer and a linear output layer.

    Args:
        input_size: number of input features.
        hidden_size1: width of the first hidden layer.
        hidden_size2: width of the second hidden layer.
        output_size: number of regression outputs.
    """

    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the network output for each row of ``x``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = self.relu(hidden_layer(x))
        # No activation in the output layer (for regression)
        return self.fc3(x)


# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: train one candidate network and return its validation MSE.

    The candidate is scored on a validation subset carved out of the training
    data rather than on the test set, so the test set stays untouched during
    hyperparameter selection and the final test metrics are not biased by it.

    Args:
        trial: the Optuna trial used to sample ``hidden_size1``,
            ``hidden_size2``, ``lr`` and ``batch_size``.

    Returns:
        Mean squared error of the trained candidate on the validation subset
        (lower is better).
    """
    # Hyperparameters to tune
    hidden_size1 = trial.suggest_int("hidden_size1", 32, 128, step=16)
    hidden_size2 = trial.suggest_int("hidden_size2", 16, 64, step=16)
    # suggest_loguniform is deprecated; suggest_float(..., log=True) samples
    # the same range on a log scale.
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])

    # Hold out 20% of the training rows for validation. The split uses its own
    # fixed-seed generator so every trial is scored on the same rows.
    # NOTE(review): X_train_tensor appears to be standardised with statistics
    # from the full training split, validation rows included; confirm this is acceptable.
    split_generator = torch.Generator().manual_seed(42)
    shuffled_indices = torch.randperm(len(X_train_tensor), generator=split_generator)
    n_val = len(X_train_tensor) // 5
    val_indices, fit_indices = shuffled_indices[:n_val], shuffled_indices[n_val:]
    X_val, y_val = X_train_tensor[val_indices], y_train_tensor[val_indices]

    # Create DataLoader for mini-batch training on the remaining rows
    train_dataset = TensorDataset(X_train_tensor[fit_indices], y_train_tensor[fit_indices])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Initialize model, loss function, and optimizer
    model = NeuralNetwork(input_size=12, hidden_size1=hidden_size1, hidden_size2=hidden_size2, output_size=3)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Train the model
    epochs = 1000
    for epoch in range(epochs):
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate the model on the validation subset (never on the test set)
    with torch.no_grad():
        y_pred = model(X_val)
        val_loss = criterion(y_pred, y_val).item()

    return val_loss  # Minimize the loss


# Run hyperparameter optimization
# Seed both PyTorch (weight initialisation, mini-batch shuffling) and the Optuna
# sampler so the search does not vary with unseeded randomness between runs.
torch.manual_seed(42)
study = optuna.create_study(direction="minimize",
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=20)

# Get the best parameters
best_params = study.best_params
print("\nBest Hyperparameters:", best_params)

# Train final model with best parameters
# Re-seed so the final fit can be repeated without re-running the study
torch.manual_seed(42)
best_model = NeuralNetwork(
    input_size=12,
    hidden_size1=best_params["hidden_size1"],
    hidden_size2=best_params["hidden_size2"],
    output_size=3
)
best_optimizer = optim.Adam(best_model.parameters(), lr=best_params["lr"])
# The loss module is created once instead of once per mini-batch
best_criterion = nn.MSELoss()
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=best_params["batch_size"], shuffle=True)

# Train the best model on the full training set
for epoch in range(1000):
    for batch_X, batch_y in train_loader:
        best_optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = best_criterion(outputs, batch_y)
        loss.backward()
        best_optimizer.step()

# Predict using the best model
with torch.no_grad():
    y_pred_tensor = best_model(X_test_tensor)
    predictions = y_pred_tensor.numpy()

accuracy_list = calculate_accuracy([predictions])
mse_list = calculate_mse([predictions])

print("Accuracy of Best Neural Network:", accuracy_list[0])
print("MSE of Best Neural Network:", mse_list[0][0])
print("RMSE of Best Neural Network:", mse_list[0][1])

[I 2025-03-24 04:20:40,911] A new study created in memory with name: no-name-553cf5d6-e1de-4c78-837c-405aa4e95d4b
  lr = trial.suggest_loguniform("lr", 1e-4, 1e-2)
[I 2025-03-24 04:22:47,563] Trial 0 finished with value: 213.05992126464844 and parameters: {'hidden_size1': 32, 'hidden_size2': 48, 'lr': 0.0002047694263188294, 'batch_size': 32}. Best is trial 0 with value: 213.05992126464844.
[I 2025-03-24 04:26:28,669] Trial 1 finished with value: 165.5153350830078 and parameters: {'hidden_size1': 48, 'hidden_size2': 32, 'lr': 0.0006987412679126773, 'batch_size': 16}. Best is trial 1 with value: 165.5153350830078.
[I 2025-03-24 04:30:11,691] Trial 2 finished with value: 137.11451721191406 and parameters: {'hidden_size1': 48, 'hidden_size2': 32, 'lr': 0.006494350317648316, 'batch_size': 16}. Best is trial 2 with value: 137.11451721191406.
[I 2025-03-24 04:31:27,582] Trial 3 finished with value: 251.0258331298828 and parameters: {'hidden_size1': 48, 'hidden_size2': 32, 'lr': 0.000180989227


Best Hyperparameters: {'hidden_size1': 80, 'hidden_size2': 16, 'lr': 0.009680958909769614, 'batch_size': 16}
Accuracy of Best Neural Network: 0.36203866432337434
MSE of Best Neural Network: 175.67047119140625
RMSE of Best Neural Network: 13.25407375833582
