## Dependencies

In [1]:
# Pytorch Dependencies
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

# Pandas
import pandas as pd

# RF from sklearn
from sklearn.ensemble import RandomForestRegressor

# Pick the compute device: the GPU when CUDA is available, otherwise the CPU
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

## Neural Network with 1 hidden layer

In [2]:
# Simple MLP (Multi-Layer Perceptron)
class ScalingFactorPredictor(nn.Module):
  """Feed-forward network with one hidden layer.

  The forward pass computes ``fc2(relu(fc1(x)))``: a linear layer, a ReLU
  non-linearity, and a linear output layer.

  Args:
    input_size: Number of features in each input sample.
    hidden_size: Number of units in the hidden layer.
    output_size: Number of values produced for each sample.
  """

  def __init__(self, input_size, hidden_size, output_size):
    super().__init__()
    # Input -> hidden projection
    self.fc1 = nn.Linear(input_size, hidden_size)
    # Non-linearity between the two linear layers (other activations can be swapped in)
    self.relu = nn.ReLU()
    # Hidden -> output projection
    self.fc2 = nn.Linear(hidden_size, output_size)

  def forward(self, x):
    """Apply the hidden layer, the activation, and the output layer to ``x``."""
    hidden = self.relu(self.fc1(x))
    return self.fc2(hidden)

## Preparing the Data

In [3]:
class ScalingFactorDataset(Dataset):
  """Dataset of (input, target) pairs read from two columns of a CSV file.

  The CSV is loaded with ``pd.read_csv`` and kept as ``self.data``. The two
  selected columns are converted to 1-D ``float32`` tensors ``self.x`` and
  ``self.y``.

  Args:
    csvfile: Path or file-like object accepted by ``pd.read_csv``.
    feature_col: Name of the column used as the model input. Defaults to ``"p"``.
    target_col: Name of the column used as the regression target.
      Defaults to ``"mean best k"``.

  Raises:
    KeyError: If ``feature_col`` or ``target_col`` is not a column of the CSV.
  """

  def __init__(self, csvfile, feature_col="p", target_col="mean best k"):

    self.data = pd.read_csv(csvfile)

    # Fail early with a message that names the missing column(s) and what is available
    missing = [col for col in (feature_col, target_col) if col not in self.data.columns]
    if missing:
      raise KeyError(
        f"Column(s) {missing} not found in CSV; available columns: {list(self.data.columns)}"
      )

    self.x = torch.tensor(self.data[feature_col].to_numpy(), dtype=torch.float32)
    self.y = torch.tensor(self.data[target_col].to_numpy(), dtype=torch.float32)

  def __len__(self):
    """Return the number of rows (samples) loaded from the CSV."""
    return len(self.x)

  def __getitem__(self, idx):
    """Return the ``(x, y)`` pair at ``idx``, each with a leading dimension added by ``unsqueeze(0)``."""
    return self.x[idx].unsqueeze(0), self.y[idx].unsqueeze(0)

## Random Forest

In [None]:
# Random forest regression model with 100 trees.
# random_state is fixed so that fitting on the same data yields the same forest on every run.
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)

## Training the Neural Network

In [4]:
def train_model(model, dataloader, epochs, criterion, optimizer, device):
  """Train ``model`` for ``epochs`` passes over ``dataloader``.

  The model is moved to ``device`` and put in training mode. For every batch
  the function runs a forward pass, computes ``criterion(pred, y_batch)``,
  back-propagates, and takes one optimizer step. After each epoch it prints
  the mean of the per-batch loss values.

  Args:
    model: Module to train; its parameters are updated in place.
    dataloader: Sized iterable yielding ``(x_batch, y_batch)`` tensor pairs.
    epochs: Number of passes over ``dataloader``.
    criterion: Callable returning a scalar loss tensor from ``(pred, y_batch)``.
    optimizer: Object whose ``zero_grad()`` and ``step()`` are called once per batch.
    device: Device the model and every batch are moved to.

  Raises:
    ValueError: If ``dataloader`` contains no batches.
  """
  # Validate up front: an empty loader would otherwise fail with a bare
  # ZeroDivisionError when the epoch average is computed.
  num_batches = len(dataloader)
  if num_batches == 0:
    raise ValueError("train_model received an empty dataloader; there is nothing to train on.")

  model.to(device)
  model.train()

  for epoch in range(epochs):
    total_loss = 0.0
    for x_batch, y_batch in dataloader:
      x_batch, y_batch = x_batch.to(device), y_batch.to(device)

      pred = model(x_batch)
      loss = criterion(pred, y_batch)

      # Clear gradients left over from the previous step before back-propagating
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      total_loss += loss.item()

    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

def evaluate_model(model, dataloader, criterion, device):
  """Print the mean per-batch loss of ``model`` over ``dataloader``.

  The model is moved to ``device`` and switched to evaluation mode (it is left
  in that mode afterwards). Batches are processed under ``torch.no_grad()``,
  so no gradients are recorded.

  Args:
    model: Module to evaluate.
    dataloader: Sized iterable yielding ``(x_batch, y_batch)`` tensor pairs.
    criterion: Callable returning a scalar loss tensor from ``(pred, y_batch)``.
    device: Device the model and every batch are moved to.

  Raises:
    ValueError: If ``dataloader`` contains no batches.
  """
  # Validate up front: an empty loader would otherwise fail with a bare
  # ZeroDivisionError when the average is computed.
  num_batches = len(dataloader)
  if num_batches == 0:
    raise ValueError("evaluate_model received an empty dataloader; there is nothing to evaluate.")

  model.to(device)
  model.eval()
  test_loss = 0.0

  with torch.no_grad():
    for x_batch, y_batch in dataloader:
      x_batch, y_batch = x_batch.to(device), y_batch.to(device)
      pred = model(x_batch)
      loss = criterion(pred, y_batch)
      test_loss += loss.item()

  avg_loss = test_loss / num_batches
  print(f"Test Loss: {avg_loss:.4f}")

In [7]:
# Fixed seed so that weight initialisation, the dataset split, and batch shuffling
# are the same after every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

input_size = 1     # Input features (probability p)
hidden_size = 4    # Number of neurons in the hidden layer
output_size = 1    # Output features (scaling factor)

model = ScalingFactorPredictor(input_size, hidden_size, output_size)

# Loss and optimizer
criterion = nn.MSELoss()  # Mean Squared Error Loss Function
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

# create dataset and randomly split into training, validation, and test datasets
dataset = ScalingFactorDataset("best_k_data.csv")
total_size = len(dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size  # Ensures full coverage
# A dedicated seeded generator keeps the split stable even if other code draws
# from the global RNG before this line.
train_dataset, val_dataset, test_dataset = random_split(
  dataset,
  [train_size, val_size, test_size],
  generator=torch.Generator().manual_seed(SEED),
)

# initialize dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, shuffle=False)
test_loader = DataLoader(test_dataset, shuffle=False)

In [10]:
# Training: fit the network on the training split for 100 epochs
train_model(
  model=model,
  dataloader=train_loader,
  epochs=100,
  criterion=criterion,
  optimizer=optimizer,
  device=device,
)

Epoch 1/100, Loss: 0.0479
Epoch 2/100, Loss: 0.0453
Epoch 3/100, Loss: 0.0430
Epoch 4/100, Loss: 0.0408
Epoch 5/100, Loss: 0.0388
Epoch 6/100, Loss: 0.0370
Epoch 7/100, Loss: 0.0353
Epoch 8/100, Loss: 0.0339
Epoch 9/100, Loss: 0.0323
Epoch 10/100, Loss: 0.0312
Epoch 11/100, Loss: 0.0300
Epoch 12/100, Loss: 0.0290
Epoch 13/100, Loss: 0.0280
Epoch 14/100, Loss: 0.0272
Epoch 15/100, Loss: 0.0264
Epoch 16/100, Loss: 0.0258
Epoch 17/100, Loss: 0.0252
Epoch 18/100, Loss: 0.0246
Epoch 19/100, Loss: 0.0241
Epoch 20/100, Loss: 0.0237
Epoch 21/100, Loss: 0.0233
Epoch 22/100, Loss: 0.0230
Epoch 23/100, Loss: 0.0226
Epoch 24/100, Loss: 0.0224
Epoch 25/100, Loss: 0.0221
Epoch 26/100, Loss: 0.0219
Epoch 27/100, Loss: 0.0217
Epoch 28/100, Loss: 0.0215
Epoch 29/100, Loss: 0.0214
Epoch 30/100, Loss: 0.0213
Epoch 31/100, Loss: 0.0211
Epoch 32/100, Loss: 0.0210
Epoch 33/100, Loss: 0.0209
Epoch 34/100, Loss: 0.0209
Epoch 35/100, Loss: 0.0208
Epoch 36/100, Loss: 0.0207
Epoch 37/100, Loss: 0.0206
Epoch 38/1

In [11]:
# Testing: report the loss on the held-out test split
evaluate_model(
  model=model,
  dataloader=test_loader,
  criterion=criterion,
  device=device,
)

Test Loss: 0.0117


## Train the Random Forest:

In [None]:
# Fit the random forest on the same training split used for the neural network.
# random_split returns Subset objects; their .indices select rows of the underlying dataset.
train_idx = train_dataset.indices
x_train = dataset.x[train_idx].unsqueeze(1).numpy()  # 2-D (n_samples, 1) feature matrix, as scikit-learn expects
y_train = dataset.y[train_idx].numpy()               # 1-D (n_samples,) target vector
rf_reg.fit(x_train, y_train)