In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import optuna
from optuna.trial import TrialState

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,root_mean_squared_error
from sklearn.datasets import fetch_california_housing

import numpy as np
import pandas as pd 
# Load your dataset (Replace this with your actual dataset)
# For demonstration, we'll use the California housing dataset
crop_data = pd.read_csv('train_dataset.csv')
X = crop_data.drop(['hg/ha_yield','Unnamed: 0'],axis=1)
y = crop_data['hg/ha_yield']

categorical_columns = X.select_dtypes(include=['object']).columns  # Find categorical columns

X_encoded = pd.get_dummies(X, columns=categorical_columns).astype(int)

# Split the dataset
y.isna().sum()

0

In [2]:
X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

X_train_np = X_train.values
X_test_np = X_val.values
y_train_np = y_train.values
y_test_np = y_val.values
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_np, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_np, dtype=torch.float32).unsqueeze(1)
X_val_tensor = torch.tensor(X_test_np, dtype=torch.float32)
y_val_tensor = torch.tensor(y_test_np, dtype=torch.float32).unsqueeze(1)

# Create DataLoader
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = torch.utils.data.TensorDataset(X_val_tensor, y_val_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the models
class MLPRegressor(nn.Module):
    def __init__(self, trial):
        super(MLPRegressor, self).__init__()
        n_layers = trial.suggest_int('n_layers', 1, 3)
        layers = []
        in_features = X_train.shape[1]

        for i in range(n_layers):
            out_features = trial.suggest_int(f'n_units_l{i}', 4, 128)
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU())
            p = trial.suggest_float(f'dropout_l{i}', 0.0, 0.5)
            layers.append(nn.Dropout(p))
            in_features = out_features

        layers.append(nn.Linear(in_features, 1))  # Output layer
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

# Objective function for Optuna
def objective(trial):
    # Suggest a model
    model_name = trial.suggest_categorical('model_name', ['LinearRegression', 'RandomForest', 'GradientBoosting', 'SVR', 'MLP'])
    #model_name = trial.suggest_categorical('model_name', ['MLP'])

    if model_name == 'LinearRegression':
        # No hyperparameters to tune for sklearn's LinearRegression
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
    elif model_name == 'RandomForest':
        from sklearn.ensemble import RandomForestRegressor
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 2, 20)
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    elif model_name == 'GradientBoosting':
        from sklearn.ensemble import GradientBoostingRegressor
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
        max_depth = trial.suggest_int('max_depth', 2, 20)
        model = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=42)
    elif model_name == 'SVR':
        from sklearn.svm import SVR
        C = trial.suggest_float('C', 1e-1, 10, log=True)
        epsilon = trial.suggest_float('epsilon', 1e-2, 1.0, log=True)
        model = SVR(C=C, epsilon=epsilon)
    elif model_name == 'MLP':
        # Use the MLPRegressor defined above
        device = torch.device("cpu")
        model = MLPRegressor(trial).to(device)
        optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
        lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
        optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
        criterion = nn.MSELoss()

        # Training loop
        num_epochs = 10
        for epoch in range(num_epochs):
            model.train()
            for batch_idx, (data, target) in enumerate(train_loader):
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                output = model(data)
                loss = criterion(output, target)
                loss.backward()
                optimizer.step()

        # Validation
        model.eval()
        preds = []
        targets = []
        with torch.no_grad():
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                preds.extend(output.cpu().numpy())
                targets.extend(target.cpu().numpy())

        try:
            rmse = root_mean_squared_error(targets, preds)
        except Exception:
            rmse = 10**100
        trial.set_user_attr('model', model)
        return rmse

    # For scikit-learn models
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    try:
        rmse = root_mean_squared_error(y_val, preds)
    except Exception:
        rmse = 10**100
    trial.set_user_attr('model', model)
    return rmse

# Callback to save the best model
def save_best_model(study, trial):
    if study.best_trial.number == trial.number:
        best_model = trial.user_attrs['model']
        # Save the model accordingly
        model_name = trial.params['model_name']
        if model_name == 'MLP':
            torch.save(best_model, 'best_mlp_model.pth')
        else:
            import joblib
            joblib.dump(best_model, f'best_{model_name}_model.pkl')

# Create and run the study
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, callbacks=[save_best_model])

print("Best hyperparameters: ", study.best_trial.params)


[I 2024-10-19 11:05:43,585] A new study created in memory with name: no-name-436204c2-75f0-4f81-8446-9561facd2eac
[I 2024-10-19 11:05:56,164] Trial 0 finished with value: 83980.53999536822 and parameters: {'model_name': 'GradientBoosting', 'n_estimators': 103, 'learning_rate': 0.00015040237560617014, 'max_depth': 9}. Best is trial 0 with value: 83980.53999536822.
[I 2024-10-19 11:06:10,350] Trial 1 finished with value: 14522.07954367364 and parameters: {'model_name': 'RandomForest', 'n_estimators': 157, 'max_depth': 13}. Best is trial 1 with value: 14522.07954367364.
[I 2024-10-19 11:07:13,588] Trial 2 finished with value: 93489.9898011665 and parameters: {'model_name': 'SVR', 'C': 0.5290307056034792, 'epsilon': 0.5188437397813945}. Best is trial 1 with value: 14522.07954367364.
[W 2024-10-19 11:07:24,267] Trial 3 failed with parameters: {'model_name': 'GradientBoosting', 'n_estimators': 115, 'learning_rate': 0.00029424707873745436, 'max_depth': 15} because of the following error: Keyb

KeyboardInterrupt: 

Datasets saved as 'train_dataset.csv' and 'test_dataset.csv'
