In [12]:
pip install optuna

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.12 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install xgboost --upgrade

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.12 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
import optuna
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import numpy as np

# Load your dataset
data = pd.read_csv('test.csv')

# Define features and target variable
X = data.drop('PremiumPrice', axis=1)  # Features (all columns except target)
y = data['PremiumPrice']                # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    model_name = trial.suggest_categorical('model', ['Random Forest', 'XGBoost', 'Linear Regression', 'SVM', 'Gradient Boosting'])

    if model_name == 'Random Forest':
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 20)
        }
        model = RandomForestRegressor(**params)

    elif model_name == 'XGBoost':
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 5)
        }
        model = XGBRegressor(**params)

    elif model_name == 'SVM':
        params = {
            'kernel': trial.suggest_categorical('svm_kernel', ['linear', 'rbf', 'poly']),
            'C': trial.suggest_float('svm_C', 0.1, 10.0)
        }
        model = SVR(**params)

    elif model_name == 'Gradient Boosting':
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'n_estimators': trial.suggest_int('n_estimators', 50, 500),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10)
        }
        model = GradientBoostingRegressor(**params)

    else:  # Linear Regression
        model = LinearRegression()

    # Fit the model and make predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100  # MAPE as percentage

    return r2  # Return R-squared value for optimization

# Create the Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print the results
print("\nModel Performance Metrics-----")

# Evaluate and print results for each modelMedicalpremium

    # Initialize and fit the model with the best parameters
    if model_name == 'Random Forest':
        best_model = RandomForestRegressor(n_estimators=study.best_params.get('n_estimators', 100),
                                            max_depth=study.best_params.get('max_depth', 10))
    elif model_name == 'XGBoost':
        best_model = XGBRegressor(
            learning_rate=study.best_params.get('learning_rate', 0.1),
            max_depth=study.best_params.get('max_depth', 3),
            n_estimators=study.best_params.get('n_estimators', 100),
            min_child_weight=study.best_params.get('min_child_weight', 1),
            subsample=study.best_params.get('subsample', 1.0),
            gamma=study.best_params.get('gamma', 0)
        )
    elif model_name == 'SVM':
        best_model = SVR(
            kernel=study.best_params.get('svm_kernel', 'linear'),
            C=study.best_params.get('svm_C', 1.0)
        )
    elif model_name == 'Gradient Boosting':
        best_model = GradientBoostingRegressor(
            learning_rate=study.best_params.get('learning_rate', 0.1),
            n_estimators=study.best_params.get('n_estimators', 100),
            max_depth=study.best_params.get('max_depth', 3),
            min_samples_split=study.best_params.get('min_samples_split', 2)
        )
    else:  # Linear Regression
        best_model = LinearRegression()

    # Fit the model and make predictions
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100  # MAPE as percentage

    # Print metrics
    print(f"{model_name} MSE: {mse:.4f}")
    print(f"{model_name} RMSE: {rmse:.4f}")
    print(f"{model_name} R-squared: {r2:.4f}")
    print(f"{model_name} MAPE: {mape:.2f}%")


[I 2024-11-14 21:42:48,302] A new study created in memory with name: no-name-b16c8e18-bb9e-4a13-91d6-a49385fc22cf
[I 2024-11-14 21:42:48,330] Trial 0 finished with value: 0.5158939151112787 and parameters: {'model': 'SVM', 'svm_kernel': 'linear', 'svm_C': 1.3363230537399484}. Best is trial 0 with value: 0.5158939151112787.
[I 2024-11-14 21:42:48,429] Trial 1 finished with value: 0.8544477224349976 and parameters: {'model': 'XGBoost', 'learning_rate': 0.16524817081984308, 'max_depth': 4, 'n_estimators': 84, 'min_child_weight': 2, 'subsample': 0.7527678564805768, 'gamma': 1.8819939601823528}. Best is trial 1 with value: 0.8544477224349976.
[I 2024-11-14 21:42:49,295] Trial 2 finished with value: 0.8335126638412476 and parameters: {'model': 'XGBoost', 'learning_rate': 0.2831499397661252, 'max_depth': 6, 'n_estimators': 483, 'min_child_weight': 1, 'subsample': 0.8665671773177084, 'gamma': 2.0662912630518098}. Best is trial 1 with value: 0.8544477224349976.
[I 2024-11-14 21:42:49,802] Trial


Model Performance Metrics-----

Evaluating Random Forest...
Random Forest MSE: 5360037.0301
Random Forest RMSE: 2315.1754
Random Forest R-squared: 0.8743
Random Forest MAPE: 4.48%

Evaluating SVM...
SVM MSE: 20710921.6290
SVM RMSE: 4550.9254
SVM R-squared: 0.5143
SVM MAPE: 12.04%

Evaluating XGBoost...
XGBoost MSE: 8532286.8836
XGBoost RMSE: 2921.0079
XGBoost R-squared: 0.7999
XGBoost MAPE: 4.73%

Evaluating Linear Regression...
Linear Regression MSE: 12221661.7059
Linear Regression RMSE: 3495.9493
Linear Regression R-squared: 0.7134
Linear Regression MAPE: 10.96%

Evaluating Gradient Boosting...
Gradient Boosting MSE: 13229603.1147
Gradient Boosting RMSE: 3637.2521
Gradient Boosting R-squared: 0.6898
Gradient Boosting MAPE: 4.97%
