In [3]:
#!pip install mlflow optuna

Collecting mlflow
  Downloading mlflow-2.13.1-py3-none-any.whl (25.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.0/25.0 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20

In [4]:
import mlflow
import mlflow.sklearn
import optuna
from sklearn.metrics import mean_squared_error

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [5]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [6]:
%cd /content/drive/MyDrive
df = pd.read_csv('/content/drive/MyDrive/Laptop_price.csv')

/content/drive/MyDrive


In [7]:
X = df.drop('Price', axis=1)
y = df['Price']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
categoricalFeatures = X.select_dtypes(include=['object']).columns.tolist()
numericalFeatures = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [10]:
numericalTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categoricalTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numericalTransformer, numericalFeatures),
        ('cat', categoricalTransformer, categoricalFeatures)
    ])

In [12]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('mod', XGBRegressor(random_state=37))
])

In [18]:
def objective(trial):
    params = {
        'mod__n_estimators': trial.suggest_int('mod__n_estimators', 50, 200),
        'mod__learning_rate': trial.suggest_float('mod__learning_rate', 0.01, 0.2),
        'mod__max_depth': trial.suggest_int('mod__max_depth', 3, 8),
        'mod__gamma': trial.suggest_float('mod__gamma', 0, 0.3),
        'mod__subsample': trial.suggest_float('mod__subsample', 0.8, 1.0)
    }

    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    predict = pipeline.predict(X_test)
    mean_se = mean_squared_error(y_test, predict)

    return mean_se

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

bestTrial = study.best_trial
bestParams = bestTrial.params

print("Лучшие параметры: ", bestParams)

[I 2024-06-04 00:15:12,635] A new study created in memory with name: no-name-8eae9a98-3182-41f3-99cf-5eb86297f4a4
[I 2024-06-04 00:15:12,886] Trial 0 finished with value: 40143.411208719444 and parameters: {'mod__n_estimators': 139, 'mod__learning_rate': 0.04440481735432135, 'mod__max_depth': 8, 'mod__gamma': 0.17455025749983805, 'mod__subsample': 0.8244335390335266}. Best is trial 0 with value: 40143.411208719444.
[I 2024-06-04 00:15:13,071] Trial 1 finished with value: 45724.25845077345 and parameters: {'mod__n_estimators': 172, 'mod__learning_rate': 0.14076995844429963, 'mod__max_depth': 6, 'mod__gamma': 0.26905729010844237, 'mod__subsample': 0.996312029003527}. Best is trial 0 with value: 40143.411208719444.
[I 2024-06-04 00:15:13,250] Trial 2 finished with value: 42326.32373075038 and parameters: {'mod__n_estimators': 166, 'mod__learning_rate': 0.08633818412518449, 'mod__max_depth': 5, 'mod__gamma': 0.20087218166637577, 'mod__subsample': 0.9777696466557445}. Best is trial 0 with v

Лучшие параметры:  {'mod__n_estimators': 75, 'mod__learning_rate': 0.10193641704890044, 'mod__max_depth': 3, 'mod__gamma': 0.2416411504813782, 'mod__subsample': 0.9038444113898828}


In [22]:
pipeline.set_params(**bestParams)
pipeline.fit(X_train, y_train)
predict = pipeline.predict(X_test)
mean_se = mean_squared_error(y_test, predict)

print(f"Mean Squared Error: {mean_se}")

Mean Squared Error: 36050.545636335264


In [24]:
mlflow.set_experiment("Optuna optimize")

with mlflow.start_run():
    mlflow.log_params(bestParams)
    mlflow.log_metric("mean_se", mean_se)
    mlflow.sklearn.log_model(pipeline, "mod")

