# Desenvolvimento do modelo de predição

In [2]:
import pandas as pd
from dagshub.data_engine import datasources
import mlflow
import dagshub
from sklearn.model_selection import train_test_split
import mlflow.sklearn
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
import mlflow.models.signature
from mlflow.models import infer_signature
from catboost import CatBoostRegressor

No macOS, é necessário instalar o libomp para que a biblioteca XGBoost funcione.

## Obtendo dados do dataset

In [20]:
# Handle to the 'processed' datasource of the DagsHub repository
ds = datasources.get(
    'michelpf/fiap-ds-mlops-laptop-pricing-brl',
    'processed',
)

In [21]:
# Query every datapoint of the datasource and display the result as a DataFrame
all_datapoints = ds.all()
all_datapoints.dataframe

Output()

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,laptop-price-brl-processed.csv,86240413,https://dagshub.com/api/v1/repos/michelpf/fiap...,text/plain,79387


In [22]:
# Resolve the download URL of the processed CSV.
# Reset first so a stale value from a previous kernel state can never be reused.
dataset_url = None

res = ds.head()
for dp in res:
    print(dp.path_in_repo)
    print(dp.download_url)
    # When several datapoints are returned, the last one wins.
    dataset_url = dp.download_url

# Fail here with a clear message instead of a NameError in a later cell.
if dataset_url is None:
    raise RuntimeError(
        "No datapoints were returned by the datasource; cannot resolve dataset_url."
    )

Output()

data/processed/laptop-price-brl-processed.csv
https://dagshub.com/api/v1/repos/michelpf/fiap-ds-mlops-laptop-pricing-brl/raw/main/data/processed/laptop-price-brl-processed.csv


In [23]:
# Load the processed dataset straight from its download URL
df = pd.read_csv(filepath_or_buffer=dataset_url)

In [24]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,ram_gb,ssd,hdd,graphic_card_gb,warranty,price,brand_asus,brand_dell,brand_hp,brand_lenovo,...,os_windows,weight_casual,weight_gaming,weight_thinnlight,touchscreen_0,touchscreen_1,ram_type_ddr4,ram_type_other,os_bit_32-bit,os_bit_64-bit
0,4,0,1024,4,0,2321,1,0,0,0,...,1,1,0,0,1,0,1,0,0,1
1,4,0,1024,4,0,2613,0,0,0,1,...,1,1,0,0,1,0,1,0,0,1
2,4,0,1024,4,0,2680,0,0,0,1,...,1,1,0,0,1,0,1,0,0,1
3,8,512,0,8,0,4689,1,0,0,0,...,1,1,0,0,1,0,1,0,1,0
4,4,0,512,4,0,1808,1,0,0,0,...,1,1,0,0,1,0,1,0,0,1


## Desenvolvimento de modelos

Rastreamento de experimentos utilizando MLflow.

In [3]:
# Initialise the DagsHub integration for this repository with MLflow enabled
dagshub.init(
    repo_owner='michelpf',
    repo_name='fiap-ds-mlops-laptop-pricing-brl',
    mlflow=True,
)

In [26]:
# Enable MLflow autologging before any model is trained
mlflow.autolog()

2025/05/15 23:34:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/05/15 23:34:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/05/15 23:34:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [27]:
# Every column except the target ("price") is used as a predictor
features = df.columns.tolist()
features.remove("price")

features

['ram_gb',
 'ssd',
 'hdd',
 'graphic_card_gb',
 'warranty',
 'brand_asus',
 'brand_dell',
 'brand_hp',
 'brand_lenovo',
 'brand_other',
 'processor_brand_amd',
 'processor_brand_intel',
 'processor_brand_m1',
 'processor_name_core i3',
 'processor_name_core i5',
 'processor_name_core i7',
 'processor_name_other',
 'processor_name_ryzen 5',
 'processor_name_ryzen 7',
 'os_other',
 'os_windows',
 'weight_casual',
 'weight_gaming',
 'weight_thinnlight',
 'touchscreen_0',
 'touchscreen_1',
 'ram_type_ddr4',
 'ram_type_other',
 'os_bit_32-bit',
 'os_bit_64-bit']

In [28]:
# Number of predictor columns
len(features)

30

In [29]:
# Feature matrix: all rows, predictor columns only
X = df.loc[:, features]
X

Unnamed: 0,ram_gb,ssd,hdd,graphic_card_gb,warranty,brand_asus,brand_dell,brand_hp,brand_lenovo,brand_other,...,os_windows,weight_casual,weight_gaming,weight_thinnlight,touchscreen_0,touchscreen_1,ram_type_ddr4,ram_type_other,os_bit_32-bit,os_bit_64-bit
0,4,0,1024,4,0,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
1,4,0,1024,4,0,0,0,0,1,0,...,1,1,0,0,1,0,1,0,0,1
2,4,0,1024,4,0,0,0,0,1,0,...,1,1,0,0,1,0,1,0,0,1
3,8,512,0,8,0,1,0,0,0,0,...,1,1,0,0,1,0,1,0,1,0
4,4,0,512,4,0,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
774,4,1024,0,4,1,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
775,4,1024,0,4,1,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
776,4,1024,0,4,1,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
777,4,1024,0,4,1,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1


In [30]:
# Target vector: the "price" column
y = df.loc[:, "price"]
y

0       2321
1       2613
2       2680
3       4689
4       1808
       ...  
774     9111
775     9714
776    10049
777     9580
778     3852
Name: price, Length: 779, dtype: int64

In [31]:
# 70/30 hold-out split on plain NumPy arrays, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X.to_numpy(),
    y.to_numpy(),
    test_size=0.3,
    random_state=42,
)

In [32]:
def evaluate_and_log_model(kind, model_name, model, X_test, y_test):
    """Evaluate a fitted regressor on hold-out data and log it to MLflow.

    Computes MSE, MAE, R2 and MAPE from ``model.predict(X_test)`` against
    ``y_test``, logs them with ``mlflow.log_metrics``, logs the model under
    the artifact path ``"model"`` (with an inferred signature and the first
    five rows of ``X_test`` as input example) and prints a short summary.

    Args:
        kind: Flavor selector. ``"catboost"``, ``"xgboost"`` and
            ``"lightgbm"`` are logged with the MLflow flavor of the same
            name; any other value falls back to ``mlflow.sklearn``.
        model_name: Label used only in the printed summary.
        model: Fitted estimator exposing ``predict``.
        X_test: Hold-out features; must support slicing (``X_test[:5]``).
        y_test: Hold-out targets.

    Returns:
        dict: Metric name -> value for ``"MSE"``, ``"MAE"``, ``"R2"`` and
        ``"MAPE"``, so callers can compare models programmatically.
    """
    predictions = model.predict(X_test)

    metrics = {
        "MSE": mean_squared_error(y_test, predictions),
        "MAE": mean_absolute_error(y_test, predictions),
        "R2": r2_score(y_test, predictions),
        "MAPE": mean_absolute_percentage_error(y_test, predictions),
    }
    mlflow.log_metrics(metrics)

    # Infer the model signature from the evaluation inputs and outputs
    signature = infer_signature(X_test, predictions)

    # Gradient-boosting libraries get their native flavor (CatBoost previously
    # went through the sklearn flavor by mistake); everything else uses
    # sklearn. Only the selected flavor attribute is touched.
    native_flavors = ("catboost", "xgboost", "lightgbm")
    flavor = getattr(mlflow, kind) if kind in native_flavors else mlflow.sklearn
    flavor.log_model(model, "model", signature=signature, input_example=X_test[:5])

    print(f"Resultados para {model_name}:")
    print(f" MSE: {metrics['MSE']:.4f}")
    print(f" MAE: {metrics['MAE']:.4f}")
    print(f" R2: {metrics['R2']:.4f}")
    print(f" MAPE: {metrics['MAPE']:.4%}")

    return metrics

In [33]:
with mlflow.start_run(run_name="Ridge_Regression"):
    # MAPE is an error, so the scorer is built with greater_is_better=False
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
    alpha_grid = {'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]}

    grid_search = GridSearchCV(Ridge(), alpha_grid, scoring=mape_scorer, cv=5)
    grid_search.fit(X_train, y_train)

    # Estimator refit with the best alpha found by the 5-fold search
    best_model = grid_search.best_estimator_
    mlflow.log_param("best_alpha", best_model.alpha)

    evaluate_and_log_model("sklearn", "Ridge Regression", best_model, X_test, y_test)

2025/05/15 23:35:35 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


🏃 View run illustrious-goat-581 at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/fa4efee50b5b42dcb09fe49c6b00bc56
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0
🏃 View run righteous-panda-172 at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/a2513dfed3964df9854fdd93cc8bb920
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0
🏃 View run welcoming-boar-984 at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/4056ed02e2a84986be54aeb21fdbbce8
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0
🏃 View run exultant-deer-700 at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/87914e4d50c1433f9c769b72af47ca33
🧪 View experiment at: https://dagshub.com/michelp

🏃 View run tasteful-moth-31 at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/87319357b6ad4240b2bcfe347690ac40
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0
🏃 View run blushing-shoat-823 at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/c50c23b351644673bde4d3d55a443d5a
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0
🏃 View run magnificent-fish-890 at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/466a4e67d28845f1a062e5ced0e9fe39
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0
🏃 View run unruly-ray-271 at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/120355ff4eee4a6b9d224841d117303e
🧪 View experiment at: https://dagshub.com/michelpf/fiap

In [34]:
with mlflow.start_run(run_name="DecisionTree_Regressor"):
    # MAPE is an error, so the scorer is built with greater_is_better=False
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
    tree_grid = {'max_depth': [3, 5, 10, None], 'min_samples_split': [2, 5, 10]}

    grid_search = GridSearchCV(
        DecisionTreeRegressor(random_state=42),
        tree_grid,
        scoring=mape_scorer,
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters on the parent run
    for attribute in ("max_depth", "min_samples_split"):
        mlflow.log_param(f"best_{attribute}", getattr(best_model, attribute))

    evaluate_and_log_model("sklearn", "Decision Tree Regressor", best_model, X_test, y_test)

2025/05/15 23:36:24 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.


Resultados para Decision Tree Regressor:
 MSE: 4252939.6299
 MAE: 1120.0006
 R2: 0.4972
 MAPE: 17.6409%
🏃 View run DecisionTree_Regressor at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/7180eadd277040c1a8882b92f31dae85
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0


In [35]:
# MLPRegressor is already imported in the notebook's import cell; no re-import here.
with mlflow.start_run(run_name="MLP_Regressor"):
    # Search over network shape, activation and L2 penalty (alpha)
    param_grid = {
        'hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01]
    }
    mlp = MLPRegressor(max_iter=500, random_state=42)
    # MAPE is an error, so the scorer is built with greater_is_better=False
    grid_search = GridSearchCV(mlp, param_grid, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False), cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters on the parent run
    mlflow.log_param("best_hidden_layer_sizes", best_model.hidden_layer_sizes)
    mlflow.log_param("best_activation", best_model.activation)
    mlflow.log_param("best_alpha", best_model.alpha)
    evaluate_and_log_model("sklearn", "MLP Regressor", best_model, X_test, y_test)

2025/05/15 23:37:54 INFO mlflow.sklearn.utils: Logging the 5 best runs, 19 runs will be omitted.


Resultados para MLP Regressor:
 MSE: 3805458.3101
 MAE: 1262.1541
 R2: 0.5501
 MAPE: 23.9940%
🏃 View run MLP_Regressor at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/ad93eb7d84804d5e8802a44bbbfe55c4
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0


In [36]:
with mlflow.start_run(run_name="XGBoost_Regressor"):
    # MAPE is an error, so the scorer is built with greater_is_better=False
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
    xgb_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
    }

    grid_search = GridSearchCV(
        XGBRegressor(random_state=42, verbosity=0),
        xgb_grid,
        scoring=mape_scorer,
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Record the winning hyperparameters on the parent run
    for attribute in ("n_estimators", "max_depth", "learning_rate"):
        mlflow.log_param(f"best_{attribute}", getattr(best_model, attribute))

    evaluate_and_log_model("xgboost", "XGBoost Regressor", best_model, X_test, y_test)

2025/05/15 23:39:26 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.
  self.get_booster().save_model(fname)


Resultados para XGBoost Regressor:
 MSE: 3026852.0000
 MAE: 958.0400
 R2: 0.6421
 MAPE: 15.6366%
🏃 View run XGBoost_Regressor at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/211650eabbba4ef2a9cf2c7aed6fc36b
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0


In [37]:
with mlflow.start_run(run_name="LightGBM_Regressor"):
    lgbm_grid = {
        'num_leaves': [50, 100],
        'learning_rate': [0.05, 0.1],
        'n_estimators': [100, 200],
    }

    grid_search = GridSearchCV(
        lgb.LGBMRegressor(random_state=42, verbosity=-1),
        lgbm_grid,
        scoring='neg_mean_absolute_percentage_error',
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Log the winning combination, then evaluate on the hold-out set
    mlflow.log_params(grid_search.best_params_)
    evaluate_and_log_model("lightgbm", "LightGBM Regressor", best_model, X_test, y_test)

2025/05/15 23:40:20 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


Resultados para LightGBM Regressor:
 MSE: 3139231.7877
 MAE: 1011.3879
 R2: 0.6289
 MAPE: 17.3409%
🏃 View run LightGBM_Regressor at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/27b8ae0bfa904bc2a83aaa58922ed81e
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0


In [38]:
with mlflow.start_run(run_name="Support_Vector_Regressor"):
    svr_grid = {
        'C': [0.1, 1.0, 10.0],
        'epsilon': [0.01, 0.1, 0.2],
        'kernel': ['linear', 'rbf'],
    }

    grid_search = GridSearchCV(
        SVR(),
        svr_grid,
        scoring='neg_mean_absolute_percentage_error',
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Log the winning combination, then evaluate on the hold-out set
    mlflow.log_params(grid_search.best_params_)
    evaluate_and_log_model("sklearn", "SVR", best_model, X_test, y_test)

2025/05/15 23:41:13 INFO mlflow.sklearn.utils: Logging the 5 best runs, 13 runs will be omitted.


Resultados para SVR:
 MSE: 4403623.9159
 MAE: 1239.1448
 R2: 0.4794
 MAPE: 21.9444%
🏃 View run Support_Vector_Regressor at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/92ad448489d04d80aacd00246f49f88c
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0


In [39]:
with mlflow.start_run(run_name="CatBoost_Regressor"):
    catboost_grid = {
        'depth': [4, 6, 8],
        'learning_rate': [0.03, 0.1],
        'iterations': [100, 200],
    }

    grid_search = GridSearchCV(
        CatBoostRegressor(verbose=0, random_state=42),
        catboost_grid,
        scoring='neg_mean_absolute_percentage_error',
        cv=5,
    )
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Log the winning combination, then evaluate on the hold-out set
    mlflow.log_params(grid_search.best_params_)
    evaluate_and_log_model("catboost", "CatBoost Regressor", best_model, X_test, y_test)

2025/05/15 23:42:08 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.


Resultados para CatBoost Regressor:
 MSE: 2831322.0332
 MAE: 952.0668
 R2: 0.6653
 MAPE: 15.8396%
🏃 View run CatBoost_Regressor at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0/runs/6624d527f4d24ce78dc763af00e67bc6
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-laptop-pricing-brl.mlflow/#/experiments/0


In [None]:
with mlflow.start_run(run_name="XGBoost_Regressor_Advanced"):
    # Wider search than the basic XGBoost run: adds sampling, regularisation
    # and split-constraint hyperparameters to the grid.
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 1],
        'reg_alpha': [0, 0.1],
        'reg_lambda': [1, 5],
        'min_child_weight': [1, 3]
    }

    xgb = XGBRegressor(random_state=42, verbosity=0)
    # MAPE is an error, so the scorer is built with greater_is_better=False
    grid_search = GridSearchCV(xgb, param_grid, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False), cv=5)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Log every tuned hyperparameter (not just three of the nine) so the run
    # can be reproduced from the tracking UI alone. Keys keep the "best_" prefix.
    for param_name, param_value in grid_search.best_params_.items():
        mlflow.log_param(f"best_{param_name}", param_value)

    # Distinct label so the printed summary is not confused with the basic run
    evaluate_and_log_model("xgboost", "XGBoost Regressor (Advanced)", best_model, X_test, y_test)

## Registrando modelo com melhor performance

Depois de verificar os experimentos no painel do MLflow, decidimos optar pelo modelo XGBoost, pois obteve o menor valor de MAPE, alcançando a meta proposta.

In [4]:
# Run ID of the selected "XGBoost_Regressor" experiment run
run_id = "211650eabbba4ef2a9cf2c7aed6fc36b"
# The model was logged under the artifact path "model" inside that run
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="laptop-pricing-model")

Registered model 'laptop-pricing-model' already exists. Creating a new version of this model...
2025/05/22 21:52:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: laptop-pricing-model, version 5
Created version '5' of model 'laptop-pricing-model'.


<ModelVersion: aliases=[], creation_timestamp=1747961571897, current_stage='None', description='', last_updated_timestamp=1747961571897, name='laptop-pricing-model', run_id='211650eabbba4ef2a9cf2c7aed6fc36b', run_link='', source='mlflow-artifacts:/3ef16225938f4c37b7566d2fc4480a7c/211650eabbba4ef2a9cf2c7aed6fc36b/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='5'>