# Desenvolvimento do modelo para predição de preços de laptop

Exploração inicial de diferentes tipos de modelo.

In [2]:
import warnings

import dagshub
import lightgbm as lgb
import mlflow
import mlflow.catboost
import mlflow.models.signature
import mlflow.sklearn
import pandas as pd
from catboost import CatBoostRegressor
from dagshub.data_engine import datasources
from mlflow.models import infer_signature
from sklearn.exceptions import ConvergenceWarning
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.linear_model import Ridge
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

## Obtendo dados do dataset

In [3]:
# Look up the "processed" datasource of the project repository through the
# DagsHub Data Engine client imported above.
ds = datasources.get("michelpf/fiap-ds-mlops-9dtsr-laptop-pricing", "processed")

In [4]:
# Show every datapoint of the datasource as a DataFrame (path, id, download URL, ...).
ds.all().dataframe

Output()

Unnamed: 0,path,datapoint_id,dagshub_download_url,media type,size
0,laptop-price-brl-processed.csv,100654797,https://dagshub.com/api/v1/repos/michelpf/fiap...,text/plain,53354


In [5]:
res = ds.head()

# Reset first so a value left over from an earlier kernel execution can never
# be picked up silently when the query returns no datapoints.
dataset_url = None
for dp in res:
    # As before, the last datapoint returned wins (the datasource listing
    # shown above contains a single CSV).
    dataset_url = dp.download_url

if dataset_url is None:
    # Fail here with a clear message instead of a NameError / bad read_csv later.
    raise RuntimeError(
        "The 'processed' datasource returned no datapoints; cannot resolve the dataset URL."
    )

Output()

In [6]:
# Display the resolved download URL for a quick sanity check.
dataset_url

'https://dagshub.com/api/v1/repos/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing/raw/main/data/processed/laptop-price-brl-processed.csv'

In [7]:
# Read the processed CSV straight from its download URL and preview the first rows.
df = pd.read_csv(dataset_url)
df.head()

Unnamed: 0,ram_gb,ssd,hdd,graphic_card_gb,warranty,price,brand_asus,brand_dell,brand_hp,brand_lenovo,...,os_windows,weight_casual,weight_gaming,weight_thinnlight,touchscreen_0,touchscreen_1,ram_type_ddr4,ram_type_other,os_bit_32-bit,os_bit_64-bit
0,4,0,1024,0,0,2321,1,0,0,0,...,1,1,0,0,1,0,1,0,0,1
1,4,0,1024,0,0,2613,0,0,0,1,...,1,1,0,0,1,0,1,0,0,1
2,4,0,1024,0,0,2680,0,0,0,1,...,1,1,0,0,1,0,1,0,0,1
3,8,512,0,2,0,4689,1,0,0,0,...,1,1,0,0,1,0,1,0,1,0
4,4,0,512,0,0,1808,1,0,0,0,...,1,1,0,0,1,0,1,0,0,1


## Desenvolvimento dos modelos

In [8]:
# Initialise the DagsHub integration for this repository with MLflow support
# enabled; the run links printed by later cells point at the repository's
# dagshub.com MLflow endpoint.
dagshub.init(repo_owner="michelpf", repo_name="fiap-ds-mlops-9dtsr-laptop-pricing", mlflow=True)

In [9]:
# Turn on MLflow autologging. The log output below reports it as enabled for
# lightgbm, sklearn and xgboost (catboost is not listed there).
mlflow.autolog()

2025/06/11 20:43:03 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2025/06/11 20:43:04 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/06/11 20:43:04 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [10]:
# Every column except the prediction target is used as a model input.
target_column = "price"

features = df.columns.tolist()
features.remove(target_column)

features

['ram_gb',
 'ssd',
 'hdd',
 'graphic_card_gb',
 'warranty',
 'brand_asus',
 'brand_dell',
 'brand_hp',
 'brand_lenovo',
 'brand_other',
 'processor_brand_amd',
 'processor_brand_intel',
 'processor_brand_m1',
 'processor_name_core i3',
 'processor_name_core i5',
 'processor_name_core i7',
 'processor_name_other',
 'processor_name_ryzen 5',
 'processor_name_ryzen 7',
 'os_other',
 'os_windows',
 'weight_casual',
 'weight_gaming',
 'weight_thinnlight',
 'touchscreen_0',
 'touchscreen_1',
 'ram_type_ddr4',
 'ram_type_other',
 'os_bit_32-bit',
 'os_bit_64-bit']

In [11]:
# Feature matrix: the input columns selected above, still as a DataFrame.
X = df[features]

In [12]:
# Number of input features fed to the models.
len(features)

30

In [13]:
# Display the feature matrix for visual inspection.
X

Unnamed: 0,ram_gb,ssd,hdd,graphic_card_gb,warranty,brand_asus,brand_dell,brand_hp,brand_lenovo,brand_other,...,os_windows,weight_casual,weight_gaming,weight_thinnlight,touchscreen_0,touchscreen_1,ram_type_ddr4,ram_type_other,os_bit_32-bit,os_bit_64-bit
0,4,0,1024,0,0,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
1,4,0,1024,0,0,0,0,0,1,0,...,1,1,0,0,1,0,1,0,0,1
2,4,0,1024,0,0,0,0,0,1,0,...,1,1,0,0,1,0,1,0,0,1
3,8,512,0,2,0,1,0,0,0,0,...,1,1,0,0,1,0,1,0,1,0
4,4,0,512,0,0,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775,4,1024,0,0,1,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
776,4,1024,0,0,1,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
777,4,1024,0,4,1,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1
778,4,1024,0,4,1,1,0,0,0,0,...,1,1,0,0,1,0,1,0,0,1


In [13]:
# Target vector: the "price" column of the processed dataset.
y = df["price"]
y

0       2321
1       2613
2       2680
3       4689
4       1808
       ...  
775     9111
776     9714
777    10049
778     9580
779     3852
Name: price, Length: 780, dtype: int64

In [14]:
# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible. Plain arrays (.values) are passed, so the resulting
# splits carry no column names.
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y.values,
    test_size=0.3,
    random_state=42,
)

In [19]:
def evaluate_and_log_model(kind, model_name, model, X_test, y_test):
    """Score a fitted model on the hold-out data and log the results to MLflow.

    Computes MSE, MAE, R2 and MAPE from ``model.predict(X_test)`` against
    ``y_test``, logs each one as an MLflow metric, then logs the model itself
    together with an inferred signature and the first five rows of ``X_test``
    as the input example. A one-line summary of the metrics is printed.

    Args:
        kind: MLflow flavor used to log the model. "catboost", "xgboost" and
            "lightgbm" select the matching ``mlflow.<kind>`` module; any other
            value falls back to ``mlflow.sklearn``.
        model_name: Artifact name passed to ``log_model``; also used in the
            printed summary.
        model: Fitted estimator exposing ``predict``.
        X_test: Hold-out features (must support ``X_test[:5]`` slicing).
        y_test: Hold-out targets.

    Returns:
        None.
    """
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Logged one by one, in the same order as the printed summary.
    for metric_name, metric_value in (("MSE", mse), ("MAE", mae), ("R2", r2), ("MAPE", mape)):
        mlflow.log_metric(metric_name, metric_value)

    model_signature = infer_signature(X_test, y_pred)

    # Resolve the flavor module only now, so just the selected one is touched.
    flavor_name = kind if kind in ("catboost", "xgboost", "lightgbm") else "sklearn"
    flavor = getattr(mlflow, flavor_name)
    flavor.log_model(model, model_name, signature=model_signature, input_example=X_test[:5])

    print(f"Model {model_name} logged with MSE: {mse}, MAE: {mae}, R2: {r2}, MAPE: {mape}")
    

### Experimento com Ridge Regression

In [17]:
with mlflow.start_run(run_name="Ridge Regression"):
    ridge = Ridge()
    # Regularisation strengths explored by the grid search.
    param_grid = {"alpha": [0.1, 1.0, 10.0, 100.0]}

    # greater_is_better=False flips the sign of MAPE, so the search (which
    # maximises its score) ends up picking the lowest error.
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
    grid_search = GridSearchCV(
        estimator=ridge,
        param_grid=param_grid,
        cv=5,
        scoring=mape_scorer,
    )
    grid_search.fit(X_train, y_train)

    # Winning configuration of the 5-fold search.
    best_model = grid_search.best_estimator_
    mlflow.log_param("best_alpha", best_model.alpha)

    # Hold-out metrics and the model artifact go into this same run.
    evaluate_and_log_model("sklearn", "ridge_regression", best_model, X_test, y_test)

2025/06/11 20:33:10 INFO mlflow.sklearn.utils: Logging the 5 best runs, no runs will be omitted.


🏃 View run capable-squid-897 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/6cc2f69d66d44b84818c75b4a406eb14
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run rogue-auk-863 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/2339fe67f1e74ba284e69a01d540fe67
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run fun-asp-314 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/69273470476e4a51a23cf72cd5108284
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run carefree-ram-932 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/63be58e3f6e34ea686f2c76d74276204
🧪 View experiment at: https://dagshub.com/michelpf/f

🏃 View run resilient-shad-674 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/2eae30e136b2483c96a4904fbc16ba0c
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run casual-roo-186 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/ebd9bb3d7bcb41669101a7f64ee28758
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run languid-bee-349 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/bb3de8b6a35940b390acdae0113ddcc8
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run bright-loon-724 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/99690ee853e54d4fbd977951bbc507ee
🧪 View experiment at: https://dagshub.com/miche

### Decision Tree Regression

In [18]:
with mlflow.start_run(run_name="DecisionTree_Regressor"):
    tree = DecisionTreeRegressor(random_state=42)
    # Tree depth (None = unlimited) and minimum split size explored by the search.
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
    }

    # greater_is_better=False flips the sign of MAPE, so the search (which
    # maximises its score) ends up picking the lowest error.
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
    grid_search = GridSearchCV(
        estimator=tree,
        param_grid=param_grid,
        scoring=mape_scorer,
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Winning configuration of the 5-fold search.
    best_model = grid_search.best_estimator_
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_min_samples_split", best_model.min_samples_split)

    # Hold-out metrics and the model artifact go into this same run.
    evaluate_and_log_model("sklearn", "Decision Tree Regressor", best_model, X_test, y_test)

2025/06/11 20:38:55 INFO mlflow.sklearn.utils: Logging the 5 best runs, 7 runs will be omitted.


Model Decision Tree Regressor logged with MSE: 3742850.089654559, MAE: 1073.6755698005697, R2: 0.5570391775972132, MAPE: 0.17104075045176223
🏃 View run DecisionTree_Regressor at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/586482f0b77a4976b6e7c8a3883ad1de
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0


### MLP Regression

In [20]:
# Silence scikit-learn's ConvergenceWarning so the grid search below does not
# flood the cell output with one warning per non-converged fit.
# NOTE(review): the filter is process-wide, so every later cell loses this
# warning too; a fit that stops at max_iter will no longer be reported.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [21]:
with mlflow.start_run(run_name="MLP_Regressor"):
    # Grid keys carry the "mlp__" prefix because the network is tuned as a
    # step of the pipeline defined below.
    param_grid = {
        'mlp__hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)],
        'mlp__activation': ['relu', 'tanh'],
        'mlp__alpha': [0.0001, 0.001, 0.01]
    }
    mlp = MLPRegressor(max_iter=500, random_state=42)

    # MLPs are sensitive to feature scale, and the inputs mix 0/1 indicator
    # columns with storage sizes up to 1024. Standardising inside the pipeline
    # means the scaler is re-fit on each CV training fold only, so no
    # validation or test statistics leak into training.
    mlp_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("mlp", mlp),
    ])

    # greater_is_better=False flips the sign of MAPE, so the search (which
    # maximises its score) ends up picking the lowest error.
    grid_search = GridSearchCV(mlp_pipeline, param_grid, scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False), cv=5)
    grid_search.fit(X_train, y_train)

    # best_model is the whole pipeline (scaler + network), so the logged
    # artifact applies the same scaling at prediction time.
    best_model = grid_search.best_estimator_
    best_mlp = best_model.named_steps["mlp"]

    mlflow.log_param("best_hidden_layer_sizes", best_mlp.hidden_layer_sizes)
    mlflow.log_param("best_activation", best_mlp.activation)
    mlflow.log_param("best_alpha", best_mlp.alpha)

    evaluate_and_log_model("sklearn", "MLP Regressor", best_model, X_test, y_test)

2025/06/11 20:44:28 INFO mlflow.sklearn.utils: Logging the 5 best runs, 19 runs will be omitted.


🏃 View run traveling-croc-230 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/6f246475a5144b6e84baef803ec6df09
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run zealous-squirrel-397 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/5b15c57ca72d44ccb2106f91d34054d9
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run learned-boar-113 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/bcdb381ae6444a59a73e450efae6617c
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run dashing-mink-998 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/a670baa716c844f1b7ba8ad1eb48026b
🧪 View experiment at: https://dagshub.c

🏃 View run unique-conch-657 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/486e948cf3b84532b6d98f4aa50663b2
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run fortunate-grouse-495 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/6a4a2b3227874791941f8c0080d01667
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run silent-fox-839 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/a8bec187144844de86fa823e9e832da3
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0
🏃 View run thoughtful-bat-129 at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/eb099fc291ee43919f4938848efcf7d9
🧪 View experiment at: https://dagshub.com

### XGBoost Regressor

In [22]:
with mlflow.start_run(run_name="XGBoost_Regressor"):
    xgb = XGBRegressor(random_state=42, verbosity=0)
    # Ensemble size, tree depth and learning rate explored by the search.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.1, 0.2, 0.3],
    }

    # greater_is_better=False flips the sign of MAPE, so the search (which
    # maximises its score) ends up picking the lowest error.
    mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
    grid_search = GridSearchCV(
        estimator=xgb,
        param_grid=param_grid,
        scoring=mape_scorer,
        cv=5,
    )
    grid_search.fit(X_train, y_train)

    # Winning configuration of the 5-fold search.
    best_model = grid_search.best_estimator_
    mlflow.log_param("best_n_estimators", best_model.n_estimators)
    mlflow.log_param("best_max_depth", best_model.max_depth)
    mlflow.log_param("best_learning_rate", best_model.learning_rate)

    # Logged with the xgboost flavor rather than the generic sklearn one.
    evaluate_and_log_model("xgboost", "XGBoost Regressor", best_model, X_test, y_test)

2025/06/11 20:46:52 INFO mlflow.sklearn.utils: Logging the 5 best runs, 43 runs will be omitted.
  self.get_booster().save_model(fname)


Model XGBoost Regressor logged with MSE: 2563481.5, MAE: 888.87158203125, R2: 0.6966157555580139, MAPE: 0.1418655961751938
🏃 View run XGBoost_Regressor at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0/runs/e41cca33dfbc434eabf6e116741b3865
🧪 View experiment at: https://dagshub.com/michelpf/fiap-ds-mlops-9dtsr-laptop-pricing.mlflow/#/experiments/0


## Model registry

In [23]:
# Run whose model is promoted to the registry. The id is pinned by hand and
# matches the XGBoost_Regressor run link printed by the cell above, so a fresh
# execution of the notebook will NOT register its own new run unless this
# value is updated.
run_id = "e41cca33dfbc434eabf6e116741b3865"
registered_model_name = "laptop-pricing-model-brl"

# NOTE(review): this points at the run's "model" artifact, whereas
# evaluate_and_log_model stores its artifact under "XGBoost Regressor";
# "model" is presumably what autologging wrote for the grid search. Confirm
# this is the artifact intended for serving.
model_uri = f"runs:/{run_id}/model"

mlflow.register_model(model_uri=model_uri, name=registered_model_name)

Successfully registered model 'laptop-pricing-model-brl'.
2025/06/11 20:52:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: laptop-pricing-model-brl, version 1
Created version '1' of model 'laptop-pricing-model-brl'.


<ModelVersion: aliases=[], creation_timestamp=1749685970958, current_stage='None', description='', last_updated_timestamp=1749685970958, name='laptop-pricing-model-brl', run_id='e41cca33dfbc434eabf6e116741b3865', run_link='', source='mlflow-artifacts:/62ec889fef764eb989a3ff67fbef1bb4/e41cca33dfbc434eabf6e116741b3865/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='1'>