###   Notebook de Treinamento de Modelo

#### Objetivo: 
    Construir um modelo para determinar o preço de um veículo.

Importando as bibliotecas necessarias:

In [1]:
import pandas as pd
import sklearn
import mlflow

Leitura da base de dados:

In [2]:
df = pd.read_csv('/home/pc/Task_0/automobile.csv')

In [3]:
df

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102.0,5500.0,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115.0,5500.0,18,22,17450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114.0,5400.0,23,28,16845
197,-1,95.0,volvo,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160.0,5300.0,19,25,19045
198,-1,95.0,volvo,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134.0,5500.0,18,23,21485
199,-1,95.0,volvo,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106.0,4800.0,26,27,22470


Separar os dados em treinamento e teste:

In [4]:
x = df.drop('price', axis=1)
y = df['price'].copy()

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test =  train_test_split(x, y)

Criando experimento no mlflow:

In [6]:
mlflow.set_experiment('cars-prices')
mlflow.start_run()

<ActiveRun: >

Regressão Linear:

In [7]:
from sklearn.linear_model import LinearRegression

x_train = x_train.apply(pd.to_numeric, errors='coerce')
y_train = y_train.apply(pd.to_numeric, errors='coerce')

x_train.fillna(0, inplace=True)
y_train.fillna(0, inplace=True)
    
lr = LinearRegression()
lr.fit(x_train, y_train)

mlflow.sklearn.log_model(lr, 'lr')

ModelInfo(artifact_path='lr', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.8.10', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.0.2', 'serialization_format': 'cloudpickle'}}, model_uri='runs:/d96f40d04d2542cf914c1914da8975b0/lr', model_uuid='e30484b1e3d94e7f8d14d9eb205b2fe0', run_id='d96f40d04d2542cf914c1914da8975b0', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-05-01 22:43:50.819609')

In [8]:
x_test = x_test.apply(pd.to_numeric, errors='coerce')
y_test = y_test.apply(pd.to_numeric, errors='coerce')

x_test.fillna(0, inplace=True)
y_test.fillna(0, inplace=True)


lr.predict(x_test)

array([17018.62103924, 18311.69896314, 10286.50026913, 22975.93702492,
       11615.60780561, 22566.46825945, 10616.34548332, 15656.55645187,
        9774.92945558,  8669.41510581,  5722.86094941, 13748.40899263,
       23122.01168683, 18272.5217233 ,  8405.92730495, 24576.1907484 ,
       17008.00532243, 10548.00118644,  6395.3873312 , 14355.75436362,
        6584.70582104, 31650.73814073,  6808.31654304, 14410.56459629,
        6432.86304504, 31006.48825268,  6819.79609918,  7897.15510652,
        9633.39756581,  7455.52451733,  9200.48226749, 18203.04463685,
       19349.57394803,  9419.99728873, 14435.07163917,  9103.69540037,
        7057.22216451, 17826.4995421 , 11404.62581316, 11720.10245401,
        9179.24959103, 24230.23735595, 11645.24080368, 20161.82833608,
       14347.8522308 , 15593.7913808 , 40837.75140447, 15569.14499106,
        6839.92507433, 14410.56459629, 18311.69896314])

#### Metricas do modelo:

In [9]:
from sklearn.metrics import mean_squared_error, r2_score
import math

In [10]:
mse = mean_squared_error(y_test, lr.predict(x_test))
rmse = math.sqrt(mse)
r2 = r2_score(y_test, lr.predict(x_test))

Enviar as metricas para o mlflow:

In [11]:
mlflow.log_metric('mse', mse)
mlflow.log_metric('rmse', rmse)
mlflow.log_metric('r2', r2)

In [12]:
rmse

3272.7278327440144

In [13]:
r2

0.8342085608067982

In [14]:
mlflow.end_run()

### Construindo o modelo com XGBoost

In [15]:
!pip install xgboost



Importando as bibliotecas

In [16]:
import xgboost, pandas
from xgboost import XGBRFRegressor

  from pandas import MultiIndex, Int64Index


Criando o modelo e passando as metricas para o mlflow:

In [17]:
with mlflow.start_run():

    xgb = XGBRFRegressor(random_state=42)
    xgb.fit(x_train, y_train)
    mlflow.xgboost.log_model(xgb, 'xboost')
    
    xgb_predicted = xgb.predict(x_test)
    mse = mean_squared_error(y_test, xgb_predicted)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_test, xgb_predicted)
    
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
