In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow

from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above, so API removals will only surface as hard errors.
warnings.filterwarnings('ignore')

In [9]:
# Load the processed training split and preview its first five rows.
df = pd.read_parquet(path='../data/processed/train.parquet')
df.head(n=5)

Unnamed: 0,price,energy_certify,metric,description,location,rooms,company,property_type,district,bathroom,condition
0,3000,D,96.0,"Apartamento T3, próximo da NOVA, para arrendar...","Carcavelos e Parede, Cascais, Lisboa",3,,apartamento,Lisboa,2.0,Usado
1,1400,Isento / Em Trâmite,74.0,Novidade - ARRENDAMENTO - T3 Porto Baixa,"Bonfim, Porto",3,,apartamento,Porto,2.0,Usado
2,2650,B,127.0,T3 condomínio fechado | Benfica,"Benfica, Lisboa",3,,apartamento,Lisboa,2.0,Renovado
3,8500,D,245.0,Moradia T5 para arrendamento,"Oeiras e São Julião da Barra, Paço de Arcos e ...",5,RE/MAX Siimgroup,moradia,Lisboa,4.0,Novo
4,1500,C,50.0,Arrendamento Férias - Apartamento T1 no Alto d...,"Portimão, Faro",1,"Abracadabra - Mediação Imobiliária Unipessoal,...",apartamento,Faro,1.0,Usado


In [15]:
# Load the processed validation split and preview its first five rows.
df_val = pd.read_parquet(path='../data/processed/val.parquet')
df_val.head(n=5)

Unnamed: 0,price,energy_certify,metric,description,location,rooms,company,property_type,district,bathroom,condition
0,900,A+,90.0,Apartamento para alugar,"Queluz e Belas, Sintra, Lisboa",2,OLX,apartamento,Lisboa,2.0,Renovado
1,1950,A,81.7,Apartamento T2 para arrendamento,"Alcântara, Lisboa",2,Maxgroup,apartamento,Lisboa,1.0,Usado
2,1200,E,140.0,Moradia T3 mobilado,"Campanhã, Porto",3,OLX,moradia,Porto,2.0,Novo
3,800,Isento / Em Trâmite,100.0,Arrenda-se T2 Praia Areia Branca Vista mar,"Lourinhã e Atalaia, Lourinhã, Lisboa",2,OLX,apartamento,Lisboa,1.0,Usado
4,2000,E,160.0,Moradia em Alcochete para arrendar,"Alcochete, Setúbal",4,Casafácil - Mediação Imobiliária,moradia,Setúbal,3.0,Renovado


In [6]:
# Address of the MLflow tracking server this notebook reports to
# (loopback interface, port 5000).
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"

In [7]:
# Point the MLflow client at the tracking server configured above.
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)

In [8]:
# Select the experiment that subsequent runs are recorded under; the returned
# Experiment object is displayed as the cell output.
mlflow.set_experiment(experiment_name='portugal-rent-price')

2023/09/18 17:58:46 INFO mlflow.tracking.fluent: Experiment with name 'portugal-rent-price' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1695056326392, experiment_id='1', last_update_time=1695056326392, lifecycle_stage='active', name='portugal-rent-price', tags={}>

In [10]:
# Baseline estimator: ordinary least squares with its default settings.
model = LinearRegression(fit_intercept=True)

In [11]:
# Feature matrix (three numeric columns) and target vector for training.
X_train = df.loc[:, ['metric', 'rooms', 'bathroom']]
y_train = df.loc[:, 'price']

# Show both shapes, one per line, as a quick sanity check.
print(X_train.shape, y_train.shape, sep='\n')


(2091, 3)
(2091,)


In [13]:
# Fit the baseline estimator on the training features and target.
model.fit(X=X_train, y=y_train)

In [17]:
# Predict prices for the validation split using the same three feature
# columns the model was trained on.
pred = model.predict(
    df_val.loc[:, ['metric', 'rooms', 'bathroom']]
)

In [19]:
# Root-mean-squared error of the baseline predictions on the validation split.
# The square root is taken explicitly rather than via `squared=False`, which
# is deprecated in newer scikit-learn releases; the value is the same.
np.sqrt(mean_squared_error(df_val['price'], pred))

1757.807130498822

In [21]:
# Track one baseline run: fit a plain linear regression on the training
# features and record its validation error in MLflow.
with mlflow.start_run():
    mlflow.set_tag('developer', "marcospaulo")

    mlflow.log_param('train-data-path', '../data/processed/train.parquet')

    # LinearRegression has no regularisation strength, so no `alpha` is
    # logged for this run; an unused value would mislabel it.
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    # Take the validation columns from X_train so both frames always use the
    # same features in the same order.
    y_pred = lr_model.predict(df_val[X_train.columns])

    # Score the predictions made by the model trained in this run (`y_pred`),
    # not the `pred` array left over from an earlier cell.
    # sqrt(MSE) is the RMSE; computing it explicitly avoids `squared=False`,
    # which is deprecated in newer scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(df_val['price'], y_pred)))

    # An evaluation result is a metric, not a parameter: logging it as a
    # metric lets runs be compared and sorted by it.
    mlflow.log_metric('rmse', rmse)