In [246]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score, mean_absolute_percentage_error, explained_variance_score, max_error
pd.set_option('display.max_columns', None)
from IPython.display import display
import lightgbm as lgb
import numpy as np
from sklearn.multioutput import MultiOutputRegressor

data = pd.read_csv("./preprocessed_data.csv")
data.drop_duplicates(subset="link")

Unnamed: 0.1,Unnamed: 0,title,home_type,link,garage,price,description,home_size,home_area,floor,elevator,price_per_sqr_meter,city,source,neighborhood,price_per_sqr_meter_bin,price_bin,home_area_bin,street_names
0,0,apartamento t0 rua eleuterio teixeira 8 capari...,apartamento,https://www.idealista.pt/imovel/33394837/,No,1000,"Estúdio com muita luz, casa de banho com duche...",T0,50,0.0,False,20.000000,Almada,idealista,almada,"(18.686, 23.248]","(355.38, 1364.8]","(42.92, 71.84]",almada
1,1,apartamento t0 sao joao caparica costa caparica,apartamento,https://www.idealista.pt/imovel/33182313/,No,1250,"Apartamento T0, com vista de Mar.\r\nNão perca...",T0,60,10.0,True,20.833333,Almada,idealista,costa caparica,"(18.686, 23.248]","(355.38, 1364.8]","(42.92, 71.84]",costa caparica
2,2,apartamento t0 sao joao caparica costa caparica,apartamento,https://www.idealista.pt/imovel/33225472/,No,1200,Não perca a oportunidade de viver num apartame...,T0,57,6.0,True,21.052632,Almada,idealista,costa caparica,"(18.686, 23.248]","(355.38, 1364.8]","(42.92, 71.84]",costa caparica
3,3,apartamento t0 avenida liberdade 5 caparica tr...,apartamento,https://www.idealista.pt/imovel/33132721/,No,1200,"Excelente apartamento t1 na Trafaria, a 2 minu...",T0,50,1.0,False,24.000000,Almada,idealista,almada,"(23.248, 27.81]","(355.38, 1364.8]","(42.92, 71.84]",almada
4,4,apartamento t1 almada,apartamento,https://www.idealista.pt/imovel/33394381/,No,918,Apartamento de tipologia T1 em prédio totalmen...,T1,55,0.0,False,16.690909,Almada,idealista,apartamento t1 almada,"(14.124, 18.686]","(355.38, 1364.8]","(42.92, 71.84]",apartamento t1 almada
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7322,7322,apartamento t3 rua senhora graca repeses sao s...,apartamento,https://www.idealista.pt/imovel/33389978/,No,750,Apartamento com 3 quartos com moveis\r\nCozinh...,T3,110,2.0,False,6.818182,Viseu,idealista,viseu,"(4.886, 9.562]","(355.38, 1364.8]","(100.76, 129.68]",viseu
7323,7323,apartamento t3 avenida doutor antonio jose alm...,apartamento,https://www.idealista.pt/imovel/30425307/,No,880,Apartamento no centro de Viseu no último andar...,T3,140,4.0,True,6.285714,Viseu,idealista,viseu,"(4.886, 9.562]","(355.38, 1364.8]","(129.68, 158.6]",avenida doutor antonio jose
7324,7324,apartamento t3 avenida belgica 42 aguieira s...,apartamento,https://www.idealista.pt/imovel/33355326/,No,700,,T3,86,1.0,False,8.139535,Viseu,idealista,viseu,"(4.886, 9.562]","(355.38, 1364.8]","(71.84, 100.76]",santiago viseu
7325,7325,apartamento t3 rua antonio gois guerreiro 185 ...,apartamento,https://www.idealista.pt/imovel/33358685/,No,1200,,T3,100,5.0,True,12.000000,Viseu,idealista,viseu,"(9.562, 14.124]","(355.38, 1364.8]","(71.84, 100.76]",viseu


In [247]:
selected_features = ["home_type", "garage", "home_size", "floor", "elevator", "city", "neighborhood"]
target = ["price", "home_area"]

data = data[selected_features + target]
data.head(5)

Unnamed: 0,home_type,garage,home_size,floor,elevator,city,neighborhood,price,home_area
0,apartamento,No,T0,0.0,False,Almada,almada,1000,50
1,apartamento,No,T0,10.0,True,Almada,costa caparica,1250,60
2,apartamento,No,T0,6.0,True,Almada,costa caparica,1200,57
3,apartamento,No,T0,1.0,False,Almada,almada,1200,50
4,apartamento,No,T1,0.0,False,Almada,apartamento t1 almada,918,55


In [248]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
from typing import List

def process_columns(df: pd.DataFrame, 
                    numeric_cols: List[str], 
                    categorical_cols: List[str], 
                    ordinal_cols: List[str],
                    preprocess_numeric = False) -> pd.DataFrame:
    """
    Process columns of a DataFrame based on their type: numeric, categorical, and ordinal.
    
    Parameters:
    - df (pd.DataFrame): The DataFrame to process.
    - numeric_cols (List[str]): List of column names to treat as numeric.
    - categorical_cols (List[str]): List of column names to treat as categorical.
    - ordinal_cols (List[str]): List of column names to treat as ordinal.
    
    Returns:
    - pd.DataFrame: A new DataFrame with processed columns.
    """

    # Process numeric columns - normalize using MinMaxScaler
    df_numeric = df[numeric_cols] # Empty DataFrame if no numeric columns
    if preprocess_numeric:
        if numeric_cols:  # Check if there are any numeric columns
            scaler = MinMaxScaler()  # Initialize the MinMaxScaler
            df_numeric = pd.DataFrame(scaler.fit_transform(df[numeric_cols]),
                                    columns=numeric_cols,
                                    index=df.index)

    # Process categorical columns with OneHotEncoder
    if categorical_cols:  # Check if there are any categorical columns
        ohe = OneHotEncoder(sparse_output=False)  # Initialize OneHotEncoder with sparse_output set to False
        df_categorical = pd.DataFrame(ohe.fit_transform(df[categorical_cols]),
                                      columns=ohe.get_feature_names_out(),
                                      index=df.index)
    else:
        df_categorical = pd.DataFrame(index=df.index)  # Empty DataFrame if no categorical columns
    
    # Process ordinal columns with OrdinalEncoder
    if ordinal_cols:  # Check if there are any ordinal columns
        oe = OrdinalEncoder()  # Initialize OrdinalEncoder
        df_ordinal = pd.DataFrame(oe.fit_transform(df[ordinal_cols]),
                                  columns=ordinal_cols,
                                  index=df.index)
    else:
        df_ordinal = pd.DataFrame(index=df.index)  # Empty DataFrame if no ordinal columns
    
    # Concatenate all processed dataframes
    df_processed = pd.concat([df_numeric, df_categorical, df_ordinal], axis=1)
    
    return df_processed



In [249]:
numeric_columns = ["price", "elevator", "home_area"] # include bool
categorical_columns = ["home_size", "home_type", "city", "neighborhood", "garage"]
ordinal_columns = []

processed_df = process_columns(data, numeric_columns, categorical_columns, ordinal_columns, preprocess_numeric= True)
processed_df.head(5)

Unnamed: 0,price,elevator,home_area,home_size_T0,home_size_T1,home_size_T2,home_size_T3,home_size_T4,home_size_T5,home_size_T6,home_type_andar,home_type_apartamento,home_type_casa,home_type_duplex,home_type_moradia,home_type_penthouse,home_type_solar,home_type_nan,city_Almada,city_Aveiro,city_Cascais,city_Coimbra,city_Gondomar,city_Guimarães,city_Leiria,city_Lisboa,city_Loures,city_Maia,city_Matosinhos,city_Porto,city_Setúbal,city_Sintra,city_Viana do Castelo,city_Vila Nova de Gaia,city_Viseu,neighborhood_agualva mira sintra,neighborhood_ajuda,neighborhood_alcabideche,neighborhood_alcantara,neighborhood_aldoar foz douro nevogilde,neighborhood_algueirao mem martins,neighborhood_almada,neighborhood_almada cova piedade pragal cacilhas,neighborhood_alvalade,neighborhood_apartamento t1 almada,neighborhood_apartamento t2 almada,neighborhood_apartamento t3 almada,neighborhood_arcozelo,neighborhood_areeiro,neighborhood_arroios,neighborhood_aveiro,neighborhood_avenidas,neighborhood_azeitao,neighborhood_belem,neighborhood_benfica,neighborhood_bonfim,neighborhood_cacem sao marcos,neighborhood_camarate unhos apelacao,neighborhood_campanha,neighborhood_campo ourique,neighborhood_campolide,neighborhood_canelas,neighborhood_canidelo,neighborhood_carcavelos parede,neighborhood_carnide,neighborhood_cascais estoril,neighborhood_cedofeita santo ildefonso miragaia sao nicolau vitoria,neighborhood_charneca caparica sobreda,neighborhood_cidade coimbra,neighborhood_cidade maia,neighborhood_coimbra,neighborhood_colares,neighborhood_costa caparica,neighborhood_custoias leca balio guifoes,neighborhood_estrela,neighborhood_gloria vera cruz,neighborhood_gondomar,neighborhood_grijo sermonde,neighborhood_guimaraes,neighborhood_gulpilhares valadares,neighborhood_laranjeiro feijo,neighborhood_leiria,neighborhood_lisboa,neighborhood_lordelo ouro massarelos,neighborhood_loures,neighborhood_lumiar,neighborhood_madalena,neighborhood_mafamude vilar paraiso,neighborhood_maia,neighborhood_massama monte abraao,neighborhood_matosinhos,neighborhood_matosinhos leca palmeira,neighborhood_misericordia,neighborhood_oliveira douro,neighborhood_paranhos,neighborhood_parque nacoes,neighborhood_pedroso seixezelo,neighborhood_penha franca,neighborhood_queluz belas,neighborhood_ramalde,neighborhood_rio mouro,neighborhood_rio tinto,neighborhood_sandim olival lever crestuma,neighborhood_santa maria,neighborhood_santa marinha sao pedro afurada,neighborhood_santo antonio,neighborhood_santo antonio olivais,neighborhood_sao cosme valbom jovim,neighborhood_sao domingos benfica,neighborhood_sao domingos rana,neighborhood_sao felix marinha,neighborhood_sao mamede infesta senhora hora,neighborhood_sao sebastiao,neighborhood_sao vicente,neighborhood_setubal,neighborhood_sintra,neighborhood_viana castelo,neighborhood_viana castelo meadela,neighborhood_vila gaia,neighborhood_vilar andorinho,neighborhood_viseu,garage_No,garage_Yes
0,0.025183,0.0,0.049793,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.035337,1.0,0.063624,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.033306,1.0,0.059474,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.033306,0.0,0.049793,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.021852,0.0,0.056708,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [250]:
print(processed_df.isna().sum())
processed_df.dropna(inplace= True)

price                           0
elevator                        0
home_area                       0
home_size_T0                    0
home_size_T1                    0
                               ..
neighborhood_vila gaia          0
neighborhood_vilar andorinho    0
neighborhood_viseu              0
garage_No                       0
garage_Yes                      0
Length: 123, dtype: int64


In [251]:
X_train, X_test, y_train, y_test = train_test_split(processed_df.drop(columns=target), 
                                                    processed_df[target], 
                                                    test_size=0.2, 
                                                    random_state=42)

In [252]:


model = lgb.LGBMRegressor()
ovr = MultiOutputRegressor(model)
ovr.fit(X_train, y_train)
Y_pred_ovr = ovr.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000213 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 5861, number of used features: 80
[LightGBM] [Info] Start training from score 0.079214
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 5861, number of used features: 80
[LightGBM] [Info] Start training from score 0.133810


In [253]:
def regression_scores(y_test, y_pred):
    scores = {}
    scores['mean_squared_error'] = mean_squared_error(y_test, y_pred)
    scores['root_mean_squared_error'] = mean_squared_error(y_test, y_pred, squared=False)
    scores['mean_absolute_error'] = mean_absolute_error(y_test, y_pred)
    scores['median_absolute_error'] = median_absolute_error(y_test, y_pred)
    scores['r2_score'] = r2_score(y_test, y_pred)
    scores['mean_absolute_percentage_error'] = mean_absolute_percentage_error(y_test, y_pred)
    scores['explained_variance_score'] = explained_variance_score(y_test, y_pred)
    # scores['max_error'] = max_error(y_test, y_pred)

    return scores

regression_scores(Y_pred_ovr, y_test)



{'mean_squared_error': 0.0024221947695826085,
 'root_mean_squared_error': 0.04917534469533243,
 'mean_absolute_error': 0.027173063092894884,
 'median_absolute_error': 0.014855565993509016,
 'r2_score': 0.60648277113667,
 'mean_absolute_percentage_error': 0.2618851319346094,
 'explained_variance_score': 0.6069454197641897}

In [254]:
from sklearn.multioutput import RegressorChain

model = lgb.LGBMRegressor()
chain = RegressorChain(base_estimator=model, order=[0, 1])
chain.fit(X_train, y_train)
Y_pred_ovr = chain.predict(X_test)
regression_scores(Y_pred_ovr, y_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 160
[LightGBM] [Info] Number of data points in the train set: 5861, number of used features: 80
[LightGBM] [Info] Start training from score 0.079214
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 370
[LightGBM] [Info] Number of data points in the train set: 5861, number of used features: 81
[LightGBM] [Info] Start training from score 0.133810




{'mean_squared_error': 0.0026041398946440994,
 'root_mean_squared_error': 0.050895364387342525,
 'mean_absolute_error': 0.027968985329874747,
 'median_absolute_error': 0.01526659123681808,
 'r2_score': 0.6055495696500928,
 'mean_absolute_percentage_error': 0.26689556334573916,
 'explained_variance_score': 0.6061286221258061}

In [255]:
print(Y_pred_ovr, y_test)

[[0.04015966 0.08532018]
 [0.06722833 0.03868948]
 [0.18669076 0.29394704]
 ...
 [0.03563446 0.03544253]
 [0.08438147 0.12862108]
 [0.07312454 0.11812466]]          price  home_area
3163  0.033306   0.091286
1562  0.029245   0.035961
4482  0.187652   0.291840
3721  0.106418   0.284924
3688  0.071893   0.123098
...        ...        ...
527   0.106418   0.117566
4161  0.063769   0.165975
5079  0.015435   0.009682
565   0.053615   0.118949
4508  0.073924   0.117566

[1466 rows x 2 columns]
