# Proyecto Properatio: Creación y evaluación de un modelo predictivo

Creación de un modelo capaz de predecir valores en USD de propiedades de tipo Oficina y Local Comercial en CABA, Argentina <br><br>
Creado por: Adriana Villalobos

## 1. Importación de librerías y Carga del dataset

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import mlflow.sklearn
from mlflow.models.signature import infer_signature

In [58]:
# Load the cleaned dataset from disk and preview its first rows.
DATA_PATH = '../data/data_cleaned.csv'
df = pd.read_csv(DATA_PATH, sep=",")
df.head()

Unnamed: 0,neighborhood,rooms,bathrooms,surface_covered,property_type,operation_type,price_usd,price_per_m2,bathrooms_missing,rooms_missing
0,Once,2.0,1.0,20.0,Oficina,Venta,32000.0,1600.0,0,1
1,Flores,2.0,1.0,32.0,Oficina,Alquiler,514800.0,16088.0,0,1
2,Flores,2.0,1.0,46.0,Oficina,Alquiler,643500.0,13989.0,0,1
3,Palermo,2.0,2.0,70.0,Oficina,Alquiler,2316600.0,33094.0,0,1
4,Tribunales,2.0,1.0,40.0,Oficina,Venta,89000.0,2225.0,0,1


In [59]:
# MLflow is used to track how the different options perform throughout the
# process. The experiment name is kept exactly as originally registered.
EXPERIMENT_NAME = "Proyectio_Properatio"
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

<Experiment: artifact_location='file:///Users/cosmos/Adri/Developer/DataScience/Clases%20y%20Consignas/Proyecto%20Final/notebooks/mlruns/417185556600615154', creation_time=1762538423016, experiment_id='417185556600615154', last_update_time=1762538423016, lifecycle_stage='active', name='Proyectio_Properatio', tags={}>

## 2. One Hot Encoding para property_type y operation_type

In [60]:
# One-hot encode the two categorical columns. drop_first=True drops the first
# level of each, leaving a single indicator column per variable
# (property_type_Oficina and operation_type_Venta).
categorical_columns = ['property_type', 'operation_type']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
df.head()

Unnamed: 0,neighborhood,rooms,bathrooms,surface_covered,price_usd,price_per_m2,bathrooms_missing,rooms_missing,property_type_Oficina,operation_type_Venta
0,Once,2.0,1.0,20.0,32000.0,1600.0,0,1,True,True
1,Flores,2.0,1.0,32.0,514800.0,16088.0,0,1,True,False
2,Flores,2.0,1.0,46.0,643500.0,13989.0,0,1,True,False
3,Palermo,2.0,2.0,70.0,2316600.0,33094.0,0,1,True,False
4,Tribunales,2.0,1.0,40.0,89000.0,2225.0,0,1,True,True


## 3. Separación de dataframes para distintos modelos

La variable del barrio (`neighborhood`) se codificará con One Hot Encoding (OHE) para el modelo de `LinearRegression`.<br>
Para el modelo de RandomForest se usará `LabelEncoder` en esa columna.

In [61]:
from sklearn.preprocessing import LabelEncoder

### 3.1 LabelEncoder para RandomForest

In [62]:
# RandomForest variant: the neighborhood is replaced by integer codes produced
# by a LabelEncoder fitted on the whole column.
df_tree = df.copy()
le = LabelEncoder()
neighborhood_codes = le.fit_transform(df_tree['neighborhood'])
df_tree['neighborhood_encoded'] = neighborhood_codes
df_tree = df_tree.drop(columns=['neighborhood'])

# Both price columns are removed from the features; price_usd is the target here.
X_tree = df_tree.drop(columns=['price_usd', 'price_per_m2'])
y_tree = df_tree['price_usd']


In [63]:
# Persist the feature columns the tree model is trained on, in training order.
# Previously this dumped df.columns, which still holds the raw 'neighborhood'
# column and both price targets and lacks 'neighborhood_encoded', so the saved
# list did not match the model's inputs.
# NOTE(review): the fitted LabelEncoder `le` is not saved in this cell; anything
# that rebuilds 'neighborhood_encoded' at inference time will need it - confirm
# it is persisted elsewhere.
with open('../models/columns_labelEncoder.pkl', 'wb') as f:
    pickle.dump(X_tree.columns.tolist(), f)

### 3.2 OHE para LinearRegression

In [64]:
# LinearRegression variant: the neighborhood is one-hot encoded, dropping the
# first level.
df_linear = pd.get_dummies(df, columns=['neighborhood'], drop_first=True)

# Both price columns are removed from the features; price_usd is the target here.
price_columns = ['price_usd', 'price_per_m2']
X_linear = df_linear.drop(columns=price_columns)
y_linear = df_linear['price_usd']

In [65]:
# Normalise column names so they stay consistent: spaces become "_", "/" is
# removed and everything is lower-cased.
def _normalize_column_names(columns):
    """Return *columns* with spaces replaced by "_", "/" removed, lower-cased."""
    return (
        columns.str.replace(' ', '_', regex=False)
        .str.replace('/', '', regex=False)
        .str.lower()
    )


df.columns = _normalize_column_names(df.columns)

# The one-hot neighborhood columns (e.g. 'neighborhood_Villa Urquiza') live in
# df_linear / X_linear, not in df. Normalising only df left those names
# untouched, so the same rule is applied to the linear-model frames as well.
# NOTE(review): assumes no two neighborhood names collapse to the same
# normalised name - confirm against the data.
df_linear.columns = _normalize_column_names(df_linear.columns)
X_linear.columns = _normalize_column_names(X_linear.columns)

In [66]:
# Display the current column names of df.
df.columns

Index(['neighborhood', 'rooms', 'bathrooms', 'surface_covered', 'price_usd',
       'price_per_m2', 'bathrooms_missing', 'rooms_missing',
       'property_type_oficina', 'operation_type_venta'],
      dtype='object')

In [67]:
# Persist the feature columns of the one-hot (linear) model, in training order.
# Previously this dumped df.columns, which has no one-hot neighborhood columns
# and still includes both price targets. The list is derived from df_linear
# with the same drop that builds the training features.
ohe_feature_columns = df_linear.drop(columns=['price_usd', 'price_per_m2']).columns.tolist()
with open('../models/columns_OHE.pkl', 'wb') as f:
    pickle.dump(ohe_feature_columns, f)

### 3.3 Separación de Train/Test

In [68]:
# Feature matrices and targets used by the models below. Both price columns are
# excluded from the features.
# NOTE(review): the target here is price_per_m2, whereas the earlier
# X_tree/y_tree and X_linear/y_linear use price_usd and the metrics printed
# later are labelled USD - confirm which target is intended.
excluded_columns = ['price_usd', 'price_per_m2']
target_column = 'price_per_m2'

X_tree_reg = df_tree.drop(columns=excluded_columns)
X_linear_reg = df_linear.drop(columns=excluded_columns)

y_tree_reg = df_tree[target_column]
y_linear_reg = df_linear[target_column]

In [69]:
# Record the (rows, columns) shape of each feature matrix as MLflow parameters.
tree_shape = X_tree_reg.shape
linear_shape = X_linear_reg.shape
mlflow.log_param("Tamaño dataset_3_tree", tree_shape)
mlflow.log_param("Tamaño dataset_3_linear", linear_shape)

(21778, 45)

Para medir el rendimiento según el porcentaje asignado a test, lo guardo como parámetro de MLflow. <br>
Inicialmente separo un 70% de los datos para entrenamiento y un 30% para test.

In [70]:
# Seed and hold-out fraction reused by the split and the models below; both are
# logged to MLflow so runs can be compared.
RANDOM_STATE = 42
TEST_SIZE = 0.3
mlflow.log_param("Tamaño de Test", TEST_SIZE)
mlflow.log_param("Random state", RANDOM_STATE)

42

In [71]:
from sklearn.model_selection import train_test_split

# Both datasets are split with the same test fraction and the same seed.
split_options = {"test_size": TEST_SIZE, "random_state": RANDOM_STATE}

X_tree_train, X_tree_test, y_tree_train, y_tree_test = train_test_split(
    X_tree_reg, y_tree_reg, **split_options
)
X_linear_train, X_linear_test, y_linear_train, y_linear_test = train_test_split(
    X_linear_reg, y_linear_reg, **split_options
)

Por la alta varianza de los precios, transformaremos el target con log(1 + x) (`np.log1p`).

In [72]:

# --- Target transformation ---
# np.log1p computes log(1 + y); np.expm1 is used later to undo it.
y_tree_train_log, y_tree_test_log = np.log1p(y_tree_train), np.log1p(y_tree_test)
y_linear_train_log, y_linear_test_log = np.log1p(y_linear_train), np.log1p(y_linear_test)

## 4. Creación de modelos de regresión

### 4.1 Modelo de Regresión Lineal de sklearn

In [73]:
from sklearn.linear_model import LinearRegression

# Create the model and fit it against the log-transformed target.
model_linreg = LinearRegression().fit(X_linear_train, y_linear_train_log)

# Predictions are still on the logarithmic scale at this point.
y_linear_pred_log = model_linreg.predict(X_linear_test)

# Undo log1p on both the ground truth and the predictions so they are compared
# in the original domain.
y_linear_true = np.expm1(y_linear_test_log)
y_linear_pred = np.expm1(y_linear_pred_log)

In [74]:
import sklearn.metrics as metrics

# Error metrics for the linear model, computed in the original (non-log) domain.
mse = metrics.mean_squared_error(y_linear_true, y_linear_pred)
r2 = metrics.r2_score(y_linear_true, y_linear_pred)
rmse = metrics.root_mean_squared_error(y_linear_true, y_linear_pred)
mae = metrics.mean_absolute_error(y_linear_true, y_linear_pred)

# Console report, rounded to four decimals.
for label, value in (("r2 ", r2), ("mse: ", mse), ("rmse: ", rmse), ("mae: ", mae)):
    print(label, round(value, 4))

# The same four metrics are logged to MLflow.
for metric_name, value in (
    ("Mean Squared Error", mse),
    ("R2", r2),
    ("Root Mean Squared Error", rmse),
    ("Mean Absolute Error", mae),
):
    mlflow.log_metric(metric_name, value)

# --- Coefficients and interpretation ---
# Coefficients sorted from largest to smallest; head/tail show the 10 largest
# and the 10 smallest, whatever their sign.
coefs = pd.Series(model_linreg.coef_, index=X_linear_train.columns)
coefs = coefs.sort_values(ascending=False)
print("\nTop 10 variables con coeficiente positivo:")
print(coefs.head(10))
print("\nTop 10 variables con coeficiente negativo:")
print(coefs.tail(10))

r2  -0.0049
mse:  93777890674.823
rmse:  306231.7597
mae:  25595.4406

Top 10 variables con coeficiente positivo:
neighborhood_Villa Urquiza    0.389613
bathrooms_missing             0.140000
neighborhood_Colegiales       0.121989
neighborhood_Boedo            0.077388
neighborhood_Chacarita        0.071914
neighborhood_Belgrano         0.004873
surface_covered              -0.000654
neighborhood_Congreso        -0.024571
neighborhood_Tribunales      -0.042137
rooms                        -0.042344
dtype: float64

Top 10 variables con coeficiente negativo:
rooms_missing                -0.796364
neighborhood_Boca            -0.881654
neighborhood_Nuñez           -1.029539
property_type_Oficina        -1.084878
operation_type_Venta         -1.120557
neighborhood_Retiro          -1.223690
neighborhood_Saavedra        -1.330400
neighborhood_Barracas        -1.368010
neighborhood_Puerto Madero   -3.336538
neighborhood_Catalinas       -3.656753
dtype: float64


### 4.2 Creación de modelo con RandomForest + ajuste de hiperparámetros (usando grid search)

In [75]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator handed to the grid search; seeded with RANDOM_STATE.
model_rf = RandomForestRegressor(random_state=RANDOM_STATE)

In [76]:
# Hyper-parameter grid: 3 * 3 * 2 * 2 * 2 = 72 combinations, each evaluated
# with 5-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# Candidates are scored by negative RMSE; the target is the log-transformed one,
# so that score is on the log scale.
grid_search = GridSearchCV(
    model_rf,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_tree_train, y_tree_train_log)




0,1,2
,estimator,RandomForestR...ndom_state=42)
,param_grid,"{'max_depth': [10, 20, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], ...}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'log2'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [77]:
# Predictions from the grid search (still on the logarithmic scale).
y_tree_pred_log = grid_search.predict(X_tree_test)

# Invert log1p on both ground truth and predictions so they are compared in the
# original domain.
y_tree_true = np.expm1(y_tree_test_log)
y_tree_pred = np.expm1(y_tree_pred_log)

# --- Metrics in the original domain ---
# NOTE(review): the labels below say USD, but the target split earlier is
# price_per_m2 - confirm the intended unit.
r2 = metrics.r2_score(y_tree_true, y_tree_pred)
mae = metrics.mean_absolute_error(y_tree_true, y_tree_pred)
mse = metrics.mean_squared_error(y_tree_true, y_tree_pred)
rmse = np.sqrt(mse)

report_lines = [
    f"Mejores parámetros: {grid_search.best_params_}",
    f"R² real: {r2:.4f}",
    f"RMSE (USD): {rmse:,.0f}",
    f"MAE (USD): {mae:,.0f}",
]
for report_line in report_lines:
    print(report_line)

Mejores parámetros: {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}
R² real: -0.0002
RMSE (USD): 305,519
MAE (USD): 21,658


Analizo la importancia de las variables cuyo nombre contiene “desconocido” para ver si alguna domina el modelo

In [78]:
# Feature importances of the best forest found by the grid search, indexed by
# training column name.
best_rf = grid_search.best_estimator_
importances = pd.Series(best_rf.feature_importances_, index=X_tree_train.columns)

# Keep only features whose name contains 'desconocido', largest first.
# NOTE(review): the recorded output is an empty Series, i.e. no tree feature
# name contains 'desconocido' - confirm this filter is still meaningful.
unknown_mask = importances.index.str.contains('desconocido')
importances.loc[unknown_mask].sort_values(ascending=False)

Series([], dtype: float64)