# Proyecto Properatio: Creación y evaluación de modelo predictivo

Creación de un modelo capaz de predecir valores en USD de propiedades tipo Oficinas y Locales Comerciales en CABA, Argentina <br><br>
Creado por: Adriana Villalobos

## 1. Importación de librerías y Carga del dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import mlflow.sklearn
from mlflow.models.signature import infer_signature

In [2]:
import os
from pathlib import Path

# Capture the starting directory first; the decision below is based on it.
current_dir = Path.cwd()
print("Actual directorio de trabajo:", os.getcwd())

# When the kernel starts inside notebooks/, move one level up so that relative
# paths (data/, models/, mlruns/) resolve outside the notebooks folder.
if current_dir.name != "notebooks":
    pass  # already outside notebooks/; nothing to change
else:
    os.chdir(current_dir.parent)

print("Nuevo directorio de trabajo:", os.getcwd())

Actual directorio de trabajo: /Users/cosmos/Adri/Developer/DataScience/Clases y Consignas/Proyecto Final/notebooks
Nuevo directorio de trabajo: /Users/cosmos/Adri/Developer/DataScience/Clases y Consignas/Proyecto Final


In [3]:
# Load the cleaned dataset (comma-separated, which is the read_csv default);
# the path is relative to the current working directory.
df = pd.read_csv('data/data_cleaned.csv')
df.head(5)

Unnamed: 0,neighborhood,rooms,bathrooms,surface_covered,property_type,operation_type,price_usd,price_per_m2,bathrooms_missing,rooms_missing
0,Once,2.0,1.0,20.0,Oficina,Venta,32000.0,1600.0,0,1
1,Flores,2.0,1.0,32.0,Oficina,Alquiler,514800.0,16088.0,0,1
2,Flores,2.0,1.0,46.0,Oficina,Alquiler,643500.0,13989.0,0,1
3,Palermo,2.0,2.0,70.0,Oficina,Alquiler,2316600.0,33094.0,0,1
4,Tribunales,2.0,1.0,40.0,Oficina,Venta,89000.0,2225.0,0,1


In [4]:
# Row count per operation type
df.operation_type.value_counts()

operation_type
Alquiler    12024
Venta        9754
Name: count, dtype: int64

In [33]:
# Row count per property type.
# NOTE(review): the saved execution count (33) shows this cell was last run out of
# order; the saved counts add up to the 'Alquiler' row count, i.e. they appear to
# come from the already-filtered df. Re-run top to bottom to refresh this output.
df.property_type.value_counts()

property_type
Oficina            6650
Local comercial    5374
Name: count, dtype: int64

In [5]:
# Restrict the dataset to a single operation type. After filtering, the
# 'operation_type' column holds one value only, so it is dropped.
# Note: this overwrites df, so the cell cannot be re-run without reloading the data.
t = 'Alquiler'
df = df.loc[df['operation_type'] == t].drop(columns=['operation_type'])

In [6]:
# Random peek at five rows (no seed, so the rows differ between runs)
df.sample(n=5)

Unnamed: 0,neighborhood,rooms,bathrooms,surface_covered,property_type,price_usd,price_per_m2,bathrooms_missing,rooms_missing
3269,Boedo,1.0,1.0,34.0,Oficina,806000.0,23706.0,0,0
4555,San Nicolás,2.0,6.0,962.0,Oficina,28830.0,30.0,0,1
12011,Belgrano,2.0,2.0,150.0,Oficina,24190000.0,161267.0,0,1
17343,Monserrat,1.0,1.0,35.0,Local comercial,1143000.0,32657.0,0,1
19866,Almagro,1.0,1.0,73.0,Local comercial,1335000.0,18288.0,0,1


In [7]:
# Track the performance of the different options tried along the process with
# MLflow, using a local file store in ./mlruns under the current working directory.
#
# Path.as_uri() builds a well-formed file URI: it percent-encodes spaces and other
# special characters and yields file:///C:/... on Windows, where the previous
# f"file://{os.getcwd()}/mlruns" string was malformed.
mlflow.set_tracking_uri(Path.cwd().joinpath("mlruns").as_uri())
mlflow.set_experiment(experiment_name="Proyectio_Properatio")

<Experiment: artifact_location='file:///Users/cosmos/Adri/Developer/DataScience/Clases%20y%20Consignas/Proyecto%20Final/notebooks/mlruns/417185556600615154', creation_time=1762538423016, experiment_id='417185556600615154', last_update_time=1762538423016, lifecycle_stage='active', name='Proyectio_Properatio', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [8]:
# Record which operation type this run was trained on.
# NOTE(review): the key reads 'Type os transaction' (probably meant 'of'); it is
# kept unchanged so the parameter name stays the same as in already-logged runs.
mlflow.log_param(key='Type os transaction', value=t)

'Alquiler'

## 2. One Hot Encoding para property_type y operation_type

In [None]:
# Disabled: 'operation_type' was already filtered to a single value and dropped
# from df in an earlier cell, so there is nothing left to one-hot encode here
# (running these lines raises the KeyError shown in the saved output).
# df = pd.get_dummies(df, columns=['operation_type'], drop_first=True)
# df.sample(5)

KeyError: "None of [Index(['operation_type'], dtype='object')] are in the [columns]"

## 3. Separación de datos: df_oficina y df_local_comercial

In [10]:
# One frame per property type. 'property_type' is constant inside each subset,
# so it is removed right after filtering.
df_oficina = df.loc[df['property_type'] == 'Oficina'].copy().drop(columns=['property_type'])
df_local = df.loc[df['property_type'] == 'Local comercial'].copy().drop(columns=['property_type'])

df_local.head(n=3)

Unnamed: 0,neighborhood,rooms,bathrooms,surface_covered,price_usd,price_per_m2,bathrooms_missing,rooms_missing
15,Palermo,1.0,1.0,21.0,300300.0,14300.0,0,1
17,Balvanera,1.0,1.0,40.0,686400.0,17160.0,0,1
18,Chacarita,1.0,2.0,130.0,2145000.0,16500.0,0,1


### 3.1 Separación de dataframes para distintos modelos

La variable del barrio (`neighborhood`) se va a pasar por OHE para el modelo de LinearRegression<br>
Para el de RandomForest se usará LabelEncoder en esa columna

In [11]:
from sklearn.preprocessing import LabelEncoder

### 3.1 LabelEncoder para RandomForest

In [12]:
# RandomForest version: 'neighborhood' is replaced by integer codes.
df_oficina_tree = df_oficina.copy()
df_local_tree = df_local.copy()

# One encoder per dataset. Refitting a single shared encoder on the second
# dataset overwrites the mapping learned on the first one, so the Oficina codes
# could no longer be reproduced for new data or inverted back to names.
le_oficina = LabelEncoder()
le_local = LabelEncoder()

df_oficina_tree['neighborhood_encoded'] = le_oficina.fit_transform(df_oficina_tree['neighborhood'])
df_oficina_tree = df_oficina_tree.drop(columns=['neighborhood'])

df_local_tree['neighborhood_encoded'] = le_local.fit_transform(df_local_tree['neighborhood'])
df_local_tree = df_local_tree.drop(columns=['neighborhood'])

# Backward-compatible alias: `le` used to end up fitted on the Local comercial data.
le = le_local

# Features / target (total price in USD) for each property type.
X_oficina_tree = df_oficina_tree.drop(columns=['price_usd', 'price_per_m2'])
y_oficina_tree = df_oficina_tree['price_usd']

X_local_tree = df_local_tree.drop(columns=['price_usd', 'price_per_m2'])
y_local_tree = df_local_tree['price_usd']


In [13]:
# Create the output folder if needed so the cell also works on a fresh checkout
# (open(..., 'wb') fails when the parent directory does not exist).
os.makedirs('models', exist_ok=True)

# Persist the column names as a plain list.
# NOTE(review): this stores the columns of `df` (before the property-type split and
# before label encoding), not those of the encoded tree datasets — confirm that
# this is what the consumer of columns_labelEncoder.pkl expects.
with open('models/columns_labelEncoder.pkl', 'wb') as f:
    pickle.dump(df.columns.tolist(), f)

### 3.2 OHE para LinearRegressor

In [14]:
# LinearRegression version: work on independent copies so the one-hot
# preprocessing below leaves df_oficina / df_local untouched.
df_oficina_linear, df_local_linear = df_oficina.copy(), df_local.copy()

In [15]:
# Inspect the column names available before one-hot encoding 'neighborhood'
df_oficina_linear.columns


Index(['neighborhood', 'rooms', 'bathrooms', 'surface_covered', 'price_usd',
       'price_per_m2', 'bathrooms_missing', 'rooms_missing'],
      dtype='object')

In [16]:
# Normalise the neighborhood values so the dummy column names derived from them
# are consistent: spaces become '_', '/' becomes 'o', and everything is lowercased.
# The same pipeline is applied to both datasets.
for frame in (df_oficina_linear, df_local_linear):
    frame['neighborhood'] = (
        frame['neighborhood']
        .str.replace(' ', '_', regex=False)
        .str.replace('/', 'o', regex=False)
        .str.lower()
    )

In [17]:
# Distinct neighborhood values after normalisation
df_oficina_linear.neighborhood.unique()

array(['flores', 'palermo', 'san_nicolás', 'puerto_madero',
       'centro_o_microcentro', 'congreso', 'retiro', 'barracas',
       'chacarita', 'monserrat', 'san_telmo', 'colegiales', 'tribunales',
       'parque_patricios', 'catalinas', 'almagro', 'recoleta', 'nuñez',
       'abasto', 'parque_chacabuco', 'belgrano', 'floresta', 'liniers',
       'barrio_norte', 'villa_urquiza', 'once', 'caballito', 'balvanera',
       'san_cristobal', 'otros', 'saavedra', 'villa_crespo',
       'villa_devoto', 'boedo', 'paternal', 'villa_del_parque',
       'mataderos', 'boca', 'constitución'], dtype=object)

In [18]:
# LinearRegression version: one-hot encode 'neighborhood'. drop_first=True omits
# the first category's dummy column.
# Note: the frames are overwritten, so this cell cannot be re-run on its own.
df_oficina_linear = pd.get_dummies(df_oficina_linear, columns=['neighborhood'], drop_first=True)
df_local_linear = pd.get_dummies(df_local_linear, columns=['neighborhood'], drop_first=True)

# Features exclude both price columns; the target here is the total price in USD.
X_oficina_linear = df_oficina_linear.drop(['price_usd', 'price_per_m2'], axis=1)
X_local_linear = df_local_linear.drop(['price_usd', 'price_per_m2'], axis=1)

y_oficina_linear = df_oficina_linear['price_usd']
y_local_linear = df_local_linear['price_usd']

In [19]:
# Element-wise comparison of the two one-hot schemas (same printed output as before).
columns_match = df_local_linear.columns == df_oficina_linear.columns
print(columns_match)

# Enforce the check instead of relying on someone reading the printed array:
# fail loudly if any column differs between the two datasets.
assert columns_match.all(), "One-hot encoded columns differ between Local and Oficina datasets"

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True]


In [20]:
# Create the output folder if needed so the cell also works on a fresh checkout
# (open(..., 'wb') fails when the parent directory does not exist).
os.makedirs('models', exist_ok=True)

# Persist the one-hot encoded column names as a plain list.
# NOTE(review): df_oficina_linear still contains 'price_usd' and 'price_per_m2',
# so the saved list includes the target columns, not only the features — confirm
# that the consumer of columns_oficina_OHE.pkl accounts for this.
with open('models/columns_oficina_OHE.pkl', 'wb') as f:
    pickle.dump(df_oficina_linear.columns.tolist(), f)

## 3. Separación de Train/Test

In [21]:
# Feature matrices for the models trained below: every column except the two
# price columns.
X_oficina_tree_reg = df_oficina_tree.drop(['price_usd', 'price_per_m2'], axis=1)
X_local_tree_reg = df_local_tree.drop(['price_usd', 'price_per_m2'], axis=1)
X_oficina_linear_reg = df_oficina_linear.drop(['price_usd', 'price_per_m2'], axis=1)
X_local_linear_reg = df_local_linear.drop(['price_usd', 'price_per_m2'], axis=1)

# Targets: the `_reg` datasets predict the price per square metre, not the total price.
y_oficina_tree_reg = df_oficina_tree['price_per_m2']
y_local_tree_reg = df_local_tree['price_per_m2']
y_oficina_linear_reg = df_oficina_linear['price_per_m2']
y_local_linear_reg = df_local_linear['price_per_m2']

In [22]:
# Record the (rows, columns) shape of every feature matrix in the MLflow run.
mlflow.log_param(key="Tamaño dataset_4_oficina_tree", value=X_oficina_tree_reg.shape)
mlflow.log_param(key="Tamaño dataset_4_local_tree", value=X_local_tree_reg.shape)
mlflow.log_param(key="Tamaño dataset_4_oficina_linear", value=X_oficina_linear_reg.shape)
mlflow.log_param(key="Tamaño dataset_4_local_linear", value=X_local_linear_reg.shape)

(5374, 43)

Para medir el rendimiento según el porcentaje asignado a test, lo guardo como parámetro de mlFlow. <br>
Inicialmente separo un 70% de datos para el entrenamiento y un 30% para test

In [23]:
# Hold-out fraction and seed shared by every split and model below; both are
# logged so runs with different settings can be compared in MLflow.
TEST_SIZE, RANDOM_STATE = 0.3, 42
mlflow.log_param(key="Tamaño de Test", value=TEST_SIZE)
mlflow.log_param(key="Random state", value=RANDOM_STATE)

42

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Train/test split for the tree-based (RandomForest) datasets
(
    X_oficina_tree_train,
    X_oficina_tree_test,
    y_oficina_tree_train,
    y_oficina_tree_test,
) = train_test_split(
    X_oficina_tree_reg,
    y_oficina_tree_reg,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

(
    X_local_tree_train,
    X_local_tree_test,
    y_local_tree_train,
    y_local_tree_test,
) = train_test_split(
    X_local_tree_reg,
    y_local_tree_reg,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [26]:

# Train/test split for the LinearRegression (one-hot encoded) datasets
(
    X_oficina_linear_train,
    X_oficina_linear_test,
    y_oficina_linear_train,
    y_oficina_linear_test,
) = train_test_split(
    X_oficina_linear_reg,
    y_oficina_linear_reg,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

(
    X_local_linear_train,
    X_local_linear_test,
    y_local_linear_train,
    y_local_linear_test,
) = train_test_split(
    X_local_linear_reg,
    y_local_linear_reg,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

Por la alta varianza de los precios, transformaremos el target con log(x + 1) (`np.log1p`)

In [27]:

# Apply log1p (log(x + 1)) to every target split; np.expm1 undoes it later.

# RandomForest targets
y_oficina_tree_train_log, y_oficina_tree_test_log = map(
    np.log1p, (y_oficina_tree_train, y_oficina_tree_test)
)
y_local_tree_train_log, y_local_tree_test_log = map(
    np.log1p, (y_local_tree_train, y_local_tree_test)
)

# LinearRegression targets
y_oficina_linear_train_log, y_oficina_linear_test_log = map(
    np.log1p, (y_oficina_linear_train, y_oficina_linear_test)
)
y_local_linear_train_log, y_local_linear_test_log = map(
    np.log1p, (y_local_linear_train, y_local_linear_test)
)

## 4. Creación de modelo de Regresión lineal

### 4.1 Modelo de Regresión Lineal de sklearn

4.1.1 Para Oficinas

In [28]:
from sklearn.linear_model import LinearRegression

# Build and fit the model on the log-transformed training target
# (fit returns the estimator itself, so both steps fit in one expression).
model_linreg_oficina = LinearRegression().fit(X_oficina_linear_train, y_oficina_linear_train_log)

# Predictions come out in log scale ...
y_linear_pred_log_oficina = model_linreg_oficina.predict(X_oficina_linear_test)

# ... so both predictions and ground truth are mapped back to the original scale.
y_linear_true_oficina = np.expm1(y_oficina_linear_test_log)
y_linear_pred_oficina = np.expm1(y_linear_pred_log_oficina)

In [29]:
import sklearn.metrics as metrics

# Error metrics in the original (non-log) scale of the target.
y_true, y_pred = y_linear_true_oficina, y_linear_pred_oficina
mse = metrics.mean_squared_error(y_true, y_pred)
r2 = metrics.r2_score(y_true, y_pred)
rmse = metrics.root_mean_squared_error(y_true, y_pred)
mae = metrics.mean_absolute_error(y_true, y_pred)

print("Resultados Linear Regression - Oficinas")
print("r2 ", round(r2, 4))
print("mse: ", round(mse, 4))
print("rmse: ", round(rmse, 4))
print("mae: ", round(mae, 4))

# Log each metric to the MLflow run under its existing key.
linreg_oficina_metrics = [
    ("LinearRegressor - Oficinas - Mean Squared Error", mse),
    ("LinearRegressor - Oficinas - R2", r2),
    ("LinearRegressor - Oficinas - Root Mean Squared Error", rmse),
    ("LinearRegressor - Oficinas - Mean Absolute Error", mae),
]
for metric_name, metric_value in linreg_oficina_metrics:
    mlflow.log_metric(metric_name, metric_value)

# --- Coefficients, largest first. They apply to the log-transformed target. ---
coefs = pd.Series(model_linreg_oficina.coef_, index=X_oficina_linear_train.columns)
coefs = coefs.sort_values(ascending=False)
print("\nTop 10 variables con coeficiente positivo:")
print(coefs.head(10))
print("\nTop 10 variables con coeficiente negativo:")
print(coefs.tail(10))

Resultados Linear Regression - Oficinas
r2  -0.2348
mse:  802625182.0667
rmse:  28330.6403
mae:  16529.5099

Top 10 variables con coeficiente positivo:
neighborhood_mataderos           0.901981
neighborhood_paternal            0.725646
neighborhood_san_cristobal       0.688081
neighborhood_villa_del_parque    0.532797
bathrooms_missing                0.515693
neighborhood_villa_crespo        0.510353
neighborhood_parque_chacabuco    0.374586
neighborhood_boedo               0.369220
neighborhood_once                0.288424
neighborhood_balvanera           0.213746
dtype: float64

Top 10 variables con coeficiente negativo:
neighborhood_nuñez              -1.657847
neighborhood_parque_patricios   -1.723479
neighborhood_san_telmo          -1.909530
neighborhood_retiro             -2.206220
neighborhood_saavedra           -2.393642
neighborhood_barracas           -2.687425
neighborhood_boca               -3.786589
neighborhood_constitución       -4.036175
neighborhood_catalinas          -

### 4.2 Creación de modelo con RF + hypertuning (usando grid search)

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Base estimator for the grid search; seeded with the shared RANDOM_STATE.
model_rf = RandomForestRegressor(random_state=RANDOM_STATE)

In [31]:
# Hyper-parameter grid: 3 * 3 * 2 * 2 * 2 = 72 combinations, each evaluated with
# 5-fold cross-validation.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt', 'log2'],
)

# Candidates are scored by negative RMSE on the log-transformed target.
grid_search = GridSearchCV(
    model_rf,
    param_grid,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    cv=5,
)

grid_search.fit(X_oficina_tree_train, y_oficina_tree_train_log)




0,1,2
,estimator,RandomForestR...ndom_state=42)
,param_grid,"{'max_depth': [10, 20, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [1, 2], 'min_samples_split': [2, 5], ...}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,500
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [32]:
# Predictions from the refitted best estimator (still in log scale)
y_tree_pred_log = grid_search.predict(X_oficina_tree_test)

# Undo the log1p transform. The target of this model is 'price_per_m2', so the
# values (and the error metrics below) are in USD per square metre, not total USD.
y_tree_pred = np.expm1(y_tree_pred_log)
y_tree_true = np.expm1(y_oficina_tree_test_log)  # same round trip as the predictions, for consistency

# --- Metrics in the original scale of the target ---
mse = metrics.mean_squared_error(y_tree_true, y_tree_pred)
rmse = np.sqrt(mse)
mae = metrics.mean_absolute_error(y_tree_true, y_tree_pred)
r2 = metrics.r2_score(y_tree_true, y_tree_pred)

mlflow.log_metric("RandomForest - Oficinas - Mean Squared Error", mse)
mlflow.log_metric("RandomForest - Oficinas - R2", r2)
mlflow.log_metric("RandomForest - Oficinas - Root Mean Squared Error", rmse)
mlflow.log_metric("RandomForest - Oficinas - Mean Absolute Error", mae)

# Units fixed in the labels: the previous "(USD)" suggested total prices.
print("Mejores parámetros para Oficina:", grid_search.best_params_)
print(f"R² real: {r2:.4f}")
print(f"RMSE (USD/m²): {rmse:,.0f}")
print(f"MAE (USD/m²): {mae:,.0f}")

Mejores parámetros para Oficina: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 500}
R² real: 0.0886
RMSE (USD): 24,340
MAE (USD): 12,444
