In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder


In [3]:
df = pd.read_csv('../datos_1/datos_limpios.csv', index_col=0)

In [5]:
# Crear un conjunto de datos ficticio

data = df
# Separar características y target
X = data.drop(columns=['price'])
y = data['price']


In [11]:
data.head()

Unnamed: 0,price,propertyType,size,exterior,rooms,bathrooms,distance,status,floor,hasLift
0,750.0,flat,60.0,True,1,1,7037,good,3,True
1,750.0,flat,70.0,True,2,1,16145,good,bj,False
2,400.0,penthouse,67.0,True,2,2,55041,good,2,False
3,590.0,flat,70.0,True,3,2,56596,good,Desconocido,False
4,684.0,studio,45.0,True,0,1,10656,Desconocido,Desconocido,Desconocido


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 353 entries, 0 to 352
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         353 non-null    float64
 1   propertyType  353 non-null    object 
 2   size          353 non-null    float64
 3   exterior      353 non-null    bool   
 4   rooms         353 non-null    int64  
 5   bathrooms     353 non-null    int64  
 6   distance      353 non-null    int64  
 7   status        353 non-null    object 
 8   floor         353 non-null    object 
 9   hasLift       353 non-null    object 
dtypes: bool(1), float64(2), int64(3), object(4)
memory usage: 27.9+ KB


# Preprocesado

In [12]:
# Separamos los datos segun el tipo de variable
categorical_columns = X.select_dtypes(include='object').columns
numerical_columns = X.select_dtypes(include=np.number).columns

# Hacemos la gestion de las variables categoricas
target_encoder = TargetEncoder(cols=categorical_columns)
X_encoded = target_encoder.fit_transform(X, y)

# Escalamos las variables
scaler = StandardScaler()
X_encoded[numerical_columns] = scaler.fit_transform(X_encoded[numerical_columns])

# Guardar los transformadores (TargetEncoder y StandardScaler)
with open('target_encoder.pkl', 'wb') as f:
    pickle.dump(target_encoder, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


In [15]:
target_encoder

In [16]:
scaler

# Modelo

In [13]:

# Dividimos los datos en train y test
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.7, random_state=42)

# Entrenamos los datos.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# predecimos
y_pred = model.predict(X_test)

# calculamos el rmse
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Precio real vs Predicción:")
display(pd.DataFrame({'Real': y_test.values, 'Predicción': y_pred}).head(2))
print(f"\nRMSE: {rmse:.2f}")

Precio real vs Predicción:


Unnamed: 0,Real,Predicción
0,720.0,670.26
1,699.0,709.55



RMSE: 50.68


In [14]:
# guardo el modelo
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(model, f)