In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from category_encoders import TargetEncoder


In [5]:
df = pd.read_csv('../Datos/datos_1/datos_limpios.csv', index_col=0)

In [6]:
data = df

In [7]:
data['bathrooms'] = data['bathrooms'].astype('O')
data['rooms'] = data['rooms'].astype('O')
data['exterior'] = data['exterior'].astype('O')

In [8]:
# Crear un conjunto de datos ficticio

# Separar características y target
X = data.drop(columns=['price'])
y = data['price']


In [9]:
data.head()

Unnamed: 0,propertyCode,numPhotos,price,propertyType,operation,size,exterior,rooms,bathrooms,address,...,superTopHighlight,topNewDevelopment,externalReference,floor,district,neighborhood,hasLift,parkingSpace,labels,newDevelopmentFinished
0,98903350,33,550.0,chalet,rent,371.0,False,6,3,Numancia de la Sagra,...,False,False,,,,,,,,
1,91615378,12,750.0,flat,rent,60.0,True,1,1,"Calle de Alejandro Villegas, 23",...,False,False,AV23,3,Hortaleza,Canillas,True,,,
2,88363329,21,750.0,flat,rent,70.0,True,2,1,Gregorio izquierdo,...,False,False,,bj,Centro Urbano,,False,,,
3,98621753,28,400.0,penthouse,rent,67.0,True,2,2,Calle Madrid,...,False,False,90004,2,,,False,,,
4,98222714,22,450.0,flat,rent,89.0,False,2,1,Calle Arroyo,...,False,False,RP1882022101719,2,,,True,"{'hasParkingSpace': True, 'isParkingSpaceInclu...",,


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 450 entries, 0 to 449
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   propertyCode            450 non-null    int64  
 1   numPhotos               450 non-null    int64  
 2   price                   450 non-null    float64
 3   propertyType            450 non-null    object 
 4   operation               450 non-null    object 
 5   size                    450 non-null    float64
 6   exterior                450 non-null    object 
 7   rooms                   450 non-null    object 
 8   bathrooms               450 non-null    object 
 9   address                 450 non-null    object 
 10  province                450 non-null    object 
 11  municipality            450 non-null    object 
 12  country                 450 non-null    object 
 13  latitude                450 non-null    object 
 14  longitude               450 non-null    object 

# Preprocesado

In [None]:
# Separamos los datos segun el tipo de variable
categorical_columns = X.select_dtypes(include='object').columns
numerical_columns = X.select_dtypes(include=np.number).columns

# Hacemos la gestion de las variables categoricas
target_encoder = TargetEncoder(cols=categorical_columns)
X_encoded = target_encoder.fit_transform(X, y)

# Escalamos las variables
scaler = StandardScaler()
X_encoded[numerical_columns] = scaler.fit_transform(X_encoded[numerical_columns])

# Guardar los transformadores (TargetEncoder y StandardScaler)
with open('target_encoder.pkl', 'wb') as f:
    pickle.dump(target_encoder, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
    


In [12]:
categorical_columns

Index(['propertyType', 'operation', 'exterior', 'rooms', 'bathrooms',
       'address', 'province', 'municipality', 'country', 'latitude',
       'longitude', 'description', 'hasVideo', 'status', 'newDevelopment',
       'detailedType', 'suggestedTexts', 'hasPlan', 'has3DTour', 'has360',
       'hasStaging', 'superTopHighlight', 'topNewDevelopment',
       'externalReference', 'floor', 'district', 'neighborhood', 'hasLift',
       'parkingSpace', 'labels', 'newDevelopmentFinished'],
      dtype='object')

In [13]:
numerical_columns

Index(['propertyCode', 'numPhotos', 'size', 'distance', 'priceByArea'], dtype='object')

In [63]:
target_encoder

In [64]:
scaler

# Modelo

In [65]:

# Dividimos los datos en train y test
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.7, random_state=42)

# Entrenamos los datos.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# predecimos
y_pred = model.predict(X_test)

# calculamos el rmse
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Precio real vs Predicción:")
display(pd.DataFrame({'Real': y_test.values, 'Predicción': y_pred}).head(2))
print(f"\nRMSE: {rmse:.2f}")

Precio real vs Predicción:


Unnamed: 0,Real,Predicción
0,720.0,669.93
1,699.0,707.63



RMSE: 50.98


In [66]:
# guardo el modelo
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(model, f)