In [1]:
import cv2
import numpy as np
import pandas
import imageio.v3 as io

from tqdm import tqdm
from typing import Optional, Union

In [4]:
def download_images(paths: list,
                    canvas: tuple = (224, 224),
                    nb_channels: int = 3,
                    max_imgs: Optional[int] = None
                    ) -> tuple:
  """ Download a list of images from url adresses, converting them to a specific
  canvas size.

  Args:
    paths: Paths or url adresses from which to load images.
    canvas: Desired image width and height.
    nb_channels: Channels in images (1 for B/W, 3 for RGB).
    max_imgs: Upper threshold in the number of images to download.

  Return:
    a tuple of:
      - image values
      - indices within the paths that were successfull.

  """
  n_images = len(paths) if not max_imgs else max_imgs
  images = np.zeros((n_images, canvas[0], canvas[1], nb_channels),
                       dtype=np.uint8)
  downloaded_idxs = []

  for i_img, url in enumerate(tqdm(paths, total=n_images)):
    if i_img >= n_images:
      break
    try:
        img = io.imread(url)
        img = cv2.resize(img, (canvas[0], canvas[1]))
        downloaded_idxs.append(i_img)
        images[i_img] = img
    except (IOError, ValueError) as e:  # Unavailable url / conversion error
        pass
  return images[downloaded_idxs], downloaded_idxs

# OBTENIENDO LOS DATOS

In [5]:
# descargamos los datos
data = pandas.read_csv("dataset.csv", sep = ';')

# eliminamos los valores nulos de price
data = data.dropna(subset=['price'])

# eliminamos posibles filas duplicadas
data = data.drop_duplicates()

# descargamos las imagenes para obtener saber que datos vamos a usar
_ , idx = download_images(data['picture_url'])

# almacenamos los datos los idxs de los datos tabulares
data_filtered = data.iloc[idx]

# seleccionamos las columnas con las que vayamos a trabajar
data_filtered = data_filtered.filter(['property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'minimum_nights', 'maximum_nights', 'number_of_reviews','latitude', 'longitude', 'neighbourhood_cleansed', 'bathrooms_text', 'price'])

# eliminamos el simbolo del dolar y lo convertirmos a numerico
data_filtered['price'] = data_filtered['price'].str.lstrip('$')
data_filtered['price'] = data_filtered['price'].str.replace(',', '').astype(float)

100%|██████████| 3600/3600 [16:13<00:00,  3.70it/s]


In [6]:
# numpy.save('images.npy', images)
data_filtered.to_csv('filtered_data.csv', sep=';', index=False)

# Modelo de regresión

## Dividimos los datos

In [53]:
from sklearn.model_selection import train_test_split

# dividimos los datos
train, test = train_test_split(data_filtered, test_size=0.2, shuffle=True, random_state=0)

print(f'Dimensiones del dataset de training: {train.shape}')
print(f'Dimensiones del dataset de test: {test.shape}')

# guardamos los datos en difirentes archivos
train.to_csv('./airbnb-listings-extract_train.csv', sep=';', decimal='.', index=False)
test.to_csv('./airbnb-listings-extract_test.csv', sep=';', decimal='.', index=False)


# cargamos los ficheros
airbnb_data_train = pandas.read_csv("./airbnb-listings-extract_train.csv",sep=';', decimal='.')
airbnb_data_test = pandas.read_csv("./airbnb-listings-extract_test.csv",sep=';', decimal='.')

Dimensiones del dataset de training: (2503, 14)
Dimensiones del dataset de test: (626, 14)


## Imputamos los datos


In [8]:

# las caracteristicas con valores nulos son bathrooms, bedrooms, beds. Los imputamos
print(f"Los valores nulos de bathrooms son {airbnb_data_train['bathrooms'].isnull().sum()}")
print(f"Los valores nulos de beds son {airbnb_data_train['beds'].isnull().sum()}")
print(f"Los valores nulos de bedrooms son {airbnb_data_train['bedrooms'].isnull().sum()}")
print(f"Los valores nulos de bathrooms_text son {airbnb_data_train['bathrooms_text'].isnull().sum()}")

# observamos que tanto bathrooms como bedrooms todos sus valores son nulos, en este caso lo eliminamos
airbnb_data_train = airbnb_data_train.drop(['bathrooms', 'bedrooms'], axis=1)

# al no tener dato de bathrooms observamos en los datos que existe otra columna que se llama bathrooms_text sin valores nulos y contiene con texto de los bathrooms.
# tienen muchas categorias asique en vez de codificarlo, obtamos por eliminar el texto quedandonos solo con el numero y convertirlo en float
airbnb_data_train['bathrooms_text'] = airbnb_data_train['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)

# imputamos los valores nulos
airbnb_data_train['beds'] = airbnb_data_train['beds'].fillna(airbnb_data_train['beds'].mean())

airbnb_data_train['bathrooms_text'] = airbnb_data_train['bathrooms_text'].fillna(airbnb_data_train['bathrooms_text'].mean())




Los valores nulos de bathrooms son 2503
Los valores nulos de beds son 37
Los valores nulos de bedrooms son 2503
Los valores nulos de bathrooms_text son 0


## Codificamos las variables categoricas

In [9]:
# primero empezamos con la caracteristica neighbourhood_cleansed, vamos a realizar un target enconding porque puede ser que aparezcan bastantes nuevas

airbnb_data_train['neighbourhood_cleansed_encoded'] = airbnb_data_train['neighbourhood_cleansed'].map(
    airbnb_data_train.groupby('neighbourhood_cleansed')['price'].mean()
)

airbnb_data_train['property_type_encoded'] = airbnb_data_train['property_type'].map(
    airbnb_data_train.groupby('property_type')['price'].mean()
)

airbnb_data_train['room_type_encoded'] = airbnb_data_train['room_type'].map(
    airbnb_data_train.groupby('room_type')['price'].mean()
)

airbnb_data_train = airbnb_data_train.drop(['neighbourhood_cleansed', 'property_type', 'room_type',"number_of_reviews"], axis=1)

In [10]:
airbnb_data_train.describe()

Unnamed: 0,accommodates,beds,minimum_nights,maximum_nights,latitude,longitude,bathrooms_text,price,neighbourhood_cleansed_encoded,property_type_encoded,room_type_encoded
count,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0,2503.0
mean,3.235318,1.88159,27.063524,513.10827,42.335487,-71.082803,1.242709,186.433879,186.433879,186.433879,186.433879
std,2.357904,1.567993,35.406754,408.864097,0.027902,0.032716,0.623066,349.431855,55.111546,114.794339,60.337988
min,1.0,1.0,1.0,1.0,42.2353,-71.173442,0.0,20.0,112.0,20.0,66.3
25%,2.0,1.0,1.0,180.0,42.317805,-71.09949,1.0,75.0,136.983607,95.863636,122.118701
50%,2.0,1.0,28.0,365.0,42.34291,-71.07607,1.0,120.0,183.594595,187.762603,218.110759
75%,4.0,2.0,29.0,1125.0,42.354705,-71.06161,1.5,200.0,220.836066,187.762603,218.110759
max,16.0,22.0,400.0,1125.0,42.391317,-70.996,6.0,10000.0,349.5,676.434783,615.7


## Dividimos los datos y escalamos

In [11]:
from sklearn.preprocessing import MinMaxScaler


# dividimos entra caracteristicas y etiquetas
X = airbnb_data_train.drop('price', axis=1)
y = airbnb_data_train['price']

# dividimos entre train,val,test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15,shuffle=True, random_state=1)

# escalamos las características
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Obtener el valor mínimo y máximo
precio_min = np.min(y_train)
precio_max = np.max(y_train)
precio_min_val = np.min(y_val)
precio_max_val = np.max(y_val)

# Normalizar los precios
precios_normalizados = (y_train - precio_min) / (precio_max - precio_min)
precios_normalizados_val = (y_val - precio_min_val) / (precio_max_val - precio_min_val)



In [12]:
print("Valores nulos en X_train:", np.isnan(X_train_scaled).sum())
print("Valores nulos en y_train:", np.isnan(y_train).sum())

# Verificar las dimensiones
print("Dimensiones de X_train_scaled:", X_train_scaled.shape)
print("Dimensiones de X_val_scaled:", X_val_scaled.shape)

print("Rango X_train_scaled:", X_train_scaled.min(), X_train_scaled.max())
print("Rango X_val_scaled:", X_val_scaled.min(), X_val_scaled.max())

Valores nulos en X_train: 0
Valores nulos en y_train: 0
Dimensiones de X_train_scaled: (2127, 10)
Dimensiones de X_val_scaled: (376, 10)
Rango X_train_scaled: 0.0 1.0
Rango X_val_scaled: -0.005148624746993846 1.0


## Creamos el modelo secuencial

In [29]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import numpy as np

# vamos a hacer una busqueda de hiperparametros

# creamos el espacio
space = {
    'batch_size': hp.choice('batch_size', [64, 128, 256]),
      "dropout": hp.uniform("dropout", 0.0, 0.5),
    'epochs': hp.choice('epochs', [10, 20, 30])
}

# nos definimos unos callbacks para que la red pare cuando no vea mejora
def	get_callbacks(pars):
  callbacks	= [EarlyStopping(monitor="val_loss", min_delta=0.0001, patience=2, verbose=0, mode="auto")]
  return callbacks

# definimos la funcion objetivo
def model_objective(pars):
  print ('Parametros: ', pars)
  # creamos la red y le vamos añadiendo las capas
  model = Sequential()
  model.add(Dense(64,activation='relu', input_shape=(X_train_scaled.shape[1],)))
  model.add(Dropout(pars["dropout"]))
  model.add(Dense(32,activation='relu'))
  model.add(Dropout(pars["dropout"]))
  model.add(Dense(16,activation='relu'))
  model.add(Dropout(pars["dropout"]))
  model.add(Dense(1))

  # creamos el optimizador
  optimizer = Adam(learning_rate=0.1)

  # compilamos el modelo utilizando el optimizador ADAM y funcion de perdida mse
  model.compile(optimizer=optimizer, loss='mse')

  # entrenamos el modelo y guardamos el historial
  historico = model.fit(X_train_scaled, precios_normalizados, epochs=pars['epochs'], batch_size=pars['batch_size'],validation_data=(X_val_scaled, precios_normalizados_val),verbose=0,callbacks=get_callbacks(pars))

  # extraer la pérdida minima entre todas las epocas
  val_loss = min(historico.history['val_loss'])
  return {'loss': val_loss, 'status': STATUS_OK}

In [30]:
# buscamos
trials = Trials()
best = fmin(fn=model_objective,
            space=space,
            algo=tpe.suggest,
            max_evals=3,
            trials=trials)
print(best)

Parametros: 
{'batch_size': 256, 'dropout': 0.10016355152499617, 'epochs': 20}
  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Parametros: 
{'batch_size': 256, 'dropout': 0.19875024985995604, 'epochs': 10}
 33%|███▎      | 1/3 [00:01<00:03,  1.87s/trial, best loss: 0.003926925361156464]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Parametros: 
{'batch_size': 256, 'dropout': 0.44538050557630277, 'epochs': 20}
 67%|██████▋   | 2/3 [00:03<00:01,  1.88s/trial, best loss: 0.003926925361156464]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



100%|██████████| 3/3 [00:05<00:00,  1.80s/trial, best loss: 0.003926925361156464]
{'batch_size': 2, 'dropout': 0.10016355152499617, 'epochs': 1}


In [38]:
best_mapped = {
    'batch_size': [64, 128, 256][best['batch_size']],  # Convertir índice en valor
    'dropout': best['dropout'],  # Este es continuo, no necesita mapeo
    'epochs': [10, 20, 30][best['epochs']]  # Convertir índice en valor
}

In [44]:
# evaluamos el modelo con los mejores parametros

model = Sequential()
model.add(Dense(64,activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(best_mapped["dropout"]))
model.add(Dense(32,activation='relu'))
model.add(Dropout(best_mapped["dropout"]))
model.add(Dense(16,activation='relu'))
model.add(Dropout(best_mapped["dropout"]))
model.add(Dense(1))

# creamos el optimizador
optimizer = Adam(learning_rate=0.1)

# compilamos el modelo utilizando el optimizador ADAM y funcion de perdida mse
model.compile(optimizer=optimizer, loss='mse')

# entrenamos el modelo y guardamos el historial
historico = model.fit(X_train_scaled, precios_normalizados, epochs=best_mapped['epochs'], batch_size=best_mapped['batch_size'],validation_data=(X_val_scaled, precios_normalizados_val),verbose=0)

val_loss = min(historico.history['val_loss'])

print(val_loss)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


0.004026900045573711


In [54]:
# una vez entrenado el modelo realizamos las predicciones

# observamos que tanto bathrooms como bedrooms todos sus valores son nulos, en este caso lo eliminamos
airbnb_data_test = airbnb_data_test.drop(['bathrooms', 'bedrooms'], axis=1)

# al no tener dato de bathrooms observamos en los datos que existe otra columna que se llama bathrooms_text sin valores nulos y contiene con texto de los bathrooms.
# tienen muchas categorias asique en vez de codificarlo, obtamos por eliminar el texto quedandonos solo con el numero y convertirlo en float
airbnb_data_test['bathrooms_text'] = airbnb_data_test['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)

# imputamos los valores nulos
airbnb_data_test['beds'] = airbnb_data_test['beds'].fillna(airbnb_data_test['beds'].mean())

airbnb_data_test['bathrooms_text'] = airbnb_data_test['bathrooms_text'].fillna(airbnb_data_test['bathrooms_text'].mean())


airbnb_data_test['neighbourhood_cleansed_encoded'] = airbnb_data_test['neighbourhood_cleansed'].map(
    airbnb_data_test.groupby('neighbourhood_cleansed')['price'].mean()
)

airbnb_data_test['property_type_encoded'] = airbnb_data_test['property_type'].map(
    airbnb_data_test.groupby('property_type')['price'].mean()
)

airbnb_data_test['room_type_encoded'] = airbnb_data_test['room_type'].map(
    airbnb_data_test.groupby('room_type')['price'].mean()
)

airbnb_data_test = airbnb_data_test.drop(['neighbourhood_cleansed', 'property_type', 'room_type',"number_of_reviews"], axis=1)

# dividimos entra caracteristicas y etiquetas
X_test = airbnb_data_test.drop('price', axis=1)
y_test = airbnb_data_test['price']

# escalamos las características
scaler.fit(X_test)

X_test_scaled = scaler.transform(X_test)




In [52]:
# Realizar predicciones
y_pred = model.predict(X_test_scaled)
y_test = airbnb_data_test['price']
# Evaluar las predicciones (por ejemplo, MAE)
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE en el conjunto de prueba: {mae}")

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
MAE en el conjunto de prueba: 173.9428669849512
