In [3]:
""" 
Importación de ficheros de AIRBNB. Preprocesado y cargado a la base de datos 

Obtenemos los datos desde https://insideairbnb.com/get-the-data/

Podríamos hacer scraping, obtener la fecha de la última actualización de Madrid y generar el enlace para descargarlo, pero no es necesario por que se actualiza cada mucho tiempo y no es necesario tener la última versión. Podemos cambiar la info procesable en la carpeta data manualmente.

"""
import os
import pandas as pd
import numpy as np
import datetime
from config_bd import *
import requests
import sys
import gzip
import shutil

DATA_DIR = os.path.join('data')
# DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
TEMP_DIR = os.path.join(DATA_DIR, 'TEMP')


""" Comprobamos que existe el directorio de datos """

if not os.path.exists(DATA_DIR):
    print(f"El directorio {DATA_DIR} no existe. No es posible continuar...")
    sys.exit()

""" Descomprimimos el archivo de calendario """

if not os.path.exists(TEMP_DIR):
    os.makedirs(TEMP_DIR)


gz_file_path = os.path.join(DATA_DIR, 'calendar.csv.gz')
output_file_path = os.path.join(TEMP_DIR, os.path.splitext(os.path.basename(gz_file_path))[0])

if os.path.exists(gz_file_path):
    try:
        with gzip.open(gz_file_path, 'rb') as f_in:
            with open(output_file_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        print(f"Archivo {gz_file_path} descomprimido como {output_file_path}")
    except Exception as e:
        print(f"Error al descomprimir el archivo {gz_file_path}: {e}")
        sys.exit()
else:
    print(f"El archivo {gz_file_path} no existe. No es posible continuar...")
    sys.exit()


Archivo data\calendar.csv.gz descomprimido como data\TEMP\calendar.csv


In [33]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
listing_id,9236806.0,6.438998e+17,5.35787e+17,21853.0,36760070.0,8.115293e+17,1.142566e+18,1.369179e+18
available,9236806.0,0.4376801,0.4961011,0.0,0.0,0.0,1.0,1.0
price,9236806.0,211.6178,639.6718,8.0,55.0,92.0,160.0,21000.0
minimum_nights,9236806.0,10.16833,30.81097,1.0,1.0,3.0,6.0,1125.0
maximum_nights,9236806.0,614.4267,447.4725,1.0,364.0,365.0,1125.0,11111.0


In [34]:
df = pd.read_csv(output_file_path, low_memory=False)
df['price'] = df['price'].replace({'\$': '', ',': ''}, regex=True).astype(float)
df['price'] = df['price'].fillna(0)
df['available'] = df['available'].replace({'t': 1, 'f': 0})
df['minimum_nights'] = df['minimum_nights'].fillna(df['minimum_nights'].median()).astype(int)
df['maximum_nights'] = df['maximum_nights'].fillna(df['maximum_nights'].median()).astype(int)

  df['available'] = df['available'].replace({'t': 1, 'f': 0})


In [None]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
listing_id,9236806.0,6.438998e+17,5.35787e+17,21853.0,36760070.0,8.115293e+17,1.142566e+18,1.369179e+18
available,9236806.0,0.4376801,0.4961011,0.0,0.0,0.0,1.0,1.0
price,9236806.0,211.6178,639.6718,8.0,55.0,92.0,160.0,21000.0
minimum_nights,9236806.0,10.16833,30.81097,1.0,1.0,3.0,6.0,1125.0
maximum_nights,9236806.0,614.4267,447.4725,1.0,364.0,365.0,1125.0,11111.0


In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9236806 entries, 0 to 9236805
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   listing_id      int64  
 1   date            object 
 2   available       int64  
 3   price           float64
 4   adjusted_price  object 
 5   minimum_nights  float64
 6   maximum_nights  float64
dtypes: float64(3), int64(2), object(2)
memory usage: 493.3+ MB


In [22]:
df.sample(10)

Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
3683834,598291878293746299,2025-10-23,t,156.0,,2.0,1125.0
8868318,1342619174581220181,2026-02-24,t,142.0,,30.0,75.0
4858914,862099228513582123,2025-12-07,f,69.0,,32.0,341.0
9059051,1355765170985479049,2026-01-10,f,100.0,,1.0,4.0
5870535,1005526626276873625,2025-06-23,t,153.0,,5.0,365.0
4263650,732304230728454849,2025-11-17,t,71.0,,30.0,180.0
9079944,1356367271797102260,2025-08-04,t,100.0,,1.0,365.0
4761311,841776009395434553,2025-06-30,t,75.0,,2.0,31.0
786354,16464496,2025-12-13,t,140.0,,29.0,1125.0
5720663,984201333970039034,2025-03-25,f,150.0,,4.0,10.0
