In [1]:
import numpy  as np  
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

## 1. Preprocesamiento de datos antes de división

#### Elimino características como las URL y las descripciones porque no voy a hacer procesamiento de texto libre. También elimino la información de metadatos y los IDs.

In [2]:
# Load the raw Airbnb listings extract; the file is semicolon-delimited.
source_csv = "airbnb-listings-extract.csv"
raw_data = pd.read_csv(source_csv, sep=';')
print(raw_data.shape)

# Show the first five rows transposed so every column appears as a row.
raw_data.head(n=5).T

(14780, 89)


Unnamed: 0,0,1,2,3,4
ID,11210388,17471131,17584891,5398030,18104606
Listing Url,https://www.airbnb.com/rooms/11210388,https://www.airbnb.com/rooms/17471131,https://www.airbnb.com/rooms/17584891,https://www.airbnb.com/rooms/5398030,https://www.airbnb.com/rooms/18104606
Scrape ID,20170306202425,20170407214050,20170407214050,20170407214050,20170407214050
Last Scraped,2017-03-07,2017-04-08,2017-04-08,2017-04-08,2017-04-08
Name,The Loft-Full Bath-Deck w/View,"Claris I, Friendly Rentals","Style Terrace Red, Friendly Rentals",Picasso Suite 1.4 Paseo de Gracia,Smart City Centre Apartment II
...,...,...,...,...,...
Cancellation Policy,moderate,super_strict_30,super_strict_30,strict,flexible
Calculated host listings count,1.0,106.0,106.0,24.0,92.0
Reviews per Month,3.5,0.86,,1.09,
Geolocation,"30.3373609355,-97.8632766782","41.3896829422,2.17262543017","41.3930345489,2.16217327868","41.3969668101,2.1674178103","41.3886851936,2.15514963616"


In [3]:
# Work on a copy so the frame loaded from disk stays untouched.
data = raw_data.copy()

# Columns removed before modelling: identifiers, URLs, scrape metadata and
# free-text fields (no free-text processing is done in this notebook),
# plus the other descriptive columns listed below.
columns_to_drop = [
    # identifiers and URLs
    'ID', 'Listing Url', 'Scrape ID',
    'Thumbnail Url', 'Medium Url', 'Picture Url', 'XL Picture Url',
    'Host URL', 'Host Thumbnail Url', 'Host Picture Url',
    # remaining columns, kept in their original order
    'Last Scraped', 'Name', 'Summary', 'Space', 'Description',
    'Neighborhood Overview', 'License', 'Jurisdiction Names',
    'Cancellation Policy', 'Notes', 'Transit', 'Access', 'Interaction',
    'House Rules', 'Host ID', 'Host Name', 'Host Location', 'Host About',
    'Host Response Time', 'Geolocation', 'Smart Location', 'Market',
    'Street', 'State', 'Country', 'Host Verifications', 'Neighbourhood',
    'Neighbourhood Cleansed', 'Neighbourhood Group Cleansed',
    'Country Code', 'Amenities', 'Calendar Updated',
    'Calendar last Scraped', 'Features', 'Zipcode', 'Host Neighbourhood',
]

# drop() returns a new frame; `data` itself is not modified.
selected_data = data.drop(columns_to_drop, axis="columns")

# Report the shape before and after the column removal.
print(f"Dataset original: {raw_data.shape}")
print(f"Dataset después de limpieza: {selected_data.shape}")
selected_data.head(n=5).T

Dataset original: (14780, 89)
Dataset después de limpieza: (14780, 43)


Unnamed: 0,0,1,2,3,4
Experiences Offered,none,none,none,none,none
Host Since,2016-02-11,2016-11-08,2016-11-08,2010-05-25,2016-01-12
Host Response Rate,100.0,100.0,100.0,100.0,99.0
Host Acceptance Rate,,,,,
Host Listings Count,1.0,162.0,162.0,27.0,207.0
Host Total Listings Count,1.0,162.0,162.0,27.0,207.0
City,Austin,Barcelona,Barcelona,Barcelona,Barcelona
Latitude,30.337361,41.389683,41.393035,41.396967,41.388685
Longitude,-97.863277,2.172625,2.162173,2.167418,2.15515
Property Type,Loft,Apartment,Apartment,Apartment,Apartment


## 2. División de datos en Train / Test


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test partition; the fixed random_state
# makes the shuffled split repeatable.
train, test = train_test_split(
    selected_data,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

print(f'Dimensiones del dataset de training: {train.shape}')
print(f'Dimensiones del dataset de test: {test.shape}')

# Persist both partitions using one shared CSV dialect.
csv_dialect = dict(sep=';', decimal='.')
train.to_csv('airbnb-listings-extract-train.csv', index=False, **csv_dialect)
test.to_csv('airbnb-listings-extract-test.csv', index=False, **csv_dialect)

# From here on only the training partition is used; it is re-read from disk.
house_data = pd.read_csv('airbnb-listings-extract-train.csv', **csv_dialect)
house_data.head(n=5).T

Dimensiones del dataset de training: (11824, 43)
Dimensiones del dataset de test: (2956, 43)


Unnamed: 0,0,1,2,3,4
Experiences Offered,none,none,none,none,none
Host Since,2015-04-13,2016-07-22,2016-07-17,2014-05-08,2012-02-06
Host Response Rate,100.0,100.0,100.0,100.0,100.0
Host Acceptance Rate,,,,,
Host Listings Count,2.0,1.0,16.0,114.0,2.0
Host Total Listings Count,2.0,1.0,16.0,114.0,2.0
City,Madrid,Madrid,Madrid,Madrid,Madrid
Latitude,40.407732,40.415802,40.389048,40.412814,40.438631
Longitude,-3.684819,-3.70534,-3.740374,-3.703052,-3.713716
Property Type,Apartment,Apartment,Apartment,Apartment,Apartment


## 3. Análisis exploratorio Train

In [27]:
!pip install ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.17.0-py2.py3-none-any.whl.metadata (22 kB)
Collecting scipy<1.16,>=1.4.1 (from ydata-profiling)
  Using cached scipy-1.15.3-cp312-cp312-macosx_14_0_x86_64.whl.metadata (61 kB)
Collecting matplotlib<=3.10,>=3.5 (from ydata-profiling)
  Using cached matplotlib-3.10.0-cp312-cp312-macosx_10_13_x86_64.whl.metadata (11 kB)
Collecting pydantic>=2 (from ydata-profiling)
  Downloading pydantic-2.11.10-py3-none-any.whl.metadata (68 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Using cached visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting numpy<2.2,>=1.16.0 (from ydata-profiling)
  Using cached numpy-2.1.3-cp312-cp312-macosx_14_0_x86_64.whl.metadata (62 kB)
Collecting minify-html>=0.15.0 (from ydata-profiling)
  Downloading minify_html-0.16.4-cp312-cp312-macosx_10_12_x86_64.whl.metadata (18 kB)
Collecting filetype>=1.0.0 (from ydata-profiling)
  Downloading filetype-1.2.0-p

In [5]:
# Imported here rather than at the top because the package is installed by
# the preceding cell.
from ydata_profiling import ProfileReport

# Create the profiling report object for the training data.
profile = ProfileReport(
    house_data,
    title="Profiling Report",
)

In [None]:
# Write the profiling report to an HTML file next to the notebook.
report_html_path = "profiling_report_airbnb.html"
profile.to_file(report_html_path)

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Reporte de profiling generado exitosamente en 'profiling_report_airbnb.html'
Puedes abrir el archivo HTML en tu navegador para ver el reporte completo e interactivo


In [8]:
# Headline figures for the training frame.
n_rows, n_cols = house_data.shape
print("=== RESUMEN DEL DATASET DE AIRBNB ===")
print(f"📊 Forma del dataset: {house_data.shape}")
print(f"📈 Número de variables: {n_cols}")
print(f"📋 Número de observaciones: {n_rows}")
print()

# Per-column overview: dtype, null count and null percentage.
print("=== TIPOS DE DATOS ===")
null_counts = house_data.isnull().sum()
null_percentages = (null_counts / len(house_data) * 100).round(2)
info_df = pd.DataFrame({
    'Columna': house_data.columns,
    'Tipo': house_data.dtypes,
    'Valores_Nulos': null_counts,
    'Porcentaje_Nulos': null_percentages,
})
print(info_df.to_string(index=False))
print()

# Split the columns by dtype: numeric vs. everything else. The second group
# is labelled "categorical" in the output but holds every non-numeric column.
numeric_cols = list(house_data.select_dtypes(include=[np.number]).columns)
categorical_cols = list(house_data.select_dtypes(exclude=[np.number]).columns)

print(f"🔢 Variables numéricas ({len(numeric_cols)}): {numeric_cols}")
print(f"📝 Variables categóricas ({len(categorical_cols)}): {categorical_cols}")

=== RESUMEN DEL DATASET DE AIRBNB ===
📊 Forma del dataset: (11824, 43)
📈 Número de variables: 43
📋 Número de observaciones: 11824

=== TIPOS DE DATOS ===
                       Columna    Tipo  Valores_Nulos  Porcentaje_Nulos
           Experiences Offered  object              0              0.00
                    Host Since  object              3              0.03
            Host Response Rate float64           1507             12.75
          Host Acceptance Rate  object          11794             99.75
           Host Listings Count float64              3              0.03
     Host Total Listings Count float64              3              0.03
                          City  object              4              0.03
                      Latitude float64              0              0.00
                     Longitude float64              0              0.00
                 Property Type  object              0              0.00
                     Room Type  object              0 

In [25]:
# Explore the distinct values of selected categorical features.
categorical_columns = ['Property Type', 'Room Type', 'Bed Type', 'City']

for col in categorical_columns:
    # Columns absent from the frame are skipped (best-effort exploration).
    if col not in house_data.columns:
        continue

    column_values = house_data[col]
    # nunique() ignores NaN by default while unique() keeps it, so a column
    # with missing values would report a count that disagrees with the list.
    # Both figures are derived from the same NaN-free array to keep them
    # consistent.
    distinct_values = column_values.dropna().unique().tolist()

    print(f"\n=== {col.upper()} ===")
    print(f"Valores únicos: {len(distinct_values)}")
    print(f"Valores: {distinct_values}")
    print("Conteos:")
    print(column_values.value_counts().head(10))  # ten most frequent values
    print("-" * 50)



=== PROPERTY TYPE ===
Valores únicos: 21
Valores: ['Apartment', 'Loft', 'House', 'Bed & Breakfast', 'Dorm', 'Chalet', 'Condominium', 'Guesthouse', 'Hostel', 'Other', 'Villa', 'Boutique hotel', 'Camper/RV', 'Casa particular', 'Townhouse', 'Serviced apartment', 'Guest suite', 'Boat', 'Tent', 'Earth House', 'Bungalow']
Conteos:
Property Type
Apartment          9598
House              1085
Condominium         284
Bed & Breakfast     274
Loft                248
Other               171
Dorm                 37
Guesthouse           32
Chalet               22
Villa                15
Name: count, dtype: int64
--------------------------------------------------

=== ROOM TYPE ===
Valores únicos: 3
Valores: ['Entire home/apt', 'Shared room', 'Private room']
Conteos:
Room Type
Entire home/apt    7191
Private room       4471
Shared room         162
Name: count, dtype: int64
--------------------------------------------------

=== BED TYPE ===
Valores únicos: 5
Valores: ['Real Bed', 'Pull-out Sofa', '