In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [73]:
data=pd.read_csv("listings.csv")

In [74]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13043 entries, 0 to 13042
Data columns (total 79 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            13043 non-null  int64  
 1   listing_url                                   13043 non-null  object 
 2   scrape_id                                     13043 non-null  int64  
 3   last_scraped                                  13043 non-null  object 
 4   source                                        13043 non-null  object 
 5   name                                          13043 non-null  object 
 6   description                                   12669 non-null  object 
 7   neighborhood_overview                         6929 non-null   object 
 8   picture_url                                   13043 non-null  object 
 9   host_id                                       13043 non-null 

In [75]:
data.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,31840,https://www.airbnb.com/rooms/31840,20250619031413,2025-06-19,city scrape,Residenza Martin Classic room,"Nice, private and quiet double room, classic s...",,https://a0.muscache.com/pictures/prohost-api/H...,380378,...,4.91,4.89,4.65,IT048017B4R5K5BW8E,t,24,14,10,0,0.85
1,32180,https://www.airbnb.com/rooms/32180,20250619031413,2025-06-19,city scrape,Charming Gem - Oltrarno City centre,.,,https://a0.muscache.com/pictures/bcb23395-7215...,13925330,...,4.7,4.81,4.84,IT048017C232S8WUEN,f,1,1,0,0,0.22
2,39115,https://www.airbnb.com/rooms/39115,20250619031413,2025-06-19,previous scrape,Central Double Shared Bathroom,"Double Room Shared Bathroom: comfortable, larg...","San Lorenzo Church, the market and Cappelle Me...",https://a0.muscache.com/pictures/17793998/5f09...,167739,...,4.74,4.91,4.54,IT048017B9LED47MUB,f,11,3,8,0,0.48
3,39165,https://www.airbnb.com/rooms/39165,20250619031413,2025-06-19,previous scrape,Florence Central Double Private Bathroom,"Double room private bathroom: comfortable, lar...","San Lorenzo Church, the market and Cappelle Me...",https://a0.muscache.com/pictures/7e94f597-ac04...,167739,...,4.64,4.95,4.41,IT048017B9LED47MUB,f,11,3,8,0,0.14
4,39822,https://www.airbnb.com/rooms/39822,20250619031413,2025-06-19,city scrape,Apartment Santa Monaca Oltrarno old town,Location details.<br />The bridge right next t...,,https://a0.muscache.com/pictures/51118538/b6b9...,154769,...,4.71,4.87,4.48,IT048017C2VVSRW6CW,f,3,3,0,0,0.19


1- Eliminamos las columnas con cero datos

In [76]:
columnas_vacias = data.columns[data.isnull().all()].tolist()
print(f"Columnas vacías encontradas: {columnas_vacias}")
data = data.drop(columns=columnas_vacias)
print(f"Eliminadas {len(columnas_vacias)} columnas vacías")

Columnas vacías encontradas: ['neighbourhood_group_cleansed', 'calendar_updated']
Eliminadas 2 columnas vacías


2- Convertimos los precios a numeros

In [77]:
def limpiar_precio(precio):
    if pd.isna(precio):
        return np.nan
    return float(str(precio).replace('$', '').replace(',', ''))
data['price'] = data['price'].apply(limpiar_precio)

print("Precios convertidos a números")
print(f"Precio más bajo: ${data['price'].min():.2f}")
print(f"Precio más alto: ${data['price'].max():.2f}")
print(f"Precio promedio: ${data['price'].mean():.2f}")

Precios convertidos a números
Precio más bajo: $9.00
Precio más alto: $18000.00
Precio promedio: $263.78


3- Convertimos los porcentajes a numeros

In [78]:
def limpiar_porcentaje(porcentaje):
    if pd.isna(porcentaje):
        return np.nan
    return float(str(porcentaje).replace('%', '')) / 100
data['host_response_rate'] = data['host_response_rate'].apply(limpiar_porcentaje)
data['host_acceptance_rate'] = data['host_acceptance_rate'].apply(limpiar_porcentaje)

print("Porcentajes convertidos a decimales (0 a 1)")
print(f"Tasa de respuesta promedio: {data['host_response_rate'].mean():.2%}")
print(f"Tasa de aceptación promedio: {data['host_acceptance_rate'].mean():.2%}")

Porcentajes convertidos a decimales (0 a 1)
Tasa de respuesta promedio: 97.98%
Tasa de aceptación promedio: 94.24%


4- Convertimos columnas si/no a booleanas

In [79]:
columnas_bool = ['host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
    'has_availability',
    'instant_bookable']
for col in columnas_bool:
    if col in data.columns:
        data[col] = data[col].map({'t': True, 'f': False})
        print(f"{col} convertida")

print(f"\nSuperhosts: {data['host_is_superhost'].sum()} de {data['host_is_superhost'].notna().sum()}")

host_is_superhost convertida
host_has_profile_pic convertida
host_identity_verified convertida
has_availability convertida
instant_bookable convertida

Superhosts: 5388 de 12102


5- Convertimos fechas a un solo formato

In [80]:
columnas_fecha = ['last_scraped',
    'host_since',
    'calendar_last_scraped',
    'first_review',
    'last_review']


for col in columnas_fecha:
    if col in data.columns:
        data[col] = pd.to_datetime(data[col], errors='coerce')
        print(f"{col} convertida a fecha")

print(f"\nHost más antiguo: {data['host_since'].min()}")
print(f"Host más nuevo: {data['host_since'].max()}")

last_scraped convertida a fecha
host_since convertida a fecha
calendar_last_scraped convertida a fecha
first_review convertida a fecha
last_review convertida a fecha

Host más antiguo: 2009-05-03 00:00:00
Host más nuevo: 2025-06-16 00:00:00


6- Juntar las columnas de datos que pueden estar juntas

In [81]:
review_columns = [col for col in data.columns if col.startswith('review_scores')]
data['review_scores'] = data[review_columns].mean(axis=1)

print("Nueva columna 'review_scores' creada con el promedio de las puntuaciones de revisión.")
print(data[['review_scores_rating', 'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin','review_scores_communication', 'review_scores_location','review_scores_value','review_scores']].head())

Nueva columna 'review_scores' creada con el promedio de las puntuaciones de revisión.
   review_scores_rating  review_scores_accuracy  review_scores_cleanliness  \
0                  4.67                    4.72                       4.85   
1                  4.77                    4.81                       4.84   
2                  4.52                    4.70                       4.60   
3                  4.41                    4.32                       4.55   
4                  4.48                    4.39                       4.45   

   review_scores_checkin  review_scores_communication  review_scores_location  \
0                   4.85                         4.91                    4.89   
1                   4.78                         4.70                    4.81   
2                   4.80                         4.74                    4.91   
3                   4.73                         4.64                    4.95   
4                   4.71                

7- Eliminamos columnas innecesarias


In [82]:
columnas_eliminar = [
    'listing_url',
    'scrape_id',
    'host_url',
    'host_thumbnail_url',
    'host_picture_url',
    'picture_url',
    'host_neighbourhood',
    'source',
    'host_verifications',
    'amenities',
    'host_about',
    'description',
    'neighborhood_overview',"minimum_minimum_nights",
    "maximum_minimum_nights",
    "minimum_maximum_nights", "maximum_maximum_nights",
    "minimum_nights_avg_ntm","maximum_nights_avg_ntm","bathrooms","host_location",
    "host_listings_count", 'review_scores_rating',
    'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin',
    'review_scores_communication', 'review_scores_location',"review_scores_value"]

columnas_eliminadas = [col for col in columnas_eliminar if col in data.columns]
data = data.drop(columns=columnas_eliminadas)

print(f"Eliminadas {len(columnas_eliminadas)} columnas innecesarias")
print(f"Columnas restantes: {data.shape[1]}")

Eliminadas 29 columnas innecesarias
Columnas restantes: 49


8- Eliminamos los duplicados

In [83]:
duplicados = data.duplicated().sum()

if duplicados > 0:
    data = data.drop_duplicates()
    print(f"Eliminados {duplicados} registros duplicados")
else:
    print("No hay duplicados")

No hay duplicados


9- Eliminamos propieades inactivas

In [84]:
data['days_since_last_review'] = (pd.Timestamp.now() - data['last_review']).dt.days

antes = len(data)

data = data[(data['days_since_last_review'].isna()) | (data['days_since_last_review'] <= 730)]

eliminados = antes - len(data)
print(f"Eliminadas {eliminados} propiedades inactivas (sin reviews en 2+ años)")

Eliminadas 994 propiedades inactivas (sin reviews en 2+ años)


10- Eliminamos las propiedades sin precio

In [85]:
antes = len(data)
data = data[data['price'].notna()]

eliminados = antes - len(data)
print(f"Eliminadas {eliminados} propiedades sin precio")

Eliminadas 843 propiedades sin precio


11- Eliminamos datos inconsistentes

In [86]:
antes = len(data)
data = data[(data['beds'].isna()) | (data['bedrooms'].isna()) | (data['beds'] >= data['bedrooms'])]

eliminados = antes - len(data)
print(f"Eliminadas {eliminados} propiedades inconsistentes (más habitaciones que camas)")

Eliminadas 205 propiedades inconsistentes (más habitaciones que camas)


12- Eliminamos precios anómalos

In [87]:
antes = len(data)


data = data[(data['price'].isna()) | ((data['price'] >= 20) & (data['price'] <= 10000))]

eliminados = antes - len(data)
print(f"Eliminadas {eliminados} propiedades con precios anómalos")
print(f"Nuevo rango de precios: ${data['price'].min():.2f} - ${data['price'].max():.2f}")

Eliminadas 18 propiedades con precios anómalos
Nuevo rango de precios: $20.00 - $10000.00


13- Llenar los datos faltantes

In [88]:
data['neighbourhood'].fillna("Florence,Tuscany,Italy", inplace = True)
data['host_response_time'].replace(np.nan, 'Sin registro', inplace = True)
data['host_response_rate'].replace(np.nan, 'Sin registro', inplace = True)
data['host_acceptance_rate'].replace(np.nan, 'Sin registro', inplace = True)
data['host_is_superhost'].replace(np.nan, 'Sin registro', inplace = True)
data['beds'].replace(np.nan, 0, inplace = True)
data['bedrooms'].replace(np.nan, 0, inplace = True)
data['bathrooms_text'].replace(np.nan, 'Sin registro', inplace = True)
data['has_availability'].replace(np.nan, 'Sin registro', inplace = True)
data['first_review'].replace(np.nan, 'Sin registro', inplace = True)
data['last_review'].replace(np.nan, 'Sin registro', inplace = True)
data['license'].replace(np.nan, 'Sin registro', inplace = True)
data['reviews_per_month'].replace(np.nan, 0, inplace = True)
data['days_since_last_review'].replace(np.nan, 'Sin registro', inplace = True)
data['review_scores'].replace(np.nan, 0, inplace = True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['neighbourhood'].fillna("Florence,Tuscany,Italy", inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['host_response_time'].replace(np.nan, 'Sin registro', inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because th

In [89]:
data.isnull().sum()

id                                              0
last_scraped                                    0
name                                            0
host_id                                         0
host_name                                       0
host_since                                      0
host_response_time                              0
host_response_rate                              0
host_acceptance_rate                            0
host_is_superhost                               0
host_total_listings_count                       0
host_has_profile_pic                            0
host_identity_verified                          0
neighbourhood                                   0
neighbourhood_cleansed                          0
latitude                                        0
longitude                                       0
property_type                                   0
room_type                                       0
accommodates                                    0


14- Crear columnas nuevas que simplifiquen

In [90]:
print("Creando nuevas columnas...\n")

# 1. Precio por persona
data['price_per_person'] = data['price'] / data['accommodates']
print("price_per_person: precio dividido entre capacidad")

# 4. Categoría de precio
data['price_category'] = pd.cut(
    data['price'],
    bins=[0, 50, 100, 200, 500, 10000],
    labels=['Muy Barato', 'Económico', 'Medio', 'Caro', 'Lujo']
)
print("price_category: clasificación de precio")

# 5. Tipo de propiedad simplificado
def simplificar_tipo(tipo):
    tipo = str(tipo).lower()
    if 'apartment' in tipo or 'rental unit' in tipo or 'condo' in tipo:
        return 'Apartment'
    elif 'house' in tipo or 'home' in tipo or 'villa' in tipo:
        return 'House'
    elif 'hotel' in tipo:
        return 'Hotel'
    elif 'bed and breakfast' in tipo or 'bnb' in tipo:
        return 'B&B'
    elif 'loft' in tipo:
        return 'Loft'
    else:
        return 'Other'

data['property_type_simple'] = data['property_type'].apply(simplificar_tipo)
print("property_type_simple: tipo de propiedad simplificado")


# 8. Tasa de ocupación
data['occupancy_rate'] = data['estimated_occupancy_l365d'] / 365
print("occupancy_rate: porcentaje de días ocupados")

# 9. Si es propiedad completa
data['is_entire_place'] = data['room_type'] == 'Entire home/apt'
print("is_entire_place: si es departamento/casa completa")

# 10. Categoría por capacidad
data['capacity_category'] = pd.cut(
    data['accommodates'],
    bins=[0, 2, 4, 6, 20],
    labels=['Pareja', 'Pequeño', 'Mediano', 'Grande']
)
print("capacity_category: clasificación por capacidad")



print(f"\n{12} nuevas columnas creadas")

Creando nuevas columnas...

price_per_person: precio dividido entre capacidad
price_category: clasificación de precio
property_type_simple: tipo de propiedad simplificado
occupancy_rate: porcentaje de días ocupados
is_entire_place: si es departamento/casa completa
capacity_category: clasificación por capacidad

12 nuevas columnas creadas


15- Eliminar las columnas que ya no sirven

In [91]:
columnas_viejas = ['property_type','estimated_occupancy_l365d','room_type']

columnas_viejas = [col for col in columnas_viejas if col in data.columns]
data = data.drop(columns=columnas_viejas)

print(f"Eliminadas {len(columnas_viejas)} columnas innecesarias")
print(f"Columnas restantes: {data.shape[1]}")

Eliminadas 3 columnas innecesarias
Columnas restantes: 53


In [92]:
data.info()
data.to_csv("listings_cleaned.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 10983 entries, 0 to 13042
Data columns (total 53 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   id                                            10983 non-null  int64         
 1   last_scraped                                  10983 non-null  datetime64[ns]
 2   name                                          10983 non-null  object        
 3   host_id                                       10983 non-null  int64         
 4   host_name                                     10983 non-null  object        
 5   host_since                                    10983 non-null  datetime64[ns]
 6   host_response_time                            10983 non-null  object        
 7   host_response_rate                            10983 non-null  object        
 8   host_acceptance_rate                          10983 non-null  object   