In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw listings file; it is read as latin-1 rather than pandas' default UTF-8.
data = pd.read_csv(filepath_or_buffer="Mexico.csv", encoding="latin-1")
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,last_scraped,source,name,description,neighborhood_overview,host_name,host_since,host_location,host_about,host_response_time,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month
0,25/09/2024,city scrape,Villa Dante,"Dentro de Villa un estudio de arte con futon, ...","Santa Fe Mall, Interloma Park and the Lion Des...",Dici,28/06/2010,"Mexico City, Mexico","Master in visual arts, film photography & Mark...",a few days or more,...,,,,,,,,f,1,
1,26/09/2024,previous scrape,Condesa Haus,A new concept of hosting in mexico through a b...,,Fernando,09/08/2010,"Mexico City, Mexico",Condesa Haus offers independent studios and ...,within an hour,...,4.58,4.56,4.7,4.87,4.78,4.98,4.48,f,8,0.41
2,26/09/2024,city scrape,"2 bedroom apt. deco bldg, Condesa","Comfortably furnished, sunny, 2 bedroom apt., ...",,Nicholas,04/01/2011,"Mexico City, Mexico","I am a journalist writing about food, (book an...",within an hour,...,4.9,4.81,4.75,4.94,4.92,4.98,4.91,f,2,0.31
3,25/09/2024,city scrape,Beautiful light Studio Coyoacan- full equipped !,COYOACAN designer studio quiet & safe! well eq...,Coyoacan is a beautiful neighborhood famous fo...,Trisha,24/08/2010,"Mexico City, Mexico","I am a mother, documentary film maker and phot...",within a few hours,...,4.91,4.9,4.96,4.96,4.98,4.96,4.92,f,3,0.83
4,25/09/2024,city scrape,NEW DESIGNER LOFT,Is the best ever place triple L <br />Location...,"Is located in the best area of Mexico City, Po...",Andrea,27/04/2011,"Mexico City, Mexico",I Leave in Mexico City... I am an Architect an...,within an hour,...,4.91,5.0,5.0,5.0,4.73,4.91,4.82,f,3,0.11


In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26582 entries, 0 to 26581
Data columns (total 50 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   last_scraped                    26582 non-null  object 
 1   source                          26582 non-null  object 
 2   name                            26582 non-null  object 
 3   description                     25649 non-null  object 
 4   neighborhood_overview           15687 non-null  object 
 5   host_name                       26582 non-null  object 
 6   host_since                      26582 non-null  object 
 7   host_location                   21141 non-null  object 
 8   host_about                      15617 non-null  object 
 9   host_response_time              22960 non-null  object 
 10  host_response_rate              22960 non-null  object 
 11  host_acceptance_rate            23713 non-null  object 
 12  host_is_superhost               

In [4]:
# Count the missing values in every column of the raw frame.
valores_nulos = data.isna().sum()
valores_nulos

last_scraped                          0
source                                0
name                                  0
description                         933
neighborhood_overview             10895
host_name                             0
host_since                            0
host_location                      5441
host_about                        10965
host_response_time                 3622
host_response_rate                 3622
host_acceptance_rate               2869
host_is_superhost                  1509
host_listings_count                   0
host_total_listings_count             0
host_verifications                    0
host_has_profile_pic                  0
host_identity_verified                0
neighbourhood_cleansed                0
property_type                         0
room_type                             0
accommodates                          0
bathrooms                          2915
bedrooms                            866
beds                               2953


In [5]:
# data2 is the working copy that receives every fill below; `data` stays untouched.
data2 = data.copy(deep=True)

In [6]:
# description
# Listings without a description receive one generic text intended to suit any property.
DEFAULT_DESCRIPTION = "Your best option to stay if you want to visit Mexican lands and enjoy your stay to the fullest without worrying about comfort"
data2["description"] = data2["description"].fillna(DEFAULT_DESCRIPTION)

In [7]:
# neighborhood_overview
# Missing overviews receive one generic text intended to suit any neighbourhood.
DEFAULT_NEIGHBORHOOD_OVERVIEW = "Quiet and safe area, with the characteristics of a typical neighborhood in Mexico City area of Mexico City."
data2["neighborhood_overview"] = data2["neighborhood_overview"].fillna(DEFAULT_NEIGHBORHOOD_OVERVIEW)

In [8]:
# host_location
# The dataset covers Mexico City, so hosts with no location are assigned
# "Mexico City, Mexico".
DEFAULT_HOST_LOCATION = "Mexico City, Mexico"
data2["host_location"] = data2["host_location"].fillna(DEFAULT_HOST_LOCATION)


In [9]:
# host_about
# Hosts with no "about" text receive a generic description of a friendly host.
DEFAULT_HOST_ABOUT = "I am a social person, who likes to provide the best customer service, give them advice and recommendations, so that they have everything they are looking for in a suite so that they feel comfortable and at home."
data2["host_about"] = data2["host_about"].fillna(DEFAULT_HOST_ABOUT)


In [10]:
# host_response_time
# Author's rationale: most hosts answer within a day, so missing values
# are filled with "within a day".
DEFAULT_RESPONSE_TIME = "within a day"
data2["host_response_time"] = data2["host_response_time"].fillna(DEFAULT_RESPONSE_TIME)


In [11]:
# host_response_rate
# The values carry a trailing "%" that blocks averaging, so they are cast to text,
# stripped of the symbol and converted to float. Missing entries become the text
# "nan" after the cast, which the float conversion turns back into NaN.
data2["host_response_rate"] = data2["host_response_rate"].astype(str)
host_response_numeric = (
    data2["host_response_rate"]
    .str.rstrip("%")
    .astype(float)
)

# Column average; NaN entries are skipped by mean().
mean_response_rate = host_response_numeric.mean()

# Missing rates take the average, then every value is written back as a
# whole-number percentage string.
host_response_numeric = host_response_numeric.fillna(mean_response_rate)
data2["host_response_rate"] = host_response_numeric.map(lambda rate: f"{rate:.0f}%")




In [12]:
# host_acceptance_rate
# The values carry a trailing "%" that blocks averaging, so they are cast to text,
# stripped of the symbol and converted to float. Missing entries become the text
# "nan" after the cast, which the float conversion turns back into NaN.
data2["host_acceptance_rate"] = data2["host_acceptance_rate"].astype(str)
host_acceptance_numeric = (
    data2["host_acceptance_rate"]
    .str.rstrip("%")
    .astype(float)
)

# Column average; NaN entries are skipped by mean().
mean_acceptance_rate = host_acceptance_numeric.mean()

# Missing rates take the average, then every value is written back as a
# whole-number percentage string.
host_acceptance_numeric = host_acceptance_numeric.fillna(mean_acceptance_rate)
data2["host_acceptance_rate"] = host_acceptance_numeric.map(lambda rate: f"{rate:.0f}%")

In [13]:
# host_is_superhost
# Author's rationale: superhost status requires verification, so an unknown
# status is treated as false ("f").
NOT_SUPERHOST = "f"
data2["host_is_superhost"] = data2["host_is_superhost"].fillna(NOT_SUPERHOST)

In [14]:
# Re-count the missing values per column in the working copy.
valores_nulos = data2.isna().sum()
valores_nulos

last_scraped                         0
source                               0
name                                 0
description                          0
neighborhood_overview                0
host_name                            0
host_since                           0
host_location                        0
host_about                           0
host_response_time                   0
host_response_rate                   0
host_acceptance_rate                 0
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
property_type                        0
room_type                            0
accommodates                         0
bathrooms                         2915
bedrooms                           866
beds                              2953
amenities                

In [15]:
# bathrooms
# The median (rounded to one decimal) is used so the fill does not noticeably
# shift the distribution. It is computed on the raw frame `data`.
bathrooms_median = round(data["bathrooms"].median(), 1)
data2["bathrooms"] = data2["bathrooms"].fillna(bathrooms_median)

In [16]:
# bedrooms
# The median (rounded to one decimal) is used so the fill does not noticeably
# shift the distribution. It is computed on the raw frame `data`.
bedrooms_median = round(data["bedrooms"].median(), 1)
data2["bedrooms"] = data2["bedrooms"].fillna(bedrooms_median)

In [17]:
# beds
# The median (rounded to one decimal) is used so the fill does not noticeably
# shift the distribution. It is computed on the raw frame `data`.
beds_median = round(data["beds"].median(), 1)
data2["beds"] = data2["beds"].fillna(beds_median)

In [18]:
# price
# The column is cast to text, the "$" and "," symbols are removed and the result
# is converted to numbers. Anything that still cannot be parsed becomes NaN
# (errors="coerce"), and those gaps are filled with the column median rounded to
# one decimal. Author's rationale for the median: the price values are very large.
data2["price"] = data2["price"].astype(str)
# Raw-string pattern: the previous '[\$,]' literal contained an invalid escape
# sequence and made Python emit a warning. Inside a character class "$" is
# already literal, so no backslash is needed and the regex is unchanged.
data2["price"] = data2["price"].str.replace(r"[$,]", "", regex=True)
# Convert to numeric
data2["price"] = pd.to_numeric(data2["price"], errors="coerce")
data2["price"] = data2["price"].fillna(round(data2["price"].median(), 1))



  data2["price"] = data2["price"].str.replace('[\$,]', '', regex=True)


In [22]:
# has_availability
# Author's note: most listings are "t", but missing flags are filled with "f"
# so that unknown listings are not counted among the ones that are truly available.
NOT_AVAILABLE = "f"
data2["has_availability"] = data2["has_availability"].fillna(NOT_AVAILABLE)

In [23]:
# first_review
# Listings without a first review receive a sentinel date (1900-01-01) that marks
# the review as non-existent.
# NOTE(review): the fill value is a pd.Timestamp; if this column holds date strings
# the result mixes types. Confirm the column's format before parsing it downstream.
FIRST_REVIEW_SENTINEL = pd.Timestamp("1900-01-01")
data2["first_review"] = data2["first_review"].fillna(FIRST_REVIEW_SENTINEL)

In [24]:
# last_review
# Listings without a last review receive a sentinel date (1900-01-01) that marks
# the review as non-existent.
# NOTE(review): the fill value is a pd.Timestamp; if this column holds date strings
# the result mixes types. Confirm the column's format before parsing it downstream.
LAST_REVIEW_SENTINEL = pd.Timestamp("1900-01-01")
data2["last_review"] = data2["last_review"].fillna(LAST_REVIEW_SENTINEL)

In [25]:
# review_scores_rating
# Missing ratings take the column median (rounded to one decimal) so the fill does
# not noticeably shift the distribution. The median comes from the raw frame `data`.
rating_median = round(data["review_scores_rating"].median(), 1)
data2["review_scores_rating"] = data2["review_scores_rating"].fillna(rating_median)

In [26]:
# Re-count the missing values per column in the working copy.
valores_nulos = data2.isna().sum()
valores_nulos

last_scraped                         0
source                               0
name                                 0
description                          0
neighborhood_overview                0
host_name                            0
host_since                           0
host_location                        0
host_about                           0
host_response_time                   0
host_response_rate                   0
host_acceptance_rate                 0
host_is_superhost                    0
host_listings_count                  0
host_total_listings_count            0
host_verifications                   0
host_has_profile_pic                 0
host_identity_verified               0
neighbourhood_cleansed               0
property_type                        0
room_type                            0
accommodates                         0
bathrooms                            0
bedrooms                             0
beds                                 0
amenities                

In [27]:
# review_scores_accuracy: missing scores take the column median (rounded to one
# decimal), computed on the raw frame `data`.
accuracy_median = round(data["review_scores_accuracy"].median(), 1)
data2["review_scores_accuracy"] = data2["review_scores_accuracy"].fillna(accuracy_median)

# Confirm that no nulls remain in the column.
valores_nulos_columna = data2["review_scores_accuracy"].isna().sum()
valores_nulos_columna

np.int64(0)

In [28]:
# review_scores_cleanliness: first substitution method, where missing scores take
# the column MEAN (rounded to one decimal), computed on the raw frame `data`.
# NOTE(review): the original header comment said "median" while the code uses the
# mean; confirm which statistic is intended.
cleanliness_mean = round(data["review_scores_cleanliness"].mean(), 1)
data2["review_scores_cleanliness"] = data2["review_scores_cleanliness"].fillna(cleanliness_mean)


# Confirm that no nulls remain in the column.
valores_nulos_columna = data2["review_scores_cleanliness"].isna().sum()
valores_nulos_columna

np.int64(0)

In [29]:
# review_scores_checkin: missing scores take the column median (rounded to one
# decimal), computed on the raw frame `data`.
checkin_median = round(data["review_scores_checkin"].median(), 1)
data2["review_scores_checkin"] = data2["review_scores_checkin"].fillna(checkin_median)

# Confirm that no nulls remain in the column.
valores_nulos_columna = data2["review_scores_checkin"].isna().sum()
valores_nulos_columna

np.int64(0)

In [30]:
# review_scores_communication: missing scores take the column median (rounded to
# one decimal), computed on the raw frame `data`.
communication_median = round(data["review_scores_communication"].median(), 1)
data2["review_scores_communication"] = data2["review_scores_communication"].fillna(communication_median)

# Confirm that no nulls remain in the column.
valores_nulos_columna = data2["review_scores_communication"].isna().sum()
valores_nulos_columna

np.int64(0)

In [31]:
# review_scores_location: missing scores take the column median (rounded to one
# decimal), computed on the raw frame `data`.
location_median = round(data["review_scores_location"].median(), 1)
data2["review_scores_location"] = data2["review_scores_location"].fillna(location_median)

# Confirm that no nulls remain in the column.
valores_nulos_columna = data2["review_scores_location"].isna().sum()
valores_nulos_columna

np.int64(0)

In [32]:
# review_scores_value: first substitution method, where missing scores take the
# column mean (rounded to one decimal), computed on the raw frame `data`.
value_mean = round(data["review_scores_value"].mean(), 1)
data2["review_scores_value"] = data2["review_scores_value"].fillna(value_mean)


# Confirm that no nulls remain in the column.
valores_nulos_columna = data2["review_scores_value"].isna().sum()
valores_nulos_columna

np.int64(0)

In [None]:

# reviews_per_month: missing values take the column median (rounded to one
# decimal), computed on the raw frame `data`.
reviews_per_month_median = round(data["reviews_per_month"].median(), 1)
data2["reviews_per_month"] = data2["reviews_per_month"].fillna(reviews_per_month_median)

# Re-count the missing values per column across the whole working copy.
valores_nulos = data2.isna().sum()
valores_nulos

last_scraped                      0
source                            0
name                              0
description                       0
neighborhood_overview             0
host_name                         0
host_since                        0
host_location                     0
host_about                        0
host_response_time                0
host_response_rate                0
host_acceptance_rate              0
host_is_superhost                 0
host_listings_count               0
host_total_listings_count         0
host_verifications                0
host_has_profile_pic              0
host_identity_verified            0
neighbourhood_cleansed            0
property_type                     0
room_type                         0
accommodates                      0
bathrooms                         0
bedrooms                          0
beds                              0
amenities                         0
price                             0
minimum_nights              

In [34]:
# Persist the cleaned frame. index=True is the pandas default, spelled out here:
# the row index is written as an unnamed first column of the CSV.
data2.to_csv("Mexico_sin_nulos.csv", index=True)