# Análise Exploratória de Dados - EDA

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
%matplotlib inline



In [3]:
#missing data
def missingData(df:pd.DataFrame, num:int=20, limit=0.0):
    total = df.isnull().sum().sort_values(ascending=False)   
    percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'])
    if limit != 0.0:
        limit_perc = missing_data['Percent'] >= limit
        num = missing_data.Percent[limit_perc].count()
    return missing_data.head(num)

In [4]:
def infoDf( df:pd.DataFrame):
    df_info = pd.DataFrame({
      'Columns': df.columns,
      'Type': df.dtypes,
      'Unique': df.nunique(),
      'Size': df.shape[0],
      'Missing': df.isna().sum()
  })
    df_info['% Unique'] = round(df_info['Unique']/df_info['Size'], 6)
    df_info['% Missing'] = (df_info['Missing']/df_info['Size'])
    return df_info

In [5]:
landing_path = "./data/landing/"
file_listing = landing_path + 'listings.csv'

In [6]:
df_listings = pd.read_csv(file_listing)
df_listings.shape

(36008, 75)

In [7]:
df_listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,17878,https://www.airbnb.com/rooms/17878,20231226034138,2023-12-27,city scrape,Condo in Rio de Janeiro · ★4.70 · 2 bedrooms ·...,,This is the one of the bests spots in Rio. Bec...,https://a0.muscache.com/pictures/65320518/3069...,68997,...,4.91,4.77,4.67,,f,1,1,0,0,1.9
1,25026,https://www.airbnb.com/rooms/25026,20231226034138,2023-12-27,city scrape,Rental unit in Rio de Janeiro · ★4.72 · 1 bedr...,,Copacabana is a lively neighborhood and the ap...,https://a0.muscache.com/pictures/a745aa21-b8dd...,102840,...,4.92,4.84,4.6,,f,1,1,0,0,1.67
2,35764,https://www.airbnb.com/rooms/35764,20231226034138,2023-12-27,city scrape,Loft in Rio de Janeiro · ★4.90 · 1 bedroom · 1...,,Our guests will experience living with a local...,https://a0.muscache.com/pictures/23782972/1d3e...,153691,...,4.95,4.94,4.89,,f,1,1,0,0,2.82
3,41198,https://www.airbnb.com/rooms/41198,20231226034138,2023-12-27,city scrape,Rental unit in Rio de Janeiro · ★4.21 · 2 bedr...,,,https://a0.muscache.com/pictures/3576716/2d6a9...,178975,...,4.56,4.44,4.38,,f,2,2,0,0,0.13
4,326205,https://www.airbnb.com/rooms/326205,20231226034138,2023-12-27,city scrape,Condo in Rio de Janeiro · ★4.57 · 1 bedroom · ...,,,https://a0.muscache.com/pictures/c550151d-910c...,1603206,...,4.77,4.83,4.59,,f,5,5,0,0,1.07


In [7]:
#!pip install ydata_profiling
#from ydata_profiling import ProfileReport
#profile = ProfileReport(df_listings, title="Pandas Profiling Report") #cria o relatório

#profile.to_file("resultados.html") #salva os resultados em um arquivo

# 1. Analisandos Features e Dados Missing

In [8]:
infoDf(df_listings)

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
id,id,int64,36008,36008,0,1.000000,0.000000
listing_url,listing_url,object,36008,36008,0,1.000000,0.000000
scrape_id,scrape_id,int64,1,36008,0,0.000028,0.000000
last_scraped,last_scraped,object,5,36008,0,0.000139,0.000000
source,source,object,2,36008,0,0.000056,0.000000
...,...,...,...,...,...,...,...
calculated_host_listings_count,calculated_host_listings_count,int64,58,36008,0,0.001611,0.000000
calculated_host_listings_count_entire_homes,calculated_host_listings_count_entire_homes,int64,60,36008,0,0.001666,0.000000
calculated_host_listings_count_private_rooms,calculated_host_listings_count_private_rooms,int64,18,36008,0,0.000500,0.000000
calculated_host_listings_count_shared_rooms,calculated_host_listings_count_shared_rooms,int64,16,36008,0,0.000444,0.000000


In [130]:
missingData(df_listings, num=77, limit=0.000001)

Unnamed: 0,Total,Percent
bedrooms,36008,1.0
license,36008,1.0
calendar_updated,36008,1.0
description,36008,1.0
bathrooms,36008,1.0
neighbourhood_group_cleansed,36008,1.0
host_about,18644,0.517774
neighborhood_overview,17736,0.492557
neighbourhood,17736,0.492557
last_review,10302,0.286103


In [10]:
def analiseDeColunasComCorrespondencia(filtro:pd.Series, df:pd.DataFrame(), coluna_alvo:str ):
    '''
    Essa função faz um verificação dos valores da coluna_alvo com os valores contidos em outras colunas.
    Objetivo dela é tentar obter informações que sejam relavantes para o entendimentos dos dados,
    como por exemplo se os valores dessa coluna_alvo estão contidos em outras colunas,
    ou se quando a coluna_alvo é nula outras colunas tambem são.
    '''
    columns = df[filtro].columns

    df_result = pd.DataFrame({
                    'Columns': df[filtro].columns,
                    'quantidade': 0, 
                    'NaN':0,
                    'Qtd.Registros':df[filtro].shape[0]
                    
                      })

    for index, row in df[filtro].iterrows():
        for col in columns:
            if col != coluna_alvo:
                filtra_registro = (df_result['Columns'] == col)
                if str(row[coluna_alvo]) in str(row[col]):                    
                    df_result.quantidade[filtra_registro] = df_result.quantidade[filtra_registro] + 1                

                if str(row[col])=='nan' :
                    df_result['NaN'][filtra_registro] += 1
                    
    df_result['%'] = round(df_result.quantidade / df[filtro].shape[0], 6)
    df_result['% NaN'] = round(df_result['NaN'] / df[filtro].shape[0], 6)
    return df_result

## 1.2. Removendo colunas com 100% de valores NaN

Com base nas informações de quantidade de valores missing que existem em cada uma das colunas, identificamos que as colunas ***"bedrooms", "license", "calendar_updated", "description", "bathrooms","neighbourhood_group_cleansed"*** estão com 100% dos seus dados com valores missing.

In [11]:
temp_missing = missingData(df_listings, num=75, limit=1)

In [12]:
for col in temp_missing.index:
    print(f"{col}: ", df_listings[col].unique())
    

bedrooms:  [nan]
license:  [nan]
calendar_updated:  [nan]
description:  [nan]
bathrooms:  [nan]
neighbourhood_group_cleansed:  [nan]


In [13]:
# Identificando as colunas com 100% de missing values
colunas_nan = temp_missing.index

df_eda = df_listings.copy()
df_eda.drop(columns=colunas_nan, inplace=True)
df_eda.shape, df_listings.shape


((36008, 69), (36008, 75))

----

## 1.3. Analisando Dados missing das outras colunas com valores NaN

In [14]:
#df_missing = missingData(df_eda, num=75, limit=0.00001)
missingData(df_eda, num=75, limit=0.00001)

Unnamed: 0,Total,Percent
host_about,18644,0.517774
neighbourhood,17736,0.492557
neighborhood_overview,17736,0.492557
first_review,10302,0.286103
reviews_per_month,10302,0.286103
last_review,10302,0.286103
review_scores_checkin,10287,0.285687
review_scores_accuracy,10286,0.285659
review_scores_location,10286,0.285659
review_scores_value,10286,0.285659


In [15]:
df_info = infoDf(df_eda)

In [16]:
# Analisando os valores de cada uma das colunas que possuem valores missing
perc_missing = df_info['% Missing'] >= 0.000001

for col in df_info[perc_missing].index:
    row = df_info.index == col
    unique = df_eda[col].unique
    print("==============================")
    print(f"Coluna: {col}")
    print("* ", df_info[row])
    print('############')
    print("Unique: ", unique)
    print('############')
    

Coluna: neighborhood_overview
*                                       Columns    Type  Unique   Size  Missing  \
neighborhood_overview  neighborhood_overview  object   16008  36008    17736   

                       % Unique  % Missing  
neighborhood_overview  0.444568   0.492557  
############
Unique:  <bound method Series.unique of 0        This is the one of the bests spots in Rio. Bec...
1        Copacabana is a lively neighborhood and the ap...
2        Our guests will experience living with a local...
3                                                      NaN
4                                                      NaN
                               ...                        
36003                                                  NaN
36004    Copacabana is located in Rio de Janeiro's stun...
36005                                                  NaN
36006                                                  NaN
36007    O apartamento está localizado no centro do Rio...
Name: neighbor

Iremos realizar a analise individual de coluna a coluna para entender melhor os seus dados e escolher a melhor de forma de tratar os dados missing de cada uma individualmente.

In [17]:
# Lista de Colunas com Dados Missing
perc_missing = df_info['% Missing'] > 0.0
df_info[perc_missing].index

Index(['neighborhood_overview', 'host_name', 'host_since', 'host_location',
       'host_about', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
       'host_picture_url', 'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'bathrooms_text', 'beds', 'price', 'has_availability', 'first_review',
       'last_review', 'review_scores_rating', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'reviews_per_month'],
      dtype='object')

### 1.3.1. Feature: *'neighborhood_overview'*

In [18]:
df_eda['neighborhood_overview'].unique()

array(['This is the one of the bests spots in Rio. Because of the large balcony and proximity to the beach, it has huge advantages in the current situation.',
       "Copacabana is a lively neighborhood and the apartment is located very close to an area in Copa full of bars, cafes and restaurants at Rua Bolivar and Domingos Ferreira. Copacabana never sleeps, there is always movement and it's a great mix of all kinds of people.",
       'Our guests will experience living with a local peole "Carioca"  in a very friendly building with 24 hours a day security with all kind of stores, banks, transports, restaurants.',
       ...,
       'Apartamento se localiza no melhor ponto de Copacabana, munido de toda infraestrutura para uma estadia de comodidade: próximo a bares, restaurantes, supermercados, lojas de conveniência, quiosques beira mar.',
       'Barra da Tijuca é um dos bairros mais nobres do Rio de Janeiro, que conta com as melhores praias e restaurantes. Vale conhecer a Praia da Joat

In [19]:
df_info[df_info.index =='neighborhood_overview']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
neighborhood_overview,neighborhood_overview,object,16008,36008,17736,0.444568,0.492557


**neighborhood_overview.isna()**

In [None]:
#neighborhood_overview_isna = df_eda['neighborhood_overview'].isna()
#df_infneighborhood_overview = analiseDeColunasComCorrespondencia(filtro=neighborhood_overview_isna, df=df_eda, coluna_alvo='neighborhood_overview' )


In [None]:
#df_infneighborhood_overview

In [None]:
#df_infneighborhood_overview[df_infneighborhood_overview['%']>0]

**neighborhood_overview.notna()**

In [None]:
#neighborhood_overview_notna = df_eda['neighborhood_overview'].notna()
#df_infNeighbourhood_notna = analiseDeColunasComCorrespondencia(filtro=neighborhood_overview_notna, df=df_eda, coluna_alvo='neighborhood_overview' )


In [None]:
#df_infNeighbourhood_notna

In [None]:
#df_infNeighbourhood_notna[df_infNeighbourhood_notna['%']>0]

In [20]:
# A feature neighborhood_overview apresenta dados de texto, e por esse motivo iremos substituir os valores NaN por "Não informado".
df_eda['neighborhood_overview'].fillna("nao informado", inplace=True)

### 1.3.2. Feature: *''neighbourhood'*

In [21]:
df_info[df_info.index =='neighbourhood']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
neighbourhood,neighbourhood,object,286,36008,17736,0.007943,0.492557


In [22]:
df_eda['neighbourhood'].unique()

array(['Rio de Janeiro, Brazil', nan, 'Ipanema, Rio de Janeiro, Brazil',
       'Rio, Rio de Janeiro, Brazil', 'Rio de Janeiro, Rj, Brazil',
       'Joatinga, Rio de Janeiro, Brazil',
       'Santa Teresa, Rio de Janeiro, Brazil',
       'Rio de Janeiro , Rio de Janeiro, Brazil',
       'Rio de janeiro , Rio de Janeiro, Brazil',
       'Barra da Tijuca, Rio de Janeiro, Brazil',
       'Itanhangá, Rio de Janeiro, Brazil',
       'Praça Seca, Rio de Janeiro, Brazil',
       'Copacabana, Rio de Janeiro, Brazil',
       'Rio de Janeiro / Copacabana , Rio de Janeiro, Brazil',
       'Urca, Rio de Janeiro, Brazil',
       'Rio de Janeiro - Leblon, Rio de Janeiro, Brazil',
       'Rio de Janeiro, Recreio dos bandeirantes, Brazil',
       'Jacarepagua, Rio de Janeiro, Brazil',
       'Rio de Janeiro, Copacabana, Brazil',
       'Rio de Janeiro, État de Rio de Janeiro, Brazil',
       'RJ, Rio de Janeiro, Brazil', 'Riocentro, Rio de Janeiro, Brazil',
       'Rio De Janeiro, Brazil',
       'Cop

Analisando os valores unicos da features *'neighbourhood'* temos a impressão que essa features apresenta dados referente a localização do local, como cidade, estado, pais e em alguns casos bairros, ruas e outros detalhes da localização.

Diante desse entendimento, iremos verificar se os registros que possuem a feature *'neighbourhood'* como não nulo possuem correspondencia outra feature. Caso isso seja confirmado, poderemos preencher os valores missing dessa features com base na features que ela possui mais correspondencia, ou até mesmo decidir pelo remoção dessa feature do conjunto de dados

In [23]:
neighbourhood_isna = df_eda['neighbourhood'].isna()

Iremos realizar um comparativo nos registros que possuem valores não nulos na coluna *'neighbourhood'* com os valores das outras colunas. O objetivo é verificar qual coluna possue semelhanças com a *'neighbourhood'*, e quantidade de registros com essas semelhanças.

In [None]:
df_infNeighbourhood = analiseDeColunasComCorrespondencia(filtro=df_eda['neighbourhood'].notna(), df=df_eda, coluna_alvo='neighbourhood' )

In [None]:
df_infNeighbourhood[df_infNeighbourhood['quantidade']>0]

Com as analises acima identificamos que a coluna *'host_location'* é a que mais possui correspondencia com a *"neighbourhood"*, porém apenas 41% dos registros que possuem correspondencia com ao *"neighbourhood"*.


Visualização nos registros que possuem não correspondencia entre a *'host_location'* e a *"neighbourhood"*

In [None]:
df_auxNeigbourhood = df_eda.copy()
df_auxNeigbourhood['Neigh_Host'] = df_eda['neighbourhood']

In [None]:

num = df_auxNeigbourhood.shape[0]
for idx in range(0,num):    
    if str(df_auxNeigbourhood['neighbourhood'].iloc[idx]) in str(df_auxNeigbourhood['host_location'].iloc[idx]):
    
        df_auxNeigbourhood['Neigh_Host'].iloc[idx] = 'S'
    else:
        df_auxNeigbourhood['Neigh_Host'].iloc[idx] = 'N'


                                   

In [None]:
filtro_1 = (df_auxNeigbourhood['Neigh_Host'] =='N')
filtro_2 = (df_auxNeigbourhood['neighbourhood'].notna())
colunas_ax = ['neighbourhood', 'host_location']

df_auxNeigbourhood[colunas_ax][ filtro_1 & filtro_2]

O valor da *"neighbourhood"* é referente ao bairro, e devido ao numero de dados missing e a falta de padronização, iremos preencher todos que estão como NaN como "Rio de Janeiro, Brazil", devido esse dataset ser referente a dados do Rio de Janeiro, Brazil"

In [24]:
#neighbourhood_isna = df_eda['neighbourhood'].isna()
#df_eda['neighbourhood'][neighbourhood_isna] = "Rio de Janeiro, Brazil"

df_eda['neighbourhood'].fillna("nao informado", inplace=True)

### 1.3.4. Feature: *'host_name'*

Quando visualizamos o numero de missing de cada um das features podemos notar que a features *'host_name'* possuía a mesma quantidade de missing que outras colunas, então iremos verificar quais as outras features possuem dados missing quando o host_name possui um valor missing.

In [25]:
filtro_host_name = df_eda['host_name'].isna()
df_eda.id[filtro_host_name].count()

9

In [26]:
df_info[df_info.index =='host_name']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_name,host_name,object,6268,36008,9,0.174072,0.00025


In [None]:
filtro_host_name = df_eda['host_name'].isna()

df_infHost = analiseDeColunasComCorrespondencia(filtro=filtro_host_name, df=df_eda, coluna_alvo='host_name' )

In [None]:
df_infHost[df_infHost['%']>0]

In [None]:
df_infHost[df_infHost['%']>=1]

In [None]:
cols_infHost = df_infHost['Columns'][df_infHost['%']>=1]

In [None]:
missingData(df_eda[cols_infHost])

O dataframe possui 9 registros com o campo "host_name" com valor missing. Com a analise de comparação de registros com valores missing na coluna "host_name" com as outras colunas, obtivemos um numero de 13 feature que sempre que a feature "host_name" é missing, os valores dessas outras colunas tambem são missing. Porém dentre essas 13 features, 7 delas sempre são NaN quando a *"host_name"* for NaN. As outras 6 colunas tambem possuem valor NaN quando a *"host_name"* é NaN, porém possui valores NaN em outros registros em que o *"host_name"* não é NaN.

host_name, host_since, host_thumbnail_url, host_picture_url, host_listings_count, host_total_listings_count, host_has_profile_pic, host_identity_verified



**Analise de NotNa na feature *'host_name'***

In [None]:
filtro_host_name_notna = df_eda['host_name'].notna()

df_infHost_notna = analiseDeColunasComCorrespondencia(filtro=filtro_host_name_notna, df=df_eda, coluna_alvo='host_name' )

In [None]:
df_infHost_notna[df_infHost_notna['%']>0]

In [27]:
df_eda['host_name'].unique()

array(['Matthias', 'Viviane', 'Patricia Miranda & Paulo', ...,
       'João Renato', 'Kauanne Bruna', 'Grenrick'], dtype=object)

Fizemos essa analise com o objetivo de verificar se alguma outra feature tivesse informações que poderiamos utilizar no preenchimento dos valores missing da feature *"host_name"*. Mas não conseguimos identificar nenhuma outra feature que poderíamos utilizar para o preenchimento dos missing values da feature *"host_name"*

Desta forma, iremos tratar os dados missing da coluna host_name preenchendo com o valor 'nao informado'

In [28]:
df_eda['host_name'].fillna("nao informado", inplace=True)

### 1.3.5. Feature: *'host_since'*

In [29]:
df_info[df_info.index =='host_since']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_since,host_since,object,4441,36008,9,0.123334,0.00025


In [30]:
df_eda['host_since'].unique()

array(['2010-01-08', '2010-04-03', '2010-06-27', ..., '2023-12-23',
       '2019-11-29', '2023-12-24'], dtype=object)

o campo host_since se refere a quando o host tem cadastro, porem quando não tem um host não havera um data. Então para conseguirmos identificar quais são esses cadastros futuramente sem que percamos os dados, iremos preencher os campos null com uma data totalmente fora do padrão. 

Uma outra opção seria preencher os null com o mesmo valor que está na variavel *'last_scraped'*, mas optamos por seguir a alternativa anterior


In [31]:
#host_since_isna = df_eda['host_since'].isna()
#df_eda['host_since'][host_since_isna] = '1900-01-01'
df_eda['host_since'].fillna("1990-01-01", inplace=True)

### 1.3.6. Feature: *'host_location'*

In [32]:
df_info[df_info.index =='host_location']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_location,host_location,object,592,36008,7029,0.016441,0.195207


In [33]:
df_eda['host_location'].unique()

array(['Rio de Janeiro, Brazil', nan, 'Rio, Brazil', 'Brazil', 'Mesa, AZ',
       'State of Rio de Janeiro, Brazil', 'San Diego, CA',
       'Florianópolis, Brazil', 'Sao Paulo, Brazil', 'Berlin, Germany',
       'Miami Beach, FL', 'Palma, Spain', 'Zug, Switzerland',
       'Newport Beach, CA', 'Americana, Brazil', 'Barcelona, Spain',
       'João Pessoa, Brazil', 'Los Angeles, CA', 'Santiago, Chile',
       'London, United Kingdom', 'Massachusetts, United States',
       'Manchester, United Kingdom', 'United States', 'Panama',
       'São Paulo, Brazil', 'State of Espírito Santo, Brazil',
       'Curitiba, Brazil', 'Germany', 'Opatija, Croatia', 'Wolcott, CT',
       'Madrid, Spain', 'Lisbon, Portugal', 'New York, NY',
       'Buenos Aires, Argentina', 'Aberdeen, United Kingdom',
       'Alameda, CA', 'Além Paraíba, Brazil', 'Toulouse, France',
       'Amsterdam, Netherlands', 'Tournai, Belgium',
       'England, United Kingdom', 'Niterói, Brazil', 'Bangkok, Thailand',
       'Weehawk

In [None]:
filtro_host_location = df_eda['host_location'].isna()
df_eda.id[filtro_host_location].count()

df_infHost_location = analiseDeColunasComCorrespondencia(filtro=filtro_host_location, df=df_eda, coluna_alvo='host_location' )

In [None]:
df_infHost_location[df_infHost_location['%']>0]

In [33]:
df_eda['host_location'].fillna("nao informado", inplace=True)

### 1.3.7. Feature: *'host_about'*

In [34]:
df_info[df_info.index =='host_about']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_about,host_about,object,8243,36008,18644,0.228921,0.517774


In [35]:
df_eda['host_about'].unique()

array(['I  am a  journalist/writer. Lived  in NYC for 15 years. I  am now based in Rio and published 3 volumes of travel stories on AMAZ0N: "The World Is My Oyster". If you have never been to Rio, check out the first story, and you\'ll get an idea. Apart from Rio, you\'ll find  29 other travel stories from all around the globe.',
       'Hi guys,\n\nViviane is a commercial photographer, an avid world traveler,  (a former photographer for Airbnb) and an Airbnb superhost. And a free lance photographer for other wonderful clients. She loves life and meeting people.\n\nWe  work together in providing the best accommodation to people and we are\nfirm believers of enjoying the moment as a prime attitude towards life!\n',
       'Hello,   We are Patricia Miranda and Paulo.\nWe are a couple who love to meet new people, new cultures, we both are  very easy going persons,  We are retired after working for several years in tourism and an international airline company.  We also used do host in our 

In [36]:
df_eda['host_about'].fillna("nao informado", inplace=True)

### 1.3.8. Feature: *'host_response_time'*

In [37]:
df_info[df_info.index =='host_response_time']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_response_time,host_response_time,object,4,36008,5039,0.000111,0.139941


In [38]:
df_eda['host_response_time'].unique()

array(['within an hour', 'within a few hours', 'a few days or more',
       'within a day', nan], dtype=object)

In [39]:
df_eda['host_response_time'].fillna("nenhum", inplace=True)

### 1.3.9. Feature: *'host_response_rate'*

In [40]:
df_info[df_info.index =='host_response_rate']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_response_rate,host_response_rate,object,89,36008,5039,0.002472,0.139941


In [41]:
df_eda['host_response_rate'].unique()

array(['100%', '90%', '0%', '50%', nan, '96%', '70%', '63%', '80%', '99%',
       '93%', '91%', '10%', '60%', '86%', '30%', '88%', '67%', '71%',
       '89%', '97%', '11%', '78%', '68%', '17%', '81%', '21%', '75%',
       '94%', '13%', '92%', '73%', '40%', '20%', '83%', '33%', '95%',
       '14%', '25%', '29%', '43%', '69%', '98%', '22%', '79%', '55%',
       '57%', '82%', '31%', '44%', '23%', '56%', '77%', '84%', '38%',
       '87%', '3%', '72%', '85%', '53%', '49%', '62%', '52%', '18%',
       '58%', '64%', '36%', '76%', '6%', '35%', '27%', '59%', '8%', '61%',
       '42%', '37%', '47%', '65%', '46%', '41%', '74%', '9%', '7%', '39%',
       '16%', '19%', '48%', '32%', '5%', '4%'], dtype=object)

In [42]:
df_eda['host_response_rate'].fillna("0%", inplace=True)

### 1.3.10. Feature: *'host_acceptance_rate'*

In [43]:
df_info[df_info.index =='host_acceptance_rate']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_acceptance_rate,host_acceptance_rate,object,100,36008,4802,0.002777,0.133359


In [44]:
df_eda['host_acceptance_rate'].unique()


array(['96%', '80%', '98%', '89%', '93%', '69%', nan, '100%', '94%',
       '40%', '0%', '95%', '77%', '39%', '92%', '55%', '86%', '75%',
       '74%', '81%', '82%', '67%', '83%', '99%', '88%', '90%', '8%',
       '97%', '78%', '57%', '91%', '56%', '25%', '71%', '6%', '84%',
       '50%', '72%', '65%', '54%', '17%', '85%', '79%', '66%', '38%',
       '10%', '58%', '37%', '29%', '73%', '9%', '68%', '33%', '64%',
       '30%', '60%', '51%', '49%', '43%', '62%', '87%', '16%', '63%',
       '61%', '7%', '23%', '45%', '20%', '47%', '52%', '44%', '53%',
       '13%', '21%', '76%', '70%', '36%', '22%', '42%', '28%', '11%',
       '14%', '18%', '46%', '3%', '48%', '27%', '19%', '4%', '59%', '15%',
       '32%', '34%', '35%', '2%', '24%', '5%', '12%', '31%', '41%', '26%'],
      dtype=object)

In [45]:
df_eda['host_acceptance_rate'].fillna("0%", inplace=True)

### 1.3.11. Feature: *'host_is_superhost'*

In [46]:
df_info[df_info.index =='host_is_superhost']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_is_superhost,host_is_superhost,object,2,36008,73,5.6e-05,0.002027


In [47]:
df_eda['host_is_superhost'].unique()

array(['t', 'f', nan], dtype=object)

In [48]:
df_eda['host_is_superhost'].fillna("f", inplace=True)

### 1.3.12. Feature: *'host_thumbnail_url'*

In [49]:
df_info[df_info.index =='host_thumbnail_url']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_thumbnail_url,host_thumbnail_url,object,21152,36008,9,0.587425,0.00025


In [50]:
df_eda['host_thumbnail_url'].unique()


array(['https://a0.muscache.com/im/pictures/user/67b13cea-8c11-49c0-a08d-7f42c330676e.jpg?aki_policy=profile_small',
       'https://a0.muscache.com/im/pictures/user/315ddc81-bea3-4bf0-8fc7-be197a6541ff.jpg?aki_policy=profile_small',
       'https://a0.muscache.com/im/users/153691/profile_pic/1277774787/original.jpg?aki_policy=profile_small',
       ...,
       'https://a0.muscache.com/im/pictures/user/66c691eb-1b74-4360-ab38-9a7db0c508e8.jpg?aki_policy=profile_small',
       'https://a0.muscache.com/im/pictures/user/User-536983374/original/4e564f8e-4b02-4be6-89bd-626a8af86a05.jpeg?aki_policy=profile_small',
       'https://a0.muscache.com/im/pictures/user/User-694816/original/8b1a8b4b-e697-483f-ac68-b13ef2c35e01.jpeg?aki_policy=profile_small'],
      dtype=object)

In [51]:
#df_eda['host_thumbnail_url'].fillna("nao informado", inplace=True)
#ou 
#df_eda.drop(columns=['host_thumbnail_url'], inplace=True)

### 1.3.13. Feature: *'host_picture_url'*

In [52]:
df_info[df_info.index =='host_picture_url']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_picture_url,host_picture_url,object,21152,36008,9,0.587425,0.00025


In [53]:
df_eda['host_picture_url'].unique()

array(['https://a0.muscache.com/im/pictures/user/67b13cea-8c11-49c0-a08d-7f42c330676e.jpg?aki_policy=profile_x_medium',
       'https://a0.muscache.com/im/pictures/user/315ddc81-bea3-4bf0-8fc7-be197a6541ff.jpg?aki_policy=profile_x_medium',
       'https://a0.muscache.com/im/users/153691/profile_pic/1277774787/original.jpg?aki_policy=profile_x_medium',
       ...,
       'https://a0.muscache.com/im/pictures/user/66c691eb-1b74-4360-ab38-9a7db0c508e8.jpg?aki_policy=profile_x_medium',
       'https://a0.muscache.com/im/pictures/user/User-536983374/original/4e564f8e-4b02-4be6-89bd-626a8af86a05.jpeg?aki_policy=profile_x_medium',
       'https://a0.muscache.com/im/pictures/user/User-694816/original/8b1a8b4b-e697-483f-ac68-b13ef2c35e01.jpeg?aki_policy=profile_x_medium'],
      dtype=object)

In [None]:
#df_eda['host_picture_url'].fillna("nao informado", inplace=True)
#ou 
#df_eda.drop(columns=['host_picture_url'], inplace=True)

### 1.3.14. Feature: *'host_neighbourhood'*

In [54]:
df_info[df_info.index =='host_neighbourhood']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_neighbourhood,host_neighbourhood,object,474,36008,7677,0.013164,0.213203


In [55]:
df_eda['host_neighbourhood'].unique()


array(['Copacabana', 'Flamengo', 'Ipanema', 'Laranjeiras', 'Santa Teresa',
       'Tijuca', 'Joá', 'Barra da Tijuca', 'Jardin Botânico', 'Leblon',
       'Vila da Penha', 'Botafogo', 'Lapa', 'Gávea', 'Leme', 'Vidigal',
       nan, 'Recreio dos Bandeirantes', 'Santo Cristo', 'Lagoa',
       'Humaitá', 'Itanhangá', 'São Conrado', 'Praça Seca', 'Glória',
       'Consolacao', 'Montmartre', 'Urca', 'Centro', 'Jardim Botânico',
       'Cosme Velho', 'Penha Circular', 'Independência', 'Estacio',
       'Praça da Bandeira', 'Gamboa', 'Bonsucesso', 'Barra de Guaratiba',
       'Piedade', 'Estácio', 'São Miguel', 'Curicica', 'Catete',
       'Engenho Novo', 'Rio Comprido', 'Loteamento Triangulo de Buzios',
       'Andaraí', 'Vila Isabel', 'Maracanã', 'Silom', 'Rocha',
       'Lins de Vasconcelos', 'Cavalcante', 'Todos os Santos',
       'Jacarepaguá', 'Caju', 'Grajaú', 'Engenho de Dentro',
       'Maria da Graça', 'Cidade Nova', 'Méier', 'Rocha Miranda',
       'Riachuelo', 'Catumbi', 'São Crist

In [56]:
df_eda['host_neighbourhood'].fillna("nao informado", inplace=True)

### 1.3.15. Feature: *'host_listings_count'*

In [57]:
df_info[df_info.index =='host_listings_count']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_listings_count,host_listings_count,float64,87,36008,9,0.002416,0.00025


In [58]:
df_eda['host_listings_count'].unique()


array([2.000e+00, 1.000e+00, 4.000e+00, 7.000e+00, 1.200e+01, 3.000e+00,
       1.900e+01, 8.000e+00, 6.000e+00, 5.000e+00, 1.000e+01, 1.440e+02,
       1.100e+01, 1.090e+02, 5.600e+01, 1.500e+01, 4.900e+01, 1.600e+01,
       1.400e+01, 9.000e+00, 4.800e+01, 2.400e+01,       nan, 5.500e+01,
       5.000e+01, 1.300e+01, 3.000e+01, 1.800e+01, 3.700e+01, 2.700e+01,
       3.600e+01, 1.700e+01, 2.500e+01, 6.100e+01, 8.200e+01, 2.000e+01,
       2.100e+01, 9.900e+01, 2.300e+01, 2.800e+01, 3.100e+01, 3.500e+01,
       6.200e+01, 4.000e+01, 1.260e+02, 2.260e+02, 3.800e+01, 3.400e+01,
       2.200e+01, 4.500e+01, 5.100e+01, 4.300e+01, 3.200e+01, 3.900e+01,
       2.660e+02, 1.910e+02, 5.700e+01, 4.200e+01, 2.080e+02, 5.300e+01,
       2.600e+01, 1.480e+02, 4.100e+01, 8.400e+01, 1.640e+02, 5.900e+01,
       1.660e+02, 7.900e+01, 1.680e+02, 3.300e+01, 1.990e+02, 2.270e+02,
       3.740e+02, 6.830e+02, 6.400e+01, 1.405e+03, 6.500e+01, 2.900e+01,
       4.700e+01, 1.930e+02, 6.600e+01, 5.200e+01, 

In [59]:
df_eda['host_listings_count'].fillna("0.0", inplace=True)

### 1.3.16. Feature: *'host_total_listings_count'*

In [60]:
df_info[df_info.index =='host_total_listings_count']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_total_listings_count,host_total_listings_count,float64,122,36008,9,0.003388,0.00025


In [61]:
df_eda['host_total_listings_count'].unique()


array([5.000e+00, 2.000e+00, 7.000e+00, 8.000e+00, 1.000e+00, 3.300e+01,
       6.000e+00, 4.000e+00, 2.300e+01, 1.100e+01, 3.000e+00, 1.500e+01,
       2.700e+01, 9.000e+00, 2.800e+01, 1.000e+01, 2.100e+01, 1.700e+01,
       4.290e+02, 1.300e+01, 3.100e+01, 1.400e+01, 3.260e+02, 1.200e+01,
       2.500e+01, 8.600e+01, 2.000e+01, 1.360e+02, 1.900e+01, 1.290e+02,
       3.900e+01, 1.600e+01, 2.200e+01, 2.600e+01, 5.400e+01, 3.600e+01,
             nan, 6.900e+01, 5.900e+01, 1.800e+01, 4.500e+01, 1.240e+02,
       6.400e+01, 5.800e+01, 2.900e+01, 3.800e+01, 5.300e+01, 3.200e+01,
       1.540e+02, 8.900e+01, 4.800e+01, 1.340e+02, 6.200e+01, 4.600e+01,
       1.330e+02, 5.100e+01, 9.300e+01, 9.800e+01, 2.490e+02, 6.500e+01,
       3.500e+01, 3.000e+01, 1.380e+02, 1.630e+02, 6.330e+02, 6.700e+01,
       2.400e+01, 4.000e+01, 7.400e+01, 1.930e+02, 4.900e+01, 6.100e+01,
       6.000e+01, 5.020e+02, 9.700e+01, 3.040e+02, 9.900e+01, 1.950e+02,
       3.700e+01, 5.000e+01, 5.970e+02, 8.800e+01, 

In [62]:
df_eda['host_total_listings_count'].fillna("0.0", inplace=True)

### 1.3.17. Feature: *'host_has_profile_pic'*

In [63]:
df_info[df_info.index =='host_has_profile_pic']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_has_profile_pic,host_has_profile_pic,object,2,36008,9,5.6e-05,0.00025


In [64]:
df_eda['host_has_profile_pic'].unique()


array(['t', nan, 'f'], dtype=object)

In [65]:
df_eda['host_has_profile_pic'].fillna("f", inplace=True)

### 1.3.18. Feature: *'host_identity_verified'*

In [66]:
df_info[df_info.index =='host_identity_verified']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_identity_verified,host_identity_verified,object,2,36008,9,5.6e-05,0.00025


In [67]:
df_eda['host_identity_verified'].unique()


array(['t', 'f', nan], dtype=object)

In [68]:
df_eda['host_identity_verified'].fillna("f", inplace=True)

### 1.3.19. Feature: *'bathrooms_text'*

In [90]:
df_info[df_info.index =='bathrooms_text']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
bathrooms_text,bathrooms_text,object,51,36008,30,0.001416,0.000833


In [91]:
df_eda['bathrooms_text'].unique()


array(['1 bath', '1.5 baths', '2 baths', '7 baths', '1 shared bath',
       '3 baths', '1 private bath', '2.5 baths', '5 baths', '4 baths',
       '5 shared baths', nan, '3.5 baths', '2.5 shared baths',
       '2 shared baths', '4.5 baths', '0 shared baths', '0 baths',
       'Shared half-bath', '1.5 shared baths', '8 baths', '6 baths',
       '6.5 baths', '3 shared baths', '3.5 shared baths', '5.5 baths',
       '4 shared baths', '7.5 baths', '15 baths', '8.5 baths',
       '20 shared baths', 'Half-bath', '4.5 shared baths',
       '6.5 shared baths', '10 baths', '15.5 baths', 'Private half-bath',
       '12 baths', '6 shared baths', '7 shared baths', '9 baths',
       '11 baths', '16 baths', '8.5 shared baths', '9.5 baths',
       '5.5 shared baths', '11.5 baths', '8 shared baths',
       '9 shared baths', '12 shared baths', '13 baths', '11 shared baths'],
      dtype=object)

In [92]:
df_eda['bathrooms_text'].fillna('0 bath', inplace=True)

In [93]:
df_eda['bathrooms_text'] = [str(x).lower().replace('half-bath','0.5') for x in df_eda['bathrooms_text'] ]
#df_eda['bathrooms_count'].apply(lambda x: str(x).lower().replace('half-bath', '0.5') )

In [94]:
df_eda['bathrooms_text'].unique()

array(['1 bath', '1.5 baths', '2 baths', '7 baths', '1 shared bath',
       '3 baths', '1 private bath', '2.5 baths', '5 baths', '4 baths',
       '5 shared baths', '0 bath', '3.5 baths', '2.5 shared baths',
       '2 shared baths', '4.5 baths', '0 shared baths', '0 baths',
       'shared 0.5', '1.5 shared baths', '8 baths', '6 baths',
       '6.5 baths', '3 shared baths', '3.5 shared baths', '5.5 baths',
       '4 shared baths', '7.5 baths', '15 baths', '8.5 baths',
       '20 shared baths', '0.5', '4.5 shared baths', '6.5 shared baths',
       '10 baths', '15.5 baths', 'private 0.5', '12 baths',
       '6 shared baths', '7 shared baths', '9 baths', '11 baths',
       '16 baths', '8.5 shared baths', '9.5 baths', '5.5 shared baths',
       '11.5 baths', '8 shared baths', '9 shared baths',
       '12 shared baths', '13 baths', '11 shared baths'], dtype=object)

In [96]:
df_eda['bathroom_type'] = df_eda['bathrooms_text'].str.replace(r'(\d+\.?\d*)', '', regex=True)
df_eda['bathroom_type'] = df_eda['bathrooms_text'].str.replace(r's?$', '', regex=True).apply(lambda x: x.strip())
df_eda['bathroom_type'].unique()

array(['1 bath', '1.5 bath', '2 bath', '7 bath', '1 shared bath',
       '3 bath', '1 private bath', '2.5 bath', '5 bath', '4 bath',
       '5 shared bath', '0 bath', '3.5 bath', '2.5 shared bath',
       '2 shared bath', '4.5 bath', '0 shared bath', 'shared 0.5',
       '1.5 shared bath', '8 bath', '6 bath', '6.5 bath', '3 shared bath',
       '3.5 shared bath', '5.5 bath', '4 shared bath', '7.5 bath',
       '15 bath', '8.5 bath', '20 shared bath', '0.5', '4.5 shared bath',
       '6.5 shared bath', '10 bath', '15.5 bath', 'private 0.5',
       '12 bath', '6 shared bath', '7 shared bath', '9 bath', '11 bath',
       '16 bath', '8.5 shared bath', '9.5 bath', '5.5 shared bath',
       '11.5 bath', '8 shared bath', '9 shared bath', '12 shared bath',
       '13 bath', '11 shared bath'], dtype=object)

In [85]:
df_eda['bathrooms_text'].str.replace(r'baths', 'bath').unique()


array(['bath', 'shared bath', 'private bath', 'shared', '', 'private'],
      dtype=object)

In [136]:
df_eda['bathrooms_count']= df_eda['bathrooms_text']

In [139]:
df_eda['bathrooms_count'].unique()

array(['1 bath', '1.5 baths', '2 baths', '7 baths', '1 shared bath',
       '3 baths', '1 private bath', '2.5 baths', '5 baths', '4 baths',
       '5 shared baths', '0 bath', '3.5 baths', '2.5 shared baths',
       '2 shared baths', '4.5 baths', '0 shared baths', '0 baths',
       'shared 0.5', '1.5 shared baths', '8 baths', '6 baths',
       '6.5 baths', '3 shared baths', '3.5 shared baths', '5.5 baths',
       '4 shared baths', '7.5 baths', '15 baths', '8.5 baths',
       '20 shared baths', '0.5', '4.5 shared baths', '6.5 shared baths',
       '10 baths', '15.5 baths', 'private 0.5', '12 baths',
       '6 shared baths', '7 shared baths', '9 baths', '11 baths',
       '16 baths', '8.5 shared baths', '9.5 baths', '5.5 shared baths',
       '11.5 baths', '8 shared baths', '9 shared baths',
       '12 shared baths', '13 baths', '11 shared baths'], dtype=object)

In [None]:
df_eda['bathrooms_type']=df_eda['bathrooms_text']

### 1.3.20. Feature: *'beds'*

In [97]:
df_info[df_info.index =='beds']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
beds,beds,float64,43,36008,707,0.001194,0.019635


In [102]:
df_eda['beds'].unique()

array([2.0, 1.0, '0.0', 3.0, 7.0, 5.0, 16.0, 10.0, 6.0, 4.0, 8.0, 11.0,
       14.0, 9.0, 13.0, 15.0, 17.0, 12.0, 20.0, 91.0, 35.0, 18.0, 22.0,
       27.0, 50.0, 25.0, 30.0, 38.0, 26.0, 24.0, 23.0, 43.0, 28.0, 44.0,
       32.0, 56.0, 46.0, 29.0, 31.0, 21.0, 39.0, 41.0, 19.0, 33.0],
      dtype=object)

In [101]:
df_eda['beds'].fillna(0.0, inplace=True)

In [104]:
df_eda['beds'] = df_eda['beds'].astype(float)

### 1.3.21. Feature: *'price'*

In [99]:
df_info[df_info.index =='price']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
price,price,object,3230,36008,1771,0.089702,0.049184


In [100]:
df_eda['price'].unique()


array(['$1,357.00', '$865.00', '$373.00', ..., '$1,197.00', '$2,777.00',
       '$2,537.00'], dtype=object)

In [106]:
df_eda['price'].fillna('$0.0', inplace=True)
df_eda['price'] = df_eda['price'].str.replace(r',','',regex=False).replace(r'$', '', regex=False).apply(lambda x: x.strip())
df_eda['price'] = df_eda['price'].astype(float)

In [107]:
df_eda['price'].unique()

array([1357.,  865.,  373., ..., 1197., 2777., 2537.])

### 1.3.22. Feature: *'has_availability'*

In [70]:
df_info[df_info.index =='has_availability']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
has_availability,has_availability,object,2,36008,1771,5.6e-05,0.049184


In [71]:
df_eda['has_availability'].unique()


array(['t', nan, 'f'], dtype=object)

In [None]:
df_eda['has_availability'].fillna('$0.0', inplace=True)

### 1.3.23. Feature: *'first_review'*

In [72]:
df_info[df_info.index =='first_review']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
first_review,first_review,object,3218,36008,10302,0.089369,0.286103


In [73]:
df_eda['first_review'].unique()


array(['2010-07-15', '2010-06-07', '2010-10-03', ..., '2023-12-22',
       '2023-12-26', '2023-12-25'], dtype=object)

### 1.3.24. Feature: *'last_review'*

In [74]:
df_info[df_info.index =='last_review']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
last_review,last_review,object,1424,36008,10302,0.039547,0.286103


In [75]:
df_eda['last_review'].unique()


array(['2023-12-22', '2023-12-03', '2023-12-17', ..., '2023-07-12',
       '2023-02-07', '2022-12-16'], dtype=object)

### 1.3.25. Feature: *'review_scores_rating'*

In [76]:
df_info[df_info.index =='review_scores_rating']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
review_scores_rating,review_scores_rating,float64,139,36008,10283,0.00386,0.285575


In [77]:
df_eda['review_scores_rating'].unique()


array([4.7 , 4.72, 4.9 , 4.21, 4.57, 4.81,  nan, 4.76, 4.96, 4.82, 4.5 ,
       5.  , 4.  , 4.63, 4.86, 4.74, 4.78, 4.51, 4.95, 4.89, 4.8 , 4.79,
       4.25, 4.73, 4.83, 4.59, 4.4 , 4.88, 4.92, 4.91, 4.19, 4.34, 4.68,
       4.67, 4.53, 4.6 , 4.66, 4.84, 4.87, 4.98, 4.85, 4.71, 4.65, 4.14,
       4.62, 4.29, 4.58, 4.54, 4.09, 3.63, 4.45, 4.47, 4.28, 4.64, 4.56,
       4.94, 4.93, 4.97, 4.17, 4.48, 4.61, 4.55, 4.75, 4.22, 3.67, 4.69,
       4.3 , 4.26, 3.  , 4.36, 4.31, 3.86, 4.44, 3.71, 4.77, 4.43, 4.18,
       4.46, 4.33, 4.37, 4.27, 4.2 , 4.16, 4.39, 4.52, 0.  , 4.41, 2.5 ,
       4.49, 3.5 , 4.35, 4.38, 1.  , 3.83, 4.99, 4.23, 3.8 , 2.67, 4.42,
       3.75, 2.  , 2.25, 4.11, 4.06, 4.24, 4.04, 3.78, 3.89, 4.32, 3.33,
       3.2 , 3.43, 3.92, 3.94, 2.33, 4.05, 3.7 , 4.15, 3.6 , 4.13, 3.95,
       4.07, 3.25, 3.4 , 3.22, 4.1 , 3.88, 3.91, 1.5 , 4.08, 3.93, 2.75,
       1.67, 3.82, 3.57, 3.85, 4.12, 3.69, 3.29, 3.17])

### 1.3.26. Feature: *'review_scores_accuracy*

In [78]:
df_info[df_info.index =='review_scores_accuracy']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
review_scores_accuracy,review_scores_accuracy,float64,139,36008,10286,0.00386,0.285659


In [79]:
df_eda['review_scores_accuracy'].unique()


array([4.77, 4.7 , 4.93, 3.88, 4.72, 4.85,  nan, 4.91, 4.87, 4.67, 5.  ,
       3.92, 4.86, 4.  , 4.83, 4.75, 4.95, 4.73, 4.58, 4.8 , 4.88, 4.25,
       4.69, 4.84, 4.66, 4.24, 4.92, 4.39, 4.89, 4.79, 4.57, 4.81, 4.65,
       4.6 , 4.78, 4.9 , 4.94, 4.98, 4.76, 4.33, 4.43, 4.5 , 4.56, 4.45,
       4.63, 3.5 , 4.15, 4.35, 4.68, 4.18, 4.64, 4.38, 4.42, 4.4 , 4.2 ,
       4.62, 4.74, 4.71, 4.97, 4.61, 2.5 , 4.46, 4.59, 4.96, 4.82, 4.32,
       4.55, 4.21, 4.14, 4.53, 3.75, 4.48, 4.09, 4.99, 4.31, 3.  , 4.36,
       4.47, 4.17, 4.52, 4.08, 1.  , 4.44, 4.3 , 4.29, 3.8 , 2.67, 4.49,
       4.54, 3.9 , 4.19, 4.51, 3.69, 2.  , 3.67, 3.17, 3.43, 3.33, 4.37,
       4.11, 3.44, 3.71, 4.34, 4.22, 4.28, 4.1 , 4.13, 3.91, 3.89, 4.23,
       4.05, 3.95, 4.41, 3.2 , 3.6 , 4.16, 4.26, 3.63, 3.25, 3.4 , 4.27,
       3.83, 3.86, 3.87, 3.78, 3.57, 4.06, 4.07, 3.82, 3.94, 3.96, 3.73,
       2.33, 4.12, 3.77, 1.67, 3.14, 4.03, 2.75, 0.  ])

### 1.3.27. Feature: *'review_scores_cleanliness'*

In [80]:
df_info[df_info.index =='review_scores_cleanliness']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
review_scores_cleanliness,review_scores_cleanliness,float64,166,36008,10285,0.00461,0.285631


In [81]:
df_eda['review_scores_cleanliness'].unique()


array([4.65, 4.79, 4.93, 4.25, 4.46, 4.8 ,  nan, 4.7 , 4.86, 4.74, 4.17,
       5.  , 4.67, 4.84, 4.77, 4.5 , 4.33, 4.95, 4.15, 4.53, 4.9 , 4.88,
       4.75, 4.87, 4.56, 4.92, 4.64, 4.89, 4.82, 4.13, 4.94, 4.78, 4.71,
       4.31, 4.45, 4.54, 4.59, 4.96, 4.4 , 4.72, 4.61, 4.76, 4.83, 4.99,
       4.62, 4.47, 3.67, 4.36, 4.01, 4.48, 4.66, 4.85, 4.55, 4.08, 4.6 ,
       3.82, 4.49, 4.41, 4.35, 4.98, 4.73, 4.81, 4.  , 4.63, 4.91, 3.8 ,
       4.39, 4.69, 4.32, 4.42, 4.2 , 2.  , 4.52, 4.09, 4.21, 4.97, 4.68,
       3.6 , 4.24, 4.26, 4.27, 4.38, 4.3 , 3.86, 4.37, 3.  , 4.22, 4.14,
       4.57, 4.29, 3.83, 4.51, 4.18, 4.11, 4.43, 4.12, 4.58, 4.19, 4.44,
       3.95, 3.5 , 3.88, 3.4 , 1.  , 4.06, 2.75, 3.43, 3.85, 3.7 , 3.96,
       4.34, 4.16, 3.75, 3.29, 4.23, 3.81, 3.25, 4.1 , 4.04, 4.28, 3.89,
       3.78, 4.07, 3.9 , 4.03, 3.76, 3.64, 3.84, 3.71, 2.67, 2.2 , 3.33,
       3.57, 3.92, 3.2 , 3.97, 3.99, 2.5 , 3.93, 3.63, 4.05, 2.6 , 3.13,
       3.17, 3.74, 3.14, 3.79, 4.02, 3.56, 3.94, 3.

### 1.3.28. Feature: *'review_scores_checkin'*

In [82]:
df_info[df_info.index =='review_scores_checkin']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
review_scores_checkin,review_scores_checkin,float64,116,36008,10287,0.003222,0.285687


In [83]:
df_eda['review_scores_checkin'].unique()


array([4.83, 4.81, 4.97, 4.69, 4.91,  nan, 5.  , 4.94, 4.92, 4.67, 4.9 ,
       4.95, 4.86, 4.89, 4.73, 4.85, 4.88, 4.96, 4.25, 4.78, 4.71, 4.93,
       4.72, 4.84, 4.8 , 4.99, 4.79, 4.82, 4.33, 4.75, 4.98, 4.7 , 4.47,
       4.74, 4.77, 4.87, 3.  , 4.57, 4.  , 4.76, 4.65, 4.55, 4.5 , 4.63,
       4.59, 4.64, 4.38, 4.56, 4.45, 4.29, 4.4 , 4.58, 4.6 , 4.27, 4.17,
       4.32, 4.21, 4.68, 4.54, 3.5 , 4.3 , 4.61, 2.  , 4.46, 4.62, 4.36,
       2.67, 4.53, 4.2 , 3.75, 4.44, 4.66, 3.94, 4.39, 4.43, 1.  , 4.11,
       4.51, 4.31, 3.67, 4.22, 4.42, 3.33, 4.52, 4.1 , 4.48, 3.86, 4.14,
       3.4 , 4.13, 2.5 , 4.41, 4.23, 3.8 , 3.71, 2.25, 2.33, 3.93, 4.49,
       3.95, 3.25, 4.06, 4.09, 3.82, 4.37, 3.6 , 3.85, 3.88, 1.5 , 4.26,
       4.18, 4.15, 4.34, 4.19, 4.35, 3.83, 0.  ])

### 1.3.29. Feature: *'review_scores_communication'*

In [84]:
df_info[df_info.index =='review_scores_communication']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
review_scores_communication,review_scores_communication,float64,134,36008,10285,0.003721,0.285631


In [85]:
df_eda['review_scores_communication'].unique()


array([4.91, 4.92, 4.95, 4.56, 4.77, 4.89,  nan, 4.83, 5.  , 4.87, 4.88,
       4.75, 4.93, 4.98, 4.57, 4.96, 4.74, 4.85, 4.81, 4.52, 4.94, 4.58,
       4.99, 4.8 , 4.9 , 4.63, 4.7 , 4.82, 4.73, 4.97, 4.67, 4.79, 4.76,
       4.5 , 4.  , 4.66, 4.65, 4.4 , 4.71, 4.6 , 4.72, 4.86, 4.78, 4.33,
       4.69, 4.64, 4.53, 4.1 , 4.68, 4.84, 3.75, 4.48, 4.61, 4.41, 4.07,
       4.62, 4.36, 4.44, 4.51, 4.38, 1.  , 4.59, 3.67, 3.83, 4.49, 4.37,
       4.2 , 4.35, 3.  , 4.43, 4.45, 4.14, 3.25, 4.55, 3.8 , 3.5 , 4.19,
       4.17, 4.54, 4.13, 4.25, 4.11, 2.  , 4.47, 4.46, 4.21, 4.06, 4.22,
       3.86, 4.3 , 4.29, 3.93, 4.34, 4.18, 3.4 , 4.39, 4.42, 3.43, 3.84,
       3.33, 4.27, 2.5 , 3.63, 4.23, 3.88, 4.12, 1.5 , 3.29, 4.31, 4.03,
       2.67, 3.81, 4.32, 4.26, 4.15, 2.33, 3.6 , 3.56, 3.58, 3.45, 4.05,
       3.82, 2.8 , 4.28, 3.92, 2.6 , 3.91, 2.75, 3.9 , 4.16, 3.17, 3.57,
       4.24, 4.08, 0.  ])

### 1.3.30. Feature: *'review_scores_location'*

In [86]:
df_info[df_info.index =='review_scores_location']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
review_scores_location,review_scores_location,float64,128,36008,10286,0.003555,0.285659


In [87]:
df_eda['review_scores_location'].unique()


array([4.77, 4.84, 4.94, 4.44, 4.83, 4.95,  nan, 4.91, 4.76, 4.92, 4.75,
       4.25, 4.89, 5.  , 4.96, 4.71, 4.85, 4.88, 4.53, 4.58, 4.82, 4.7 ,
       4.98, 4.93, 4.9 , 4.64, 4.37, 4.62, 4.31, 4.72, 4.81, 4.87, 4.86,
       4.97, 4.99, 4.74, 4.33, 4.43, 4.57, 4.67, 4.45, 4.73, 4.63, 4.13,
       4.51, 4.41, 4.54, 4.21, 4.2 , 4.5 , 3.5 , 4.65, 4.8 , 3.  , 4.79,
       2.  , 4.  , 4.52, 4.26, 4.4 , 4.6 , 4.49, 4.78, 4.68, 4.22, 4.36,
       4.55, 4.42, 4.69, 4.66, 4.61, 4.32, 4.24, 4.59, 3.92, 3.7 , 3.67,
       4.3 , 4.29, 4.38, 4.46, 4.56, 2.5 , 4.28, 3.75, 4.17, 4.27, 1.  ,
       4.47, 4.18, 4.1 , 4.11, 4.23, 4.14, 4.08, 3.83, 3.9 , 3.89, 3.91,
       3.33, 3.86, 4.48, 4.16, 3.71, 3.88, 4.07, 3.44, 4.39, 3.87, 4.35,
       3.8 , 3.6 , 4.34, 4.15, 3.78, 2.67, 2.75, 4.05, 3.11, 3.58, 3.43,
       3.76, 3.57, 2.33, 3.25, 4.03, 4.19, 3.82, 0.  ])

### 1.3.31. Feature: *'review_scores_value'*

In [88]:
df_info[df_info.index =='review_scores_value']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
review_scores_value,review_scores_value,float64,144,36008,10286,0.003999,0.285659


In [89]:
df_eda['review_scores_value'].unique()


array([4.67, 4.6 , 4.89, 4.38, 4.59, 4.73,  nan, 4.77, 4.78, 4.46, 5.  ,
       4.08, 4.72, 4.57, 4.55, 4.65, 4.62, 4.83, 4.5 , 4.75, 4.54, 4.7 ,
       4.44, 4.23, 4.88, 4.86, 4.82, 4.2 , 4.85, 4.63, 4.56, 4.74, 4.43,
       4.84, 4.53, 4.66, 4.8 , 4.76, 4.68, 4.33, 4.79, 4.95, 4.71, 4.  ,
       4.29, 4.97, 4.19, 4.45, 4.4 , 4.47, 4.81, 4.18, 4.49, 4.9 , 4.58,
       3.75, 4.61, 4.36, 4.94, 4.51, 4.93, 4.87, 4.69, 4.91, 4.12, 4.15,
       2.5 , 4.48, 4.64, 4.92, 3.  , 4.39, 4.37, 4.1 , 4.35, 3.43, 3.83,
       4.42, 4.13, 4.41, 3.63, 4.52, 4.17, 4.31, 3.67, 4.27, 4.21, 4.22,
       4.25, 4.32, 4.14, 4.28, 3.85, 4.34, 1.  , 4.3 , 4.04, 4.96, 3.6 ,
       2.33, 3.5 , 4.26, 2.  , 2.75, 3.8 , 4.98, 3.33, 3.86, 3.88, 3.25,
       3.71, 4.09, 3.93, 3.78, 4.11, 4.07, 2.67, 3.2 , 3.92, 3.89, 4.06,
       4.16, 3.7 , 3.4 , 4.99, 4.24, 3.9 , 3.94, 3.13, 4.03, 3.95, 3.29,
       3.91, 1.5 , 3.81, 4.05, 3.56, 3.98, 3.17, 3.57, 3.82, 3.73, 3.84,
       3.62, 0.  ])

### 1.3.32. Feature: *'reviews_per_month'*

In [90]:
df_info[df_info.index =='reviews_per_month']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
reviews_per_month,reviews_per_month,float64,678,36008,10302,0.018829,0.286103


In [91]:
df_eda['reviews_per_month'].unique()


array([1.900e+00, 1.670e+00, 2.820e+00, 1.300e-01, 1.070e+00, 1.580e+00,
             nan, 1.040e+00, 1.700e-01, 3.070e+00, 2.300e-01, 3.000e-02,
       9.000e-02, 1.150e+00, 1.000e-02, 3.110e+00, 7.000e-02, 1.030e+00,
       8.900e-01, 5.600e-01, 1.850e+00, 8.000e-02, 3.700e-01, 2.500e-01,
       5.700e-01, 3.400e-01, 7.000e-01, 7.200e-01, 4.000e-02, 4.500e-01,
       1.660e+00, 7.700e-01, 4.400e-01, 1.100e-01, 1.270e+00, 1.880e+00,
       3.800e+00, 5.200e-01, 2.440e+00, 1.050e+00, 9.500e-01, 1.620e+00,
       6.300e-01, 1.080e+00, 2.800e-01, 9.800e-01, 1.820e+00, 7.600e-01,
       2.200e-01, 1.380e+00, 1.250e+00, 6.000e-01, 1.560e+00, 1.680e+00,
       5.800e-01, 1.640e+00, 5.000e-02, 6.100e-01, 2.100e-01, 2.000e-02,
       3.300e-01, 1.960e+00, 2.070e+00, 1.020e+00, 1.200e-01, 4.800e-01,
       1.170e+00, 1.130e+00, 3.600e-01, 2.130e+00, 1.100e+00, 8.100e-01,
       1.600e-01, 5.000e-01, 4.050e+00, 6.600e-01, 1.400e+00, 4.100e-01,
       1.000e-01, 2.000e-01, 3.900e-01, 2.400e-01, 

"listing_url"


In [109]:
df_info[df_info.index =='listing_url']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
listing_url,listing_url,object,36008,36008,0,1.0,0.0


In [108]:
df_eda['listing_url'].unique()

array(['https://www.airbnb.com/rooms/17878',
       'https://www.airbnb.com/rooms/25026',
       'https://www.airbnb.com/rooms/35764', ...,
       'https://www.airbnb.com/rooms/1053789340172837654',
       'https://www.airbnb.com/rooms/1053808194231554793',
       'https://www.airbnb.com/rooms/1053823261878675052'], dtype=object)

- "scrape_id"


In [115]:
df_info[df_info.index =='scrape_id']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
scrape_id,scrape_id,int64,1,36008,0,2.8e-05,0.0


In [114]:
df_eda['scrape_id'].unique()

array([20231226034138], dtype=int64)

- "last_scraped"


In [117]:
df_info[df_info.index =='last_scraped']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
last_scraped,last_scraped,object,5,36008,0,0.000139,0.0


In [116]:
df_eda['last_scraped'].unique()

array(['2023-12-27', '2023-12-26', '2023-12-28', '2023-12-30',
       '2023-12-29'], dtype=object)

- "source"


In [118]:
df_info[df_info.index =='source']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
source,source,object,2,36008,0,5.6e-05,0.0


In [119]:
df_eda['source'].unique()

array(['city scrape', 'previous scrape'], dtype=object)

- "name"


In [123]:
df_info[df_info.index =='name']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
name,name,object,14150,36008,0,0.392968,0.0


In [124]:
df_eda['name'].unique()

array(['Condo in Rio de Janeiro · ★4.70 · 2 bedrooms · 2 beds · 1 bath',
       'Rental unit in Rio de Janeiro · ★4.72 · 1 bedroom · 1 bed · 1 bath',
       'Loft in Rio de Janeiro · ★4.90 · 1 bedroom · 1 bed · 1.5 baths',
       ...,
       'Rental unit in Rio de Janeiro · ★New · Studio · 5 beds · 1 bath',
       'Home in Rio de Janeiro · ★New · 5 bedrooms · 6 beds · 6.5 baths',
       'Home in Rio de Janeiro · ★New · 1 bedroom · 4 beds · 1 bath'],
      dtype=object)

- "picture_url"


In [125]:
df_info[df_info.index =='picture_url']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
picture_url,picture_url,object,35265,36008,0,0.979366,0.0


In [126]:
df_eda['picture_url'].unique()

array(['https://a0.muscache.com/pictures/65320518/30698f38_original.jpg',
       'https://a0.muscache.com/pictures/a745aa21-b8dd-4959-a040-eb8e6e6f07ee.jpg',
       'https://a0.muscache.com/pictures/23782972/1d3e55b0_original.jpg',
       ...,
       'https://a0.muscache.com/pictures/miso/Hosting-1053789340172837654/original/f4ff3194-fc5b-430a-808b-08a0e7c38205.jpeg',
       'https://a0.muscache.com/pictures/hosting/Hosting-1053808194231554793/original/5a0f8d6f-2b9c-47c4-b903-ce0273419883.jpeg',
       'https://a0.muscache.com/pictures/miso/Hosting-1053823261878675052/original/e50828b0-f968-40da-91a3-14a36009838e.jpeg'],
      dtype=object)

- "host_id"


In [127]:
df_info[df_info.index =='host_id']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_id,host_id,int64,21980,36008,0,0.61042,0.0


In [128]:
df_eda['host_id'].unique()

array([    68997,    102840,    153691, ..., 206898000, 536983374,
          694816], dtype=int64)

- "host_url"


In [131]:
df_info[df_info.index =='host_url']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_url,host_url,object,21980,36008,0,0.61042,0.0


In [132]:
df_eda['host_url'].unique()

array(['https://www.airbnb.com/users/show/68997',
       'https://www.airbnb.com/users/show/102840',
       'https://www.airbnb.com/users/show/153691', ...,
       'https://www.airbnb.com/users/show/206898000',
       'https://www.airbnb.com/users/show/536983374',
       'https://www.airbnb.com/users/show/694816'], dtype=object)

- "host_verifications"


In [133]:
df_info[df_info.index =='host_verifications']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
host_verifications,host_verifications,object,8,36008,9,0.000222,0.00025


In [134]:
df_eda['host_verifications'].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['email']", "['phone', 'work_email']", nan, '[]',
       "['email', 'work_email']", "['email', 'phone', 'photographer']"],
      dtype=object)

- "neighbourhood_cleansed"


In [135]:
df_info[df_info.index =='neighbourhood_cleansed']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
neighbourhood_cleansed,neighbourhood_cleansed,object,156,36008,0,0.004332,0.0


In [136]:
df_eda['neighbourhood_cleansed'].unique()

array(['Copacabana', 'Flamengo', 'Ipanema', 'Laranjeiras', 'Santa Teresa',
       'Tijuca', 'Barra da Tijuca', 'Jacarepaguá', 'Jardim Botânico',
       'Leblon', 'Joá', 'Leme', 'Vila da Penha', 'Botafogo', 'Centro',
       'Gávea', 'Lagoa', 'Vidigal', 'Recreio dos Bandeirantes',
       'Santo Cristo', 'Humaitá', 'Itanhangá', 'São Conrado',
       'Praça Seca', 'Glória', 'Urca', 'Vargem Pequena', 'Saúde',
       'Alto da Boa Vista', 'Cosme Velho', 'Vargem Grande',
       'Penha Circular', 'Taquara', 'Jardim Sulacap', 'Estácio',
       'Rocinha', 'Rio Comprido', 'Praça da Bandeira', 'Gamboa', 'Cosmos',
       'Bonsucesso', 'Barra de Guaratiba', 'Piedade', 'Senador Camará',
       'Bangu', 'Catete', 'Maracanã', 'Curicica', 'Engenho Novo',
       'Vila Isabel', 'Andaraí', 'Cidade Nova', 'São Francisco Xavier',
       'Guaratiba', 'Anil', 'Cachambi', 'Lins de Vasconcelos',
       'Santa Cruz', 'Cavalcanti', 'Todos os Santos', 'Marechal Hermes',
       'Freguesia (Jacarepaguá)', 'Paciência',

- "latitude"


In [137]:
df_info[df_info.index =='latitude']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
latitude,latitude,float64,20707,36008,0,0.575067,0.0


In [138]:
df_eda['latitude'].unique()

array([-22.96599   , -22.97735   , -22.98107   , ..., -22.98054655,
       -22.96833955, -22.91224091])

- "longitude"


In [139]:
df_info[df_info.index =='longitude']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
longitude,longitude,float64,22606,36008,0,0.627805,0.0


In [140]:
df_eda['longitude'].unique()

array([-43.1794    , -43.19105   , -43.19136   , ..., -43.18585488,
       -43.24083   , -43.17128824])

- "property_type"


In [141]:
df_info[df_info.index =='property_type']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
property_type,property_type,object,85,36008,0,0.002361,0.0


In [142]:
df_eda['property_type'].unique()

array(['Entire condo', 'Entire rental unit', 'Entire loft',
       'Private room in rental unit', 'Entire home',
       'Private room in home', 'Private room in townhouse',
       'Private room in bed and breakfast', 'Entire guest suite',
       'Private room in guesthouse', 'Private room in guest suite',
       'Private room in condo', 'Earthen home', 'Entire guesthouse',
       'Entire cottage', 'Entire serviced apartment',
       'Private room in treehouse', 'Private room in chalet',
       'Room in boutique hotel', 'Shared room in home',
       'Private room in earthen home', 'Entire townhouse', 'Boat',
       'Private room in hostel', 'Entire chalet',
       'Private room in serviced apartment', 'Shared room in rental unit',
       'Treehouse', 'Entire villa', 'Room in serviced apartment',
       'Entire cabin', 'Entire vacation home',
       'Private room in casa particular', 'Private room in villa',
       'Shared room', 'Room in aparthotel', 'Shared room in tiny home',
       '

- "room_type"


In [143]:
df_info[df_info.index =='room_type']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
room_type,room_type,object,4,36008,0,0.000111,0.0


In [144]:
df_eda['room_type'].unique()

array(['Entire home/apt', 'Private room', 'Shared room', 'Hotel room'],
      dtype=object)

In [146]:
df_eda[['property_type','room_type']][df_eda['property_type']=='Cycladic home']

Unnamed: 0,property_type,room_type
26267,Cycladic home,Entire home/apt


- "accommodates"


In [147]:
df_info[df_info.index =='accommodates']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
accommodates,accommodates,int64,16,36008,0,0.000444,0.0


In [148]:
df_eda['accommodates'].unique()

array([ 5,  3,  2,  4, 13,  9, 16,  6, 14,  1, 10,  8, 12,  7, 11, 15],
      dtype=int64)

- "amenities"


In [149]:
df_info[df_info.index =='amenities']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
amenities,amenities,object,1,36008,0,2.8e-05,0.0


In [150]:
df_eda['amenities'].unique()

array(['[]'], dtype=object)

- "price"


In [153]:
df_info[df_info.index =='price']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
price,price,object,3230,36008,1771,0.089702,0.049184


In [154]:
df_eda['price'].unique()

array([1357.,  865.,  373., ..., 1197., 2777., 2537.])

- "minimum_nights"


In [155]:
df_info[df_info.index =='minimum_nights']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
minimum_nights,minimum_nights,int64,71,36008,0,0.001972,0.0


In [156]:
df_eda['minimum_nights'].unique()

array([   5,    2,    3,    4,    1,   20,    6,   22,   78,   30,   90,
         60,    7,   21,   10,   14,   15,   12,   45,   50,   31,   28,
          8,  365,  630,  300,    9,   34,  180,   25,   19,   23,   29,
         32,  360,  120,  500,   17,  730,   11,  200,   16,   40,   18,
         24,   27,   13,   80,  100,   89,  158,   36,   95,  720,  184,
         35,  109, 1125,  999,  150, 1000,  960,   44,   88,  362,   49,
         55,   26,  188,   99,  110], dtype=int64)

- "maximum_nights"


In [157]:
df_info[df_info.index =='maximum_nights']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
maximum_nights,maximum_nights,int64,194,36008,0,0.005388,0.0


In [158]:
df_eda['maximum_nights'].unique()

array([  28,   60,   15,  365,  180,  760,   89,   30,  750,   90,  730,
       1125,   10,   14,  100,  150,   20,  420,   92,   21,    7,  500,
          9,   29,  520,  358,   16,  120,   31,  800,  720,   45,  366,
          5,    8,  250,  360,   50,  110,   27,  380,   35,   40, 1825,
         26,   36,   80, 1124,  300,  320,    6,   91, 1000,   34,   70,
        200,    4, 1123,  280,   59,   99,  370,   88,  130,   12,  140,
        190,   25,  900, 1001,  170,   87,   32, 1110,   22,  160,   18,
        600,   85,   17,  400,   24,   13,  155,   11,  998,   55,  277,
          3,  210,  182,  912,  920,   38, 1120,  352,   61,  240,   93,
       1095,  350,   19,  666,   48,  270,  356,   75,   33,   56,   65,
         62,   47,    1,   43,   23,   37,  121, 1122,   69,   81,  700,
        364,  220,  336,  179,  161, 1109,  610,  369,  185,    2,  125,
         44,  244,  580,  290, 1100,  999,  199,   84,  333,   68,  226,
         42,  540,  790, 1008, 1053,  325,   95,  1

- "minimum_minimum_nights"


In [159]:
df_info[df_info.index =='minimum_minimum_nights']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
minimum_minimum_nights,minimum_minimum_nights,int64,69,36008,0,0.001916,0.0


In [160]:
df_eda['minimum_minimum_nights'].unique()

array([   5,    2,    1,    3,    4,   20,    6,   22,   78,   30,   90,
         60,    7,   21,   10,   14,   15,   12,    8,   31,   50,  365,
        630,  300,    9,   13,   34,   25,   28,   23,   45,   29,   40,
         32,  180,  360,  120,   19,  500,   17,  730,   11,  200,   16,
         18,   24,   27,   80,  100,   89,  158,   36,   95,   26,  184,
         35,  109, 1125,  150, 1000,  960,   44,   88,   39,   49,   55,
        188,   99,  110], dtype=int64)

- "maximum_minimum_nights


In [161]:
df_info[df_info.index =='maximum_minimum_nights']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
maximum_minimum_nights,maximum_minimum_nights,int64,77,36008,0,0.002138,0.0


In [162]:
df_eda['maximum_minimum_nights'].unique()

array([   5,    4,    6,    7,    3,    1,   20,    9,    2,   10,   22,
         78,   30,   90,   60,   14,   21,    8,   15,   12,   45,   50,
         31,   28,  365,  630,  300,  100,   34,  200,  180,   25,   19,
         23,   29,   16,   11,   32,  140,  360,  120,  500,   17,  730,
         62,   40,   18,   27,   24,   13,   80, 1125,   89,  158,   36,
         95,  720,  184,   35,  109,  400,  150, 1000,  960,   91,   44,
        999,   88,   39,   26,   56,   49,   55,  900,  188,   99,  110],
      dtype=int64)

- "minimum_maximum_nights"


In [163]:
df_info[df_info.index =='minimum_maximum_nights']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
minimum_maximum_nights,minimum_maximum_nights,int64,170,36008,0,0.004721,0.0


In [164]:
df_eda['minimum_maximum_nights'].unique()

array([  28,   60,    7,  365,  180,  760,   89,   30, 1125,  750,  730,
         10,   90,  150,   20,   92,   21,    1,  500,    9,   14,  520,
        358,   15,   29,   31,  800,  720,   45,  366,    5,    8,  250,
        360,   50,  110,   27,  120,  380,   35,   40, 1825,   26,   80,
       1124,  300,  320,    6,   91, 1000,   34,    2,  200,  280,    4,
       1123,   59,   99,  370,   70,  130,   12,   25,  900,  170,   87,
         32,  731, 1110,   22,   18,  100,  190,  600,   85,   17,  140,
        400,   24,   13,  155,   11, 1001,  160,   88,  277,    3,  210,
        182,  912,  920,   38, 1120,   16,  352,   93, 1095,   19,  999,
         48,  270,  356,   75,   33,   56,   55,   47,   65,   43,   36,
         62,   23,   37,  121, 1122,   69,   81,  364,  220,  336,  179,
        161, 1109,  610,  369,   61,  125,   44,  244,  350,  580,  199,
         84,  333,  226,   42,  540, 1008, 1053,  325,  700,  102,  275,
        186, 1097,  390,  185,  299,  240,  729,   

- "maximum_maximum_nights"


In [165]:
df_info[df_info.index =='maximum_maximum_nights']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
maximum_maximum_nights,maximum_maximum_nights,int64,172,36008,0,0.004777,0.0


In [166]:
df_eda['maximum_maximum_nights'].unique()

array([  28,   60,   15,  365,  180,  760,   89,   30, 1125,  750,  730,
         10,   90,  150,   20,   92,   21,  500,    9,   14,  520,    7,
        358,   31,  800,  720,   45,    5,    8,  250,  360,   50,  110,
         27,  120,  380,   35,   40, 1825,   26,   80, 1124,  300,   29,
        320,    6,   91,  100, 1000,   34,  200,    4, 1123,  280,   59,
         99,  370,   88,   70,  130,   12,  140,   25,  900,  170,   87,
         32,  731, 1110,   22,  366,  160,   18,  190,  600,   85,   17,
        400,   24,   13,  155,   11, 1001,  277,    3,  210,  182,  912,
        920,   38, 1120,   16,  352,   93, 1095,   33,  999,   19,   48,
        270,  356,   75,   56,   55,   47,   65,    1,   43,   36,   62,
         23,   37,  121, 1122,   69,   81,  364,  220,  336,  179,  161,
       1109,  610,  369,    2,   61,  125,   44,  244,  350,  580, 1100,
        199,   84,  333,  226,   42,  540, 1008, 1053,  325,  700,  102,
        275,  186, 1097,  390,  185,  299,  240,  7

- "minimum_nights_avg_ntm"


In [167]:
df_info[df_info.index =='minimum_nights_avg_ntm']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
minimum_nights_avg_ntm,minimum_nights_avg_ntm,float64,252,36008,0,0.006998,0.0


In [168]:
df_eda['minimum_nights_avg_ntm'].unique()

array([5.000e+00, 2.200e+00, 3.100e+00, 3.200e+00, 3.000e+00, 4.000e+00,
       1.000e+00, 2.300e+00, 2.100e+00, 2.800e+00, 2.000e+01, 5.200e+00,
       2.900e+00, 2.700e+00, 6.000e+00, 2.000e+00, 4.900e+00, 3.300e+00,
       7.000e+00, 1.300e+00, 2.200e+01, 1.900e+00, 3.600e+00, 7.800e+01,
       3.000e+01, 9.000e+01, 6.000e+01, 5.800e+00, 8.800e+00, 2.100e+01,
       7.200e+00, 5.500e+00, 5.100e+00, 3.500e+00, 5.400e+00, 4.100e+00,
       1.000e+01, 1.400e+01, 2.400e+00, 4.200e+00, 5.300e+00, 2.910e+01,
       5.900e+00, 9.900e+00, 4.500e+00, 1.100e+00, 1.500e+01, 4.400e+00,
       6.900e+00, 1.390e+01, 6.800e+00, 3.900e+00, 1.200e+01, 3.400e+00,
       4.370e+01, 6.300e+00, 4.700e+00, 5.000e+01, 1.500e+00, 3.100e+01,
       1.400e+00, 1.700e+00, 9.700e+00, 2.500e+00, 2.940e+01, 2.690e+01,
       1.600e+00, 2.600e+00, 6.200e+00, 7.900e+00, 3.650e+02, 1.200e+00,
       4.600e+00, 6.300e+02, 3.000e+02, 2.870e+01, 4.800e+00, 9.000e+00,
       4.300e+00, 3.400e+01, 1.380e+01, 1.110e+01, 

- "maximum_nights_avg_ntm"


In [169]:
df_info[df_info.index =='maximum_nights_avg_ntm']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
maximum_nights_avg_ntm,maximum_nights_avg_ntm,float64,875,36008,0,0.0243,0.0


In [170]:
df_eda['maximum_nights_avg_ntm'].unique()

array([2.8000e+01, 6.0000e+01, 1.4700e+01, 3.6500e+02, 1.8000e+02,
       7.6000e+02, 8.9000e+01, 3.0000e+01, 1.1250e+03, 7.5000e+02,
       7.3000e+02, 7.8270e+02, 1.0000e+01, 9.0000e+01, 1.5000e+02,
       2.0000e+01, 9.2000e+01, 2.1000e+01, 1.1147e+03, 5.0000e+02,
       1.1130e+02, 9.0000e+00, 1.4000e+01, 5.2000e+02, 7.0000e+00,
       3.5800e+02, 1.5000e+01, 9.1840e+02, 3.1000e+01, 8.0000e+02,
       1.1122e+03, 7.2000e+02, 4.5000e+01, 7.2540e+02, 5.0000e+00,
       8.0000e+00, 2.5000e+02, 3.6000e+02, 5.0000e+01, 1.1000e+02,
       2.4010e+02, 2.7000e+01, 1.2000e+02, 1.0988e+03, 3.8000e+02,
       3.5000e+01, 4.0000e+01, 1.8250e+03, 2.6000e+01, 8.4400e+01,
       8.0000e+01, 1.1240e+03, 3.0000e+02, 2.9000e+01, 3.2000e+02,
       6.0000e+00, 9.1000e+01, 9.8100e+01, 1.0000e+03, 3.4000e+01,
       1.1199e+03, 6.6100e+01, 1.0624e+03, 1.0814e+03, 1.1018e+03,
       4.8260e+02, 2.0000e+02, 1.0551e+03, 4.0000e+00, 1.1230e+03,
       5.9400e+01, 7.1710e+02, 2.8000e+02, 5.9000e+01, 9.9000e

- "has_availability"


In [171]:
df_info[df_info.index =='has_availability']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
has_availability,has_availability,object,2,36008,1771,5.6e-05,0.049184


In [172]:
df_eda['has_availability'].unique()

array(['t', nan, 'f'], dtype=object)

- "availability_30"


In [173]:
df_info[df_info.index =='availability_30']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
availability_30,availability_30,int64,31,36008,0,0.000861,0.0


In [174]:
df_eda['availability_30'].unique()

array([ 5,  3,  4, 15,  6, 30, 18, 24, 14,  0, 16, 25,  1, 20, 19, 28,  8,
       23, 27, 21,  7,  2, 29, 17, 26, 12, 11,  9, 22, 13, 10],
      dtype=int64)

- "availability_60"


In [175]:
df_info[df_info.index =='availability_60']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
availability_60,availability_60,int64,61,36008,0,0.001694,0.0


In [176]:
df_eda['availability_60'].unique()

array([ 7, 18,  9, 38, 22, 60, 51, 13, 20,  0, 33,  1, 43, 25, 58,  3, 31,
        5, 19, 44, 57, 39,  6, 40, 59, 34, 52, 11, 30,  2, 24, 23, 46, 26,
       10, 45, 17, 15, 14, 21,  4, 54, 37, 29, 48, 41, 49, 27, 32, 47, 55,
       53, 42, 28, 50, 16, 56,  8, 35, 12, 36], dtype=int64)

- "availability_90"


In [177]:
df_info[df_info.index =='availability_90']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
availability_90,availability_90,int64,91,36008,0,0.002527,0.0


In [178]:
df_eda['availability_90'].unique()

array([14, 48, 12, 41, 27, 39, 90, 65, 81, 37, 43, 50,  0, 63,  1, 73, 26,
       88, 31,  6, 61, 55, 35, 13, 19, 74, 68,  4, 87,  2, 69, 49, 11, 70,
       89, 25, 82, 52, 34,  3, 54, 53, 76, 10, 32, 47, 15, 64, 42, 44, 51,
       16, 84, 67, 30, 59, 78, 29, 40, 24, 45, 71, 17, 79, 57, 62, 77, 60,
       46, 75, 83, 66, 72, 33, 58, 28,  8, 38, 36,  9, 86, 18, 56, 21, 20,
       80,  5, 85, 23,  7, 22], dtype=int64)

- "availability_365"


In [179]:
df_info[df_info.index =='availability_365']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
availability_365,availability_365,int64,366,36008,0,0.010164,0.0


In [180]:
df_eda['availability_365'].unique()

array([269, 228,  62,  44, 293, 245, 365, 319, 356, 312, 308, 294,   0,
       338,   1,  73, 234, 306, 271, 363, 287, 262,  67, 330, 215, 100,
       264,  80,  76, 143, 362,   3,  31, 249, 321,  16, 345, 179, 200,
        41, 171, 357,  20, 323,  34,  58,  26, 144, 233, 242, 351,  37,
       364, 238,  18, 298, 244,   2,  69,  10, 318, 192, 339,  42, 134,
       326, 216, 329, 157,  30, 240, 174, 268, 149, 353,  89, 207, 292,
       135, 301, 159, 239,  64, 224, 316, 354, 180, 325, 302, 103, 154,
       172,  61, 346, 230, 175, 331, 137, 121, 300, 335, 155, 136, 165,
       189,   6,  83, 130, 324, 148, 358, 299, 145, 349, 162, 277, 336,
       168, 328, 359,  53,  72, 123,  24, 343,  93, 303, 352, 273, 206,
       327, 296,  57,  52, 221,  29, 128, 227, 320,  87,  39, 350,  11,
       347, 281, 272,  60, 342, 225,  90, 138, 307, 113, 315, 124,  86,
       126, 304, 201,  98, 170, 212,   5, 229, 310, 258, 232, 153,  46,
        78, 334, 109, 283, 360, 114, 290, 344, 236,  40, 317, 34

- "calendar_last_scraped"


In [181]:
df_info[df_info.index =='listcalendar_last_scrapeding_url']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing


In [182]:
df_eda['calendar_last_scraped'].unique()

array(['2023-12-27', '2023-12-26', '2023-12-28', '2023-12-30',
       '2023-12-29'], dtype=object)

- "number_of_reviews"


In [183]:
df_info[df_info.index =='number_of_reviews']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
number_of_reviews,number_of_reviews,int64,367,36008,0,0.010192,0.0


In [184]:
df_eda['number_of_reviews'].unique()

array([311, 275, 454,  17, 152, 227,   0, 163,  24, 431,   4,  12, 156,
         1, 463,   7, 153, 111,  82, 257,  53,  37,  78,  42,  84,  11,
       106,  70, 236, 113,  69,  16, 182, 293, 555,  74,  89, 351, 154,
       143, 234,  97, 160, 144, 261, 115,  25,   2,  40, 180, 119, 229,
        85,  83, 235,  41, 270,   5,  81,  30,   3,  46, 281, 404, 167,
       104, 148,  29,  63, 164, 161,  52, 313, 155,  48,  66, 157, 587,
        20,  86, 196,  60,  45,   9,  50,  51,  21,  34, 176,   8, 216,
        88,  43, 116,  18,  95,  94,  10,  14,  28,  39, 110,  55,  15,
         6,  13,  27,  80, 138, 125, 128,  35, 100, 221, 208, 171, 135,
       337, 195, 166, 177, 131, 168,  61,  93, 181, 109, 627, 204, 228,
       249,  96,  32, 130, 191,  56,  68, 358,  47,  31,  44, 267, 126,
        64,  92, 245,  91,  19, 102, 412, 286, 220, 309,  22, 183, 122,
       301, 136, 218, 108,  38,  72,  90, 437, 205, 272,  58, 190,  76,
       101, 402, 225, 372,  49, 117, 413, 319,  33, 170,  77,  5

- "number_of_reviews_ltm"


In [185]:
df_info[df_info.index =='number_of_reviews_ltm']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
number_of_reviews_ltm,number_of_reviews_ltm,int64,99,36008,0,0.002749,0.0


In [186]:
df_eda['number_of_reviews_ltm'].unique()

array([ 29,  36,   0,  14,  12,   3,  25,  16,  28,  26,  23,  13,  31,
         4,  10,   8,  27,  34,  22,   1,  65,   6,  15,  30,  51,  21,
         2,  18,   7,   5,  54,  33,  64,  24,  39,  17,   9,  11,  35,
        20,  44,  40,  45,  38,  43,  63,  55,  19,  46,  37,  32,  57,
        49,  41,  50,  42,  47,  48,  61,  66,  59,  75,  53,  56,  60,
        62,  76,  52,  92,  74,  58, 108,  91,  87,  73,  81,  78,  68,
        77,  70,  82, 115,  85, 120, 124,  96,  89,  80, 105,  67,  72,
        69,  93,  71,  84, 113,  98,  79,  94], dtype=int64)

- "number_of_reviews_l30d"


In [187]:
df_info[df_info.index =='number_of_reviews_l30d']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
number_of_reviews_l30d,number_of_reviews_l30d,int64,15,36008,0,0.000417,0.0


In [188]:
df_eda['number_of_reviews_l30d'].unique()

array([ 4,  2,  0,  3,  1,  6,  5,  7, 11, 10,  8,  9, 14, 13, 12],
      dtype=int64)

- "instant_bookable"


In [189]:
df_info[df_info.index =='instant_bookable']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
instant_bookable,instant_bookable,object,2,36008,0,5.6e-05,0.0


In [190]:
df_eda['instant_bookable'].unique()

array(['f', 't'], dtype=object)

- "calculated_host_listings_count"


In [191]:
df_info[df_info.index =='calculated_host_listings_count']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
calculated_host_listings_count,calculated_host_listings_count,int64,58,36008,0,0.001611,0.0


In [192]:
df_eda['calculated_host_listings_count'].unique()

array([  1,   2,   5,  10,   4,  18,   7,   3,   8,   6, 142,  11,  50,
         9,  21,  15,  46,  14,  35,  24,  47,  32,  12,  29,  37,  23,
        16,  25,  69,  85,  13,  33,  17,  19,  42,  38,  30,  40,  51,
        43,  20,  26, 185, 157,  80,  49,  41,  39,  22,  28,  77,  27,
       128,  57,  56,  31, 145,  34], dtype=int64)

- "calculated_host_listings_count_entire_homes"


In [193]:
df_info[df_info.index =='calculated_host_listings_count_entire_homes']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
calculated_host_listings_count_entire_homes,calculated_host_listings_count_entire_homes,int64,60,36008,0,0.001666,0.0


In [194]:
df_eda['calculated_host_listings_count_entire_homes'].unique()

array([  1,   2,   5,   0,   9,  18,   7,   3,   8,   6,   4,  10, 142,
        11,  42,  20,  13,  14,  35,  22,  47,  32,  12,  29,  15,  37,
        23,  16,  24,  69,  84,  50,  17,  46,  38,  40,  43,  25,  26,
        36, 185, 152,  80,  49,  41,  21,  39,  19,  28,  77, 115,  57,
        51,  56,  30,  27,  31, 145,  33,  34], dtype=int64)

- "calculated_host_listings_count_private_rooms"


In [195]:
df_info[df_info.index =='calculated_host_listings_count_private_rooms']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
calculated_host_listings_count_private_rooms,calculated_host_listings_count_private_rooms,int64,18,36008,0,0.0005,0.0


In [196]:
df_eda['calculated_host_listings_count_private_rooms'].unique()

array([ 0,  1,  2,  3,  5,  4,  8,  9,  7, 12, 11,  6, 10, 21, 13, 14, 18,
       17], dtype=int64)

- "calculated_host_listings_count_shared_rooms"

In [197]:
df_info[df_info.index =='calculated_host_listings_count_shared_rooms']

Unnamed: 0,Columns,Type,Unique,Size,Missing,% Unique,% Missing
calculated_host_listings_count_shared_rooms,calculated_host_listings_count_shared_rooms,int64,16,36008,0,0.000444,0.0


In [198]:
df_eda['calculated_host_listings_count_shared_rooms'].unique()

array([ 0,  1,  9,  4,  2,  6,  3, 10,  5,  7,  8, 19, 11, 16, 13, 15],
      dtype=int64)