In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, median_absolute_error, roc_curve, auc
from sklearn.model_selection import train_test_split
import catboost
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
sns.set(style="darkgrid")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings("ignore")

### Download Data

In [2]:
df_public = pd.read_csv("data/public_listings_case.csv")
df_privat = pd.read_csv("data/private_listings_case.csv")
df_usd_rub = pd.read_excel("data/USDRUB.xlsx")
df_eur_rub = pd.read_excel("data/EURRUB.xlsx")
df_eur_rub["data"] = pd.to_datetime(df_eur_rub["data"])
df_usd_rub["data"] = pd.to_datetime(df_usd_rub["data"])
df_geo_city = pd.DataFrame({
    "region": ["Bergamo", "Bologna", "Firenze", "Milano", "Napoli", "Puglia", "Roma", "Sicilia", "Trentino", "Venezia"],
    "latitude_city": [45.696, 44.4938, 43.7793, 45.4643, 40.8522, 41.1175800, 41.8919, 37.50788 , 46.0679, 45.4371],
    "longitude_city": [9.66721, 11.3387, 11.2463, 9.18951, 14.2681, 16.4842100, 12.5113, 15.08303, 11.1211, 12.3327]
})

### Data Preparation

In [17]:
from main import Airbnb
airbnb = Airbnb(df_public, df_privat, df_usd_rub, df_eur_rub, df_geo_city)
X_public_prepared, X_privat_prepared, X_train_prepared, X_test_prepared = airbnb.get_preparation_date()
X_public_encoded, X_privat_encoded, X_train_encoded, X_test_encoded = airbnb.get_encoded_data()

df_public start preparation - done
df_privat start preparation - done
df_train start preparation - done
df_test start preparation - done
X_public_prepared start encoding - done
X_privat_prepared start encoding - done
X_train_prepared start encoding - done
X_test_prepared start encoding - done


In [25]:
X_public_prepared.loc[X_public_prepared["listing_url"].notnull(), "listing_url"] = "p"
X_public_prepared.loc[X_public_prepared["name"].notnull(), "name"] = "p"
X_public_prepared.loc[X_public_prepared["description"].notnull(), "description"] = "p"
X_public_prepared.loc[X_public_prepared["neighborhood_overview"].notnull(), "neighborhood_overview"] = "p"
X_public_prepared.loc[X_public_prepared["host_name"].notnull(), "host_name"] = "p"
X_public_prepared.loc[X_public_prepared["host_about"].notnull(), "host_about"] = "p"
X_public_prepared.loc[X_public_prepared["amenities"].notnull(), "amenities"] = "p"
X_public_prepared.loc[X_public_prepared["license"].notnull(), "license"] = "p"
X_public_prepared.loc[X_public_prepared["id"].notnull(), "id"] = "p"
X_public_prepared.head()

Unnamed: 0,id,listing_url,source,name,description,neighborhood_overview,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,city,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,region,type,price_usd,difference_review,distance_to_centre_city,cat_distance,len_description,bathrooms_num,bedrooms_num,beds_num,bathrooms_num2,bedrooms_num2,beds_num2,person_per_bathrooms,person_per_bedrooms,person_per_beds,person_per_bathrooms2,person_per_bedrooms2,person_per_beds2
0,p,p,previous scrape,p,p,p,p,2009-12-07,"Solto Collina, Italy",p,within an hour,100.0,100.0,f,,5.0,5.0,"['email', 'phone']",t,t,"Solto Collina, Lombardy, Italy",Solto Collina,,45.78354,10.02471,Entire villa,Entire home/apt,3,,1 bath,1.0,2.0,p,$70.00,3,28,3.0,4.0,1125.0,1125.0,3.0,1125.0,,t,2,2,2,2,2022-09-26,43,1,1,2012-05-23,2022-09-02,4.91,5.0,4.83,5.0,4.95,4.9,4.88,,t,5,5,0,0,0.34,Bergamo,public,70.0,3754,29.401807,from 3 to 100,1000,1.0,1.0,2.0,1.0,1.0,4.0,0.333333,0.333333,0.666667,0.111111,0.111111,0.444444
1,p,p,city scrape,p,p,,p,2010-05-30,"Bergamo, Italy",p,within an hour,96.0,99.0,f,,12.0,13.0,"['email', 'phone']",t,t,,Bergamo,,45.69052,9.67099,Entire rental unit,Entire home/apt,4,,,1.0,2.0,p,$69.00,1,31,1.0,1.0,1125.0,1125.0,1.0,1125.0,,t,0,0,0,148,2022-09-26,93,2,0,2012-03-05,2021-10-05,4.82,4.82,4.86,4.75,4.82,4.7,4.7,,f,11,6,5,0,0.72,Bergamo,public,69.0,3501,0.676388,from 0 to 3,1000,1.0,1.0,2.0,1.0,1.0,4.0,0.25,0.25,0.5,0.0625,0.0625,0.25
2,p,p,city scrape,p,p,p,p,2011-05-14,"Bossico, Italy",p,within an hour,100.0,100.0,t,,3.0,3.0,"['email', 'phone']",t,t,"Bossico, Lombardy, Italy",Bossico,,45.82723,10.04484,Entire rental unit,Entire home/apt,4,,1 bath,1.0,4.0,p,$30.00,7,1125,7.0,7.0,1125.0,1125.0,7.0,1125.0,,t,29,59,89,277,2022-09-26,14,4,0,2018-05-23,2022-06-26,4.71,4.79,4.93,4.71,4.5,4.71,4.86,p,t,3,3,0,0,0.26,Bergamo,public,30.0,1495,32.727577,from 3 to 100,951,1.0,1.0,4.0,1.0,1.0,16.0,0.25,0.25,1.0,0.0625,0.0625,1.0
3,p,p,city scrape,p,p,p,p,2011-07-13,"Bergamo, Italy",,within an hour,100.0,90.0,t,,14.0,17.0,"['email', 'phone']",t,t,"Bergamo, Lombardy, Italy",Bergamo,,45.69884,9.67598,Private room in loft,Private room,3,,1 private bath,1.0,2.0,p,$109.00,1,365,1.0,3.0,365.0,365.0,3.0,365.0,,t,14,44,74,347,2022-09-26,39,3,0,2013-04-07,2022-02-10,4.92,4.9,4.97,5.0,4.97,4.9,4.69,,t,14,11,3,0,0.34,Bergamo,public,109.0,3231,0.750759,from 0 to 3,1000,1.0,1.0,2.0,1.0,1.0,4.0,0.333333,0.333333,0.666667,0.111111,0.111111,0.444444
4,p,p,city scrape,p,p,p,p,2011-07-13,"Bergamo, Italy",,within an hour,100.0,90.0,t,,14.0,17.0,"['email', 'phone']",t,t,"Bergamo, Lombardy, Italy",Bergamo,,45.69769,9.67485,Private room in rental unit,Private room,2,,1 bath,1.0,1.0,p,$72.00,1,365,1.0,3.0,365.0,365.0,3.0,365.0,,t,3,24,51,319,2022-09-26,160,20,1,2012-05-17,2022-09-21,4.8,4.84,4.89,4.86,4.89,4.83,4.75,,t,14,11,3,0,1.27,Bergamo,public,72.0,3779,0.622404,from 0 to 3,1000,1.0,1.0,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.25,0.25,0.25


In [26]:
X_public_prepared.shape

(90717, 87)

In [27]:
X_public_prepared["price_usd"].max()

92686.00206146036

In [28]:
X_public_prepared.to_csv("x3.csv", index=False)

### EDA

In [54]:
print(len(df_public.columns))
print(df_public.shape)
df_public.head(4)

70
(90757, 70)


Unnamed: 0,id,listing_url,source,name,description,neighborhood_overview,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bathrooms_text,bedrooms,beds,amenities,price,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,number_of_reviews_l30d,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,city,type
0,958545013809,https://www.airbnb.com/rooms/15542,previous scrape,Suite PANORAMA facing the lake,Ideally located on the middle ridge of a panor...,One of the highlights of lake Iseo is Montisol...,"Andrea, Maurizio, And Tina",2009-12-07,"Solto Collina, Italy","Hi, I am Andrea and since 2016, I co-manage th...",within an hour,100%,100%,f,,5.0,5.0,"['email', 'phone']",t,t,"Solto Collina, Lombardy, Italy",Solto Collina,,45.78354,10.02471,Entire villa,Entire home/apt,3,,1 bath,1.0,2.0,"[""Free parking on premises"", ""Children\u2019s ...",$70.00,3,28,3.0,4.0,1125.0,1125.0,3.0,1125.0,,t,2,2,2,2,2022-09-26,43,1,1,2012-05-23,2022-09-02,4.91,5.0,4.83,5.0,4.95,4.9,4.88,,t,5,5,0,0,0.34,Bergamo,public
1,25957984113,https://www.airbnb.com/rooms/31412,city scrape,Cute and Cosy Terrace,Grazioso ed accogliente appartamento con terra...,,Elena,2010-05-30,"Bergamo, Italy",Ciao I am Elena. \r\nAfter my kids moved out I...,within an hour,96%,99%,f,,12.0,13.0,"['email', 'phone']",t,t,,Bergamo,,45.69052,9.67099,Entire rental unit,Entire home/apt,4,,,1.0,2.0,"[""TV"", ""Heating"", ""Dishes and silverware"", ""Ki...",$69.00,1,31,1.0,1.0,1125.0,1125.0,1.0,1125.0,,t,0,0,0,148,2022-09-26,93,2,0,2012-03-05,2021-10-05,4.82,4.82,4.86,4.75,4.82,4.7,4.7,,f,11,6,5,0,0.72,Bergamo,public
2,406575409089,https://www.airbnb.com/rooms/179345,city scrape,Bossico Vicino a Montisola e lago D'iseo,CIR 016033-CNI-00002T00146<br />Bossico vicino...,Il mio paese (Bossico (BG) è unico perchè è un...,Angela,2011-05-14,"Bossico, Italy","Ciao sono Angela,\nfaccio il parrucchiere uom...",within an hour,100%,100%,t,,3.0,3.0,"['email', 'phone']",t,t,"Bossico, Lombardy, Italy",Bossico,,45.82723,10.04484,Entire rental unit,Entire home/apt,4,,1 bath,1.0,4.0,"[""Free parking on premises"", ""Children\u2019s ...",$30.00,7,1125,7.0,7.0,1125.0,1125.0,7.0,1125.0,,t,29,59,89,277,2022-09-26,14,4,0,2018-05-23,2022-06-26,4.71,4.79,4.93,4.71,4.5,4.71,4.86,CIR 016033-CNI-00002T00146,t,3,3,0,0,0.26,Bergamo,public
3,185070664003,https://www.airbnb.com/rooms/215933,city scrape,Donizetti Royal suite private toilet,our elegant apartment is in the Lower city cen...,"many restaurants nearby, 50 metres from superm...",Marcella,2011-07-13,"Bergamo, Italy",,within an hour,100%,90%,t,,14.0,17.0,"['email', 'phone']",t,t,"Bergamo, Lombardy, Italy",Bergamo,,45.69884,9.67598,Private room in loft,Private room,3,,1 private bath,1.0,2.0,"[""Heating"", ""Cooking basics"", ""Kitchen"", ""Paid...",$109.00,1,365,1.0,3.0,365.0,365.0,3.0,365.0,,t,14,44,74,347,2022-09-26,39,3,0,2013-04-07,2022-02-10,4.92,4.9,4.97,5.0,4.97,4.9,4.69,,t,14,11,3,0,0.34,Bergamo,public


In [2]:
duplicate_count = df_public.duplicated().sum()
print("Number of duplicate rows:", duplicate_count)

Number of duplicate rows: 0


In [None]:
a = df_public.isna().sum()
a

#### Exploring price

In [3]:
def extract_first_character(text):
    return text[0]

print(df_public["price"].apply(extract_first_character).unique())
print(f"Amount of null value: {df_public['price'].isna().sum()}")
sigma = np.std(X_public_prepared["price_usd"])
print(f"Standard deviation (σ): {sigma}")

['$' '€' '₽']
Amount of null value: 0
Standard deviation (σ): 615.8920476470549


Заметим, что денежные значения у целового признака представлены в разных ['$' '€' '₽'] валютах.
В наборе данных присутсвует столбец "calendar_last_scraped" - "последние обновление данных". Эти даты я собираюсь использовать для конвертации валют по корректному курсу. 

In [237]:
print(pd.to_datetime(X_public_prepared["calendar_last_scraped"]).describe())
print(f"Amount of null value: {X_public_prepared['calendar_last_scraped'].isna().sum()}")

count                            72605
mean     2022-09-20 16:52:42.625163520
min                2022-09-07 00:00:00
25%                2022-09-15 00:00:00
50%                2022-09-25 00:00:00
75%                2022-09-27 00:00:00
max                2022-09-29 00:00:00
Name: calendar_last_scraped, dtype: object
Amount of null value: 0


In [None]:
median_price = X_public_prepared["price_usd"].median()
fig = px.histogram(X_public_prepared, x="price_usd", marginal="box", title="Price (USD) with Median")
fig.update_layout(bargap=0.1)


fig.add_shape(
    go.layout.Shape(
        type='line',
        x0=median_price,
        x1=median_price,
        y0=0,
        y1=1,
        xref='x',
        yref='paper',
        line=dict(color='red', width=2)
    )
)

fig.update_layout(xaxis_title_text='Price (USD)')
fig.show()

In [30]:
df_tmp["price_cat"].value_counts()

price_cat
0    127
Name: count, dtype: int64

In [28]:
def cat_price(row):
    if row['price_usd'] >= 5_000:
        return 0
    else:
        return 1
X_public_prepared["price_cat"] = X_public_prepared.apply(lambda row: cat_price(row), axis=1)

#### Exploring source

In [None]:
# fig = px.histogram(df_train, x="source")


fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram", "Scatter Plot"])

fig.add_trace(px.histogram(df_train, x="source").data[0], row=1, col=1)

fig.add_trace(px.scatter(df_train, x="source", y="price_usd").data[0], row=2, col=1)
fig.update_yaxes(title_text="Price USD", row=2, col=1)

fig.show()

Закодируем признак `Source` при помощи One-Hot encoding

#### Exploring description

In [6]:
X_train_prepared_t = X_train_prepared["description"].fillna("null")

In [9]:
print(f"Amount of null value: {X_train_prepared_t['description'].isna().sum()}")
X_train_prepared_t["description"].describe()

Amount of null value: 1316


count                                                 70598
unique                                                68274
top       Rilassati in questo spazio tranquillo in posiz...
freq                                                    142
Name: description, dtype: object

In [12]:
print(f"Amount of null value: {X_train_prepared['amenities'].isna().sum()}")
X_train_prepared["amenities"].describe()

Amount of null value: 0


count                           71914
unique                          68432
top       ["Long term stays allowed"]
freq                              177
Name: amenities, dtype: object

#### Exploring host_since

In [404]:
print(f"Amount of null value: {df_train['host_since'].isna().sum()}")
df_train["host_since"].describe()

Amount of null value: 0


count                            72441
mean     2017-01-01 08:55:41.930674688
min                2008-08-12 00:00:00
25%                2014-11-02 00:00:00
50%                2016-09-02 00:00:00
75%                2019-02-16 00:00:00
max                2022-09-25 00:00:00
Name: host_since, dtype: object

Закодируем признак `host_since` при помощи One-Hot encoding по годам

#### Exploring host_location

In [None]:
print(f"Number of null value: {X_train_prepared['host_location'].isna().sum()}")
print(f"Number of unique value: {len(X_train_prepared['host_location'].unique())}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["host_location"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Host location", "Scatter Plot"], vertical_spacing=0.5)

histogram_trace = px.histogram(X_train_prepared, x="host_location").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="host_location", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host location", row=2, col=1)

fig.update_layout(height=700)

fig.show()

#### Exploring host_response_time

In [None]:
print(f"Number of null value: {X_train_prepared['host_response_time'].isna().sum()}")
print(f"Number of unique values: {len(X_train_prepared['host_response_time'].unique())}")
print(f"Unique values: {X_train_prepared['host_response_time'].unique()}")
print(X_train_prepared["host_response_time"].describe())


fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram", "Scatter Plot"])
fig.add_trace(px.histogram(X_train_prepared, x="host_response_time").data[0], row=1, col=1)
fig.add_trace(px.scatter(X_train_prepared, x="host_response_time", y="price_usd").data[0], row=2, col=1)
fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.show()

Закодируем признак `host_response_time` при помощи One-Hot encoding

#### Exploring host_response_rate

In [None]:
print(f"Number of null value: {X_train_prepared['host_response_rate'].isna().sum()}")
print(X_train_prepared["host_response_rate"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram", "Scatter Plot"])
fig.add_trace(px.histogram(X_train_prepared, x="host_response_rate").data[0], row=1, col=1)
fig.add_trace(px.scatter(X_train_prepared, x="host_response_rate", y="price_usd").data[0], row=2, col=1)
fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.show()

#### Exploning host_acceptance_rate

In [None]:
print(f"Number of null value: {X_train_prepared['host_acceptance_rate'].isna().sum()}")
corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_acceptance_rate"])[0, 1]
print(f"Corr coef between price_usd and host_acceptance_rate: {corr}")
print(X_train_prepared["host_acceptance_rate"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Host acceptance rate", "Scatter Plot"])
fig.add_trace(px.histogram(X_train_prepared, x="host_acceptance_rate").data[0], row=1, col=1)
fig.add_trace(px.scatter(X_train_prepared, x="host_acceptance_rate", y="price_usd").data[0], row=2, col=1)
fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host acceptance rate", row=2, col=1)
fig.show()

#### Exploring host_is_superhost

In [None]:
print(f"Number of null value: {X_train_prepared['host_is_superhost'].isna().sum()}")
# corr = np.corrcoef(df_train["price_usd"], df_train["host_is_superhost"])[0, 1]
# print(f"Corr coef between price_usd and host_is_superhost: {corr}")
print(X_train_prepared["host_is_superhost"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Host is superhost", "Scatter Plot"])
fig.add_trace(px.histogram(X_train_prepared, x="host_is_superhost").data[0], row=1, col=1)
fig.add_trace(px.scatter(X_train_prepared, x="host_is_superhost", y="price_usd").data[0], row=2, col=1)
fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host is superhost", row=2, col=1)
fig.show()

#### Exploring host_neighbourhood

In [None]:
print(f"Number of null value: {df_train['host_neighbourhood'].isna().sum()}")
print(f"Number of unique value: {len(df_train['host_neighbourhood'].unique())}")
# corr = np.corrcoef(df_train["price_usd"], df_train["host_is_superhost"])[0, 1]
# print(f"Corr coef between price_usd and host_is_superhost: {corr}")
print(df_train["host_neighbourhood"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of host neighbourhood", "Scatter Plot"])

histogram_trace = px.histogram(df_train, x="host_neighbourhood").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(df_train, x="host_neighbourhood", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_layout(height=1000)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host Neighbourhood", row=2, col=1)

fig.show()

#### Exploring host_listings_count

In [None]:
print(f"Number of null value: {X_train_prepared['host_listings_count'].isna().sum()}")
corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_listings_count"])[0, 1]
print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["host_listings_count"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of host listings count", "Scatter Plot"])

histogram_trace = px.histogram(X_train_prepared, x="host_listings_count").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="host_listings_count", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host listings count", row=2, col=1)

fig.show()

#### Exploring host_total_listings_count

In [None]:
print(f"Number of null value: {X_train_prepared['host_total_listings_count'].isna().sum()}")
corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_listings_count"])[0, 1]
print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["host_total_listings_count"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Host total listings count", "Scatter Plot"])

histogram_trace = px.histogram(X_train_prepared, x="host_total_listings_count").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="host_total_listings_count", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host total listings count", row=2, col=1)

fig.show()

#### Exploring host_verifications

In [None]:
print(f"Number of null value: {X_train_prepared['host_verifications'].isna().sum()}")
print(f"Number of unique value: {X_train_prepared['host_verifications'].unique()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["host_verifications"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Host verifications", "Scatter Plot"])

histogram_trace = px.histogram(X_train_prepared, x="host_verifications").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="host_verifications", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host verifications", row=2, col=1)

fig.show()

#### Exploring host_has_profile_pic

In [None]:
print(f"Number of null value: {X_train_prepared['host_has_profile_pic'].isna().sum()}")
print(f"Number of unique value: {X_train_prepared['host_has_profile_pic'].unique()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["host_has_profile_pic"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Host has profile pic", "Scatter Plot"])

histogram_trace = px.histogram(X_train_prepared, x="host_has_profile_pic").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="host_has_profile_pic", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host has profile pic", row=2, col=1)

fig.show()

#### Exploring host_identity_verified

In [None]:
print(f"Number of null value: {X_train_prepared['host_identity_verified'].isna().sum()}")
print(f"Number of unique value: {X_train_prepared['host_identity_verified'].unique()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["host_identity_verified"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Host identity verified", "Scatter Plot"])

histogram_trace = px.histogram(X_train_prepared, x="host_identity_verified").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="host_identity_verified", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Host identity verified", row=2, col=1)

fig.show()

#### Exploring neighbourhood

In [None]:
print(f"Number of null value: {X_train_prepared['neighbourhood'].isna().sum()}")
print(f"Number of unique value: {len(X_train_prepared['neighbourhood'].unique())}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["neighbourhood"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Neighbourhood", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="neighbourhood").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="neighbourhood", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Neighbourhood", row=2, col=1, tickangle=45)
fig.update_xaxes(row=1, col=1, tickangle=45)

fig.update_layout(height=1000)

fig.show()

#### Exploring neighbourhood_cleansed

In [None]:
print(f"Number of null value: {X_train_prepared['neighbourhood_cleansed'].isna().sum()}")
print(f"Number of unique value: {len(X_train_prepared['neighbourhood_cleansed'].unique())}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["neighbourhood_cleansed"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Neighbourhood cleansed", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="neighbourhood_cleansed").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="neighbourhood_cleansed", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Neighbourhood cleansed", row=2, col=1)

fig.update_layout(height=900)

fig.show()

#### Exploring neighbourhood_group_cleansed

In [None]:
print(f"Number of null value: {X_train_prepared['neighbourhood_group_cleansed'].isna().sum()}")
print(f"Number of unique value: {len(X_train_prepared['neighbourhood_group_cleansed'].unique())}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["neighbourhood_group_cleansed"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Neighbourhood group cleansed", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="neighbourhood_group_cleansed").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="neighbourhood_group_cleansed", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Neighbourhood group cleansed", row=2, col=1)

# fig.update_layout(height=900)

fig.show()

#### Exploring latitude & longitude

In [None]:
print(f"Number of null latitude value: {X_train_prepared['latitude'].isna().sum()}")
print(f"Number of null longitude value: {X_train_prepared['longitude'].isna().sum()}")

fig = go.Figure(go.Densitymapbox(
    lat=X_train_prepared['latitude'],
    lon=X_train_prepared['longitude'],
    z=X_train_prepared['price_usd'],
    radius=10,
    colorbar=dict(
        title='Price (USD)',
        tickvals=[X_train_prepared['price_usd'].min(), X_train_prepared['price_usd'].max()],
    ),
))

fig.update_layout(
    mapbox=dict(
        center=dict(lat=X_train_prepared['latitude'].mean(), lon=X_train_prepared['longitude'].mean()),
        style='open-street-map',
        zoom=10,
    ),
    showlegend=False,
    margin=dict(l=15, r=15, t=15, b=15),
)

fig.show()

#### Exploring property_type

In [None]:
print(f"Number of null value: {X_train_prepared['property_type'].isna().sum()}")
print(f"Number of unique value: {len(X_train_prepared['property_type'].unique())}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["property_type"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Property type", "Scatter Plot"], vertical_spacing=0.4)

histogram_trace = px.histogram(X_train_prepared, x="property_type").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="property_type", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Property type", row=2, col=1)

fig.update_layout(height=900)

fig.show()

#### Exploring room_type

In [None]:
print(f"Number of null value: {X_train_prepared['room_type'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["room_type"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Room type", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="room_type").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="room_type", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Room type", row=2, col=1)

fig.show()

#### Exploring accommodates

In [None]:
print(f"Number of null value: {X_train_prepared['accommodates'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["accommodates"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Accommodates", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="accommodates").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="accommodates", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Accommodates", row=2, col=1)

fig.show()

#### Exploring bathrooms

In [None]:
print(f"Number of null value: {X_train_prepared['bathrooms'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["bathrooms"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Bathrooms", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="bathrooms").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="bathrooms", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Bathrooms", row=2, col=1)

fig.show()

#### Exploring bathrooms_text

In [None]:
print(f"Number of null value: {X_train_prepared['bathrooms_text'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["bathrooms_text"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Bathrooms text", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="bathrooms_text").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="bathrooms_text", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Bathrooms text", row=2, col=1)

fig.update_layout(height=800)

fig.show()

#### Exploring bedrooms

In [None]:
print(f"Number of null value: {X_train_prepared['bedrooms'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["bedrooms"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Bedrooms", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="bedrooms").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="bedrooms", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Bedrooms", row=2, col=1)

fig.show()

#### Exploring beds

In [None]:
print(f"Number of null value: {X_train_prepared['beds'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["beds"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Beds", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="beds").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="beds", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Beds", row=2, col=1)

fig.show()

#### Exploring amenities

In [2]:
print(f"Number of null value: {X_train_prepared['amenities'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["amenities"].describe())

Number of null value: 0
count                           71372
unique                          68082
top       ["Long term stays allowed"]
freq                              177
Name: amenities, dtype: object


In [27]:
def remove_sumbol(s):
    translation_table = str.maketrans("", "", '[]"')
    return s.translate(translation_table)

X_train_prepared["amenities"] = X_train_prepared["amenities"].apply(remove_sumbol)


In [28]:
s_0 = X_train_prepared["amenities"][0]
s_1 = X_train_prepared["amenities"][1]
s_2 = X_train_prepared["amenities"][2]
s_3 = X_train_prepared["amenities"][4]
s_4 = X_train_prepared["amenities"][5]


print(s_0)
print(s_1)
print(s_2)
print(s_3)
print(s_4)

Free parking on premises, Children\u2019s books and toys, Heating, Host greets you, Cooking basics, Kitchen, Coffee maker, Iron, Long term stays allowed, Essentials, First aid kit, Hangers, Washer, Hair dryer, Wifi, Microwave, Free street parking, Hot water, Luggage dropoff allowed, Backyard, TV, Dishes and silverware, Smoke alarm, Lake access, Fire extinguisher, Crib, Refrigerator, Stove
TV, Heating, Dishes and silverware, Kitchen, Dryer, Smart lock, Elevator, Washer, Air conditioning, Refrigerator, Long term stays allowed, Wifi
Free parking on premises, Children\u2019s books and toys, High chair, Heating, Cooking basics, Kitchen, Coffee maker, 32\ TV, Iron, Dedicated workspace, Long term stays allowed, Private entrance, Essentials, First aid kit, Ski-in/Ski-out, Extra pillows and blankets, Hangers, Single level home, Shampoo, Hair dryer, Private fenced garden or backyard, Wifi, Microwave, Free street parking, Clothing storage, Air conditioning, Carbon monoxide alarm, Bed linens, Hot 

In [30]:
categories_list = []

def categories_count(row):
    categories = eval(row)
    categories_list.extend(categories)

X_train_prepared["amenities"].apply(categories_count)

unique_categories = set(categories_list)

print(len(unique_categories))
print(unique_categories)

5246
{'Eco friendly shampoo shampoo', 'FRANKE stainless steel oven', 'Whirpool refrigerator', 'Indesit  stainless steel stove', 'General 120 litri refrigerator', 'Premium brands conditioner', 'Private outdoor pool - infinity, lap pool, saltwater', '55" HDTV with Apple TV, Netflix, standard cable', 'Prodotti locali conditioner', 'Rex stainless steel induction stove', 'Shared outdoor pool - heated, infinity, rooftop', 'L’Occitane Verveine body soap', 'Non specificato body soap', 'Waterfront', 'SHAMPOO conditioner', 'TV with Amazon Prime Video', 'Fast wifi – 362 Mbps', 'Mondoconvenienza refrigerator', 'Daikin refrigerator', 'Fast wifi – 88 Mbps', 'FRANKE stainless steel gas stove', 'Fast wifi – 112 Mbps', '65" HDTV with DVD player', 'Shared outdoor pool - available seasonally, open specific hours, olympic-sized, sun loungers', 'WHIRPOOL induction stove', 'Alpesinox stainless steel gas stove', 'Coop shampoo', '52" HDTV with Netflix, standard cable, Amazon Prime Video', '44" HDTV with Netfl

#### Exploring minimum_nights

In [None]:
print(f"Number of null value: {X_train_prepared['minimum_nights'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["minimum_nights"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Minimum nights", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="minimum_nights").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="minimum_nights", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Minimum nights", row=2, col=1)

fig.show()

#### Exploring maximum_nights

In [None]:
print(f"Number of null value: {X_train_prepared['maximum_nights'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["maximum_nights"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Maximum nights", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="maximum_nights").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="maximum_nights", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Maximum nights", row=2, col=1)

fig.show()

#### Exploring minimum_minimum_nights

In [None]:
print(f"Number of null value: {X_train_prepared['minimum_minimum_nights'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["minimum_minimum_nights"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Minimum minimum nights", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="minimum_minimum_nights").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="minimum_minimum_nights", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Minimum minimum nights", row=2, col=1)

fig.show()

#### Exploring maximum_minimum_nights

In [None]:
print(f"Number of null value: {X_train_prepared['maximum_minimum_nights'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["maximum_minimum_nights"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Maximum minimum nights", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="maximum_minimum_nights").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="maximum_minimum_nights", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Maximum minimum nights", row=2, col=1)

fig.show()

#### Exploring minimum_maximum_nights

In [None]:
print(f"Number of null value: {X_train_prepared['minimum_maximum_nights'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["minimum_maximum_nights"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Minimum maximum nights", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="minimum_maximum_nights").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="minimum_maximum_nights", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Minimum maximum nights", row=2, col=1)

fig.show()

#### Exploring maximum_maximum_nights

In [None]:
print(f"Number of null value: {X_train_prepared['maximum_maximum_nights'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["maximum_maximum_nights"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Maximum maximum nights", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="maximum_maximum_nights").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="maximum_maximum_nights", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Maximum maximum nights", row=2, col=1)

fig.show()

#### Exploring minimum_nights_avg_ntm

In [None]:
print(f"Number of null value: {X_train_prepared['minimum_nights_avg_ntm'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["minimum_nights_avg_ntm"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Minimum nights avg ntm", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="minimum_nights_avg_ntm").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="minimum_nights_avg_ntm", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Minimum nights avg ntm", row=2, col=1)

fig.show()

#### Exploring maximum_nights_avg_ntm

In [None]:
print(f"Number of null value: {X_train_prepared['maximum_nights_avg_ntm'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["maximum_nights_avg_ntm"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Maximum nights avg ntm", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="maximum_nights_avg_ntm").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="maximum_nights_avg_ntm", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Maximum nights avg ntm", row=2, col=1)

fig.show()

#### Exploring has_availability

In [None]:
print(f"Number of null value: {X_train_prepared['has_availability'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["has_availability"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Has availability", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="has_availability").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="has_availability", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Has availability", row=2, col=1)

fig.show()

#### Exploring availability_30 & availability_60 & availability_90 & availability_365

In [None]:
print(f"Number of null value: {X_train_prepared['availability_30'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")

fig = make_subplots(rows=4, cols=2, subplot_titles=["Histogram of Availability 30", "Scatter Plot"], vertical_spacing=0.1)

histogram_trace_1 = px.histogram(X_train_prepared, x="availability_30").data[0]
histogram_trace_2 = px.histogram(X_train_prepared, x="availability_60").data[0]
histogram_trace_3 = px.histogram(X_train_prepared, x="availability_90").data[0]
histogram_trace_4 = px.histogram(X_train_prepared, x="availability_365").data[0]
fig.add_trace(histogram_trace_1, row=1, col=1)
fig.add_trace(histogram_trace_2, row=2, col=1)
fig.add_trace(histogram_trace_3, row=3, col=1)
fig.add_trace(histogram_trace_4, row=4, col=1)

scatter_trace_1 = px.scatter(X_train_prepared, x="availability_30", y="price_usd").data[0]
scatter_trace_2 = px.scatter(X_train_prepared, x="availability_60", y="price_usd").data[0]
scatter_trace_3 = px.scatter(X_train_prepared, x="availability_90", y="price_usd").data[0]
scatter_trace_4 = px.scatter(X_train_prepared, x="availability_365", y="price_usd").data[0]
fig.add_trace(scatter_trace_1, row=1, col=2)
fig.add_trace(scatter_trace_2, row=2, col=2)
fig.add_trace(scatter_trace_3, row=3, col=2)
fig.add_trace(scatter_trace_4, row=4, col=2)

fig.update_yaxes(title_text="Price USD", row=1, col=2)
fig.update_yaxes(title_text="Price USD", row=2, col=2)
fig.update_yaxes(title_text="Price USD", row=3, col=2)
fig.update_yaxes(title_text="Price USD", row=4, col=2)
fig.update_xaxes(title_text="Availability 30", row=1, col=2)
fig.update_xaxes(title_text="Availability 60", row=2, col=2)
fig.update_xaxes(title_text="Availability 90", row=3, col=2)
fig.update_xaxes(title_text="Availability 365", row=4, col=2)

fig.update_layout(height=1000)

fig.show()

#### Exploring calendar_last_scraped

In [4]:
print(f"Number of null value: {X_train_prepared['calendar_last_scraped'].isna().sum()}")
X_train_prepared["calendar_last_scraped"].describe()

Number of null value: 0


count                            71914
mean     2022-09-20 17:19:26.493311232
min                2022-09-07 00:00:00
25%                2022-09-15 00:00:00
50%                2022-09-25 00:00:00
75%                2022-09-27 00:00:00
max                2022-09-29 00:00:00
Name: calendar_last_scraped, dtype: object

#### Exploring calendar_updated

In [5]:
print(f"Number of null value: {X_train_prepared['calendar_updated'].isna().sum()}")
X_train_prepared["calendar_updated"].describe()

Number of null value: 71914


count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: calendar_updated, dtype: float64

#### Exploring number_of_reviews

In [None]:
print(f"Number of null value: {X_train_prepared['number_of_reviews'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["number_of_reviews"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Number of reviews", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="number_of_reviews").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="number_of_reviews", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Number of reviews", row=2, col=1)

fig.show()

#### Exploring number_of_reviews_ltm

In [None]:
print(f"Number of null value: {X_train_prepared['number_of_reviews_ltm'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["number_of_reviews_ltm"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Number of reviews ltm", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="number_of_reviews_ltm").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="number_of_reviews_ltm", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Number of reviews ltm", row=2, col=1)

fig.show()

#### Exploring number_of_reviews_l30d

In [None]:
print(f"Number of null value: {X_train_prepared['number_of_reviews_l30d'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["number_of_reviews_l30d"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Number of reviews l30d", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="number_of_reviews_l30d").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="number_of_reviews_l30d", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Number of reviews l30d", row=2, col=1)

fig.show()

#### Exploring review_scores_rating

In [None]:
print(f"Number of null value: {X_train_prepared['review_scores_rating'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["review_scores_rating"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Review scores rating", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="review_scores_rating").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="review_scores_rating", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Review scores rating", row=2, col=1)

fig.show()

#### Exploring review_scores_accuracy

In [None]:
print(f"Number of null value: {X_train_prepared['review_scores_accuracy'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["review_scores_accuracy"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Review scores accuracy", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="review_scores_accuracy").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="review_scores_accuracy", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Review scores accuracy", row=2, col=1)

fig.show()

#### Exploring review_scores_cleanliness

In [None]:
print(f"Number of null value: {X_train_prepared['review_scores_cleanliness'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["review_scores_cleanliness"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Review scores cleanliness", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="review_scores_cleanliness").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="review_scores_cleanliness", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Review scores cleanliness", row=2, col=1)

fig.show()

#### Exploring review_scores_checkin

In [None]:
print(f"Number of null value: {X_train_prepared['review_scores_checkin'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["review_scores_checkin"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Review scores checkin", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="review_scores_checkin").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="review_scores_checkin", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Review scores checkin", row=2, col=1)

fig.show()

#### Exploring review_scores_communication

In [None]:
print(f"Number of null value: {X_train_prepared['review_scores_communication'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["review_scores_communication"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Review scores communication", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="review_scores_communication").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="review_scores_communication", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Review scores communication", row=2, col=1)

fig.show()

#### Exploring review_scores_location

In [None]:
print(f"Number of null value: {X_train_prepared['review_scores_location'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["review_scores_location"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Review scores location", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="review_scores_location").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="review_scores_location", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Review scores location", row=2, col=1)

fig.show()

#### Exploring review_scores_value

In [None]:
print(f"Number of null value: {X_train_prepared['review_scores_value'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["review_scores_value"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Review scores value", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="review_scores_value").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="review_scores_value", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Review scores value", row=2, col=1)

fig.show()

#### Exploring license

In [None]:
print(f"Number of null value: {X_train_prepared['license'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["license"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of License", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="license").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="license", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="License", row=2, col=1)

fig.show()

#### Exploring instant_bookable

In [None]:
print(f"Number of null value: {X_train_prepared['instant_bookable'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["instant_bookable"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Instant bookable", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="instant_bookable").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="instant_bookable", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Instant bookable", row=2, col=1)

fig.show()

#### Exploring calculated_host_listings_count

In [None]:
print(f"Number of null value: {X_train_prepared['calculated_host_listings_count'].isna().sum()}")
print(X_train_prepared["calculated_host_listings_count"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Calculated host listings count", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="calculated_host_listings_count").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="calculated_host_listings_count", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Calculated host listings count", row=2, col=1)

fig.show()

#### Exploring calculated_host_listings_count_entire_homes

In [None]:
print(f"Number of null calculated_host_listings_count_entire_homes value: {X_train_prepared['calculated_host_listings_count_entire_homes'].isna().sum()}")
print(f"Number of null calculated_host_listings_count_private_rooms value: {X_train_prepared['calculated_host_listings_count_private_rooms'].isna().sum()}")
print(f"Number of null calculated_host_listings_count_shared_rooms value: {X_train_prepared['calculated_host_listings_count_shared_rooms'].isna().sum()}")
print(X_train_prepared["calculated_host_listings_count_entire_homes"].describe())
print(X_train_prepared["calculated_host_listings_count_private_rooms"].describe())
print(X_train_prepared["calculated_host_listings_count_shared_rooms"].describe())

fig = make_subplots(rows=3, cols=2, subplot_titles=["Histogram of Calculated host listings count entire homes", "Scatter Plot"], vertical_spacing=0.2)

column_names = ["calculated_host_listings_count_entire_homes", "calculated_host_listings_count_private_rooms", "calculated_host_listings_count_shared_rooms"]
target_column = "price_usd"

for i, column_name in enumerate(column_names, start=1):
    # Histograms
    histogram_trace = px.histogram(X_train_prepared, x=column_name).data[0]
    fig.add_trace(histogram_trace, row=i, col=1)
    
    # Scatter plots
    scatter_trace = px.scatter(X_train_prepared, x=column_name, y=target_column).data[0]
    fig.add_trace(scatter_trace, row=i, col=2)
    
    # Update axes
    fig.update_yaxes(title_text="Price USD", row=i, col=2)
    fig.update_xaxes(title_text=f"{column_name}", row=i, col=1)

fig.update_layout(height=800)
fig.show()

#### Exploring reviews_per_month

In [None]:
print(f"Number of null value: {X_train_prepared['reviews_per_month'].isna().sum()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["reviews_per_month"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Reviews per month", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="reviews_per_month").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="reviews_per_month", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Reviews per month", row=2, col=1)

fig.show()

#### Exploring city

In [None]:
print(f"Number of null value: {X_train_prepared['city'].isna().sum()}")
print(f"Unique value: {X_train_prepared['city'].unique()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["city"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of City", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="city").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="city", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="City", row=2, col=1)

fig.show()

#### Exploring type

In [None]:
print(f"Number of null value: {X_train_prepared['type'].isna().sum()}")
# print(f"Unique value: {X_train_prepared['type'].unique()}")
# corr = np.corrcoef(X_train_prepared["price_usd"], X_train_prepared["host_verifications"])[0, 1]
# print(f"Corr coef between price_usd and host_listings_count: {corr}")
print(X_train_prepared["type"].describe())

fig = make_subplots(rows=2, cols=1, subplot_titles=["Histogram of Type", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace = px.histogram(X_train_prepared, x="type").data[0]
fig.add_trace(histogram_trace, row=1, col=1)

scatter_trace = px.scatter(X_train_prepared, x="type", y="price_usd").data[0]
fig.add_trace(scatter_trace, row=2, col=1)

fig.update_yaxes(title_text="Price USD", row=2, col=1)
fig.update_xaxes(title_text="Type", row=2, col=1)

fig.show()

#### Exploring Region & City

In [None]:
print(f"Number of null region value: {X_train_prepared['region'].isna().sum()}")
print(f"Number of null city value: {X_train_prepared['city'].isna().sum()}")
print(f"Number of unique region value: {len(X_train_prepared['region'].unique())}")
print(f"Number of unique city value: {len(X_train_prepared['city'].unique())}")

fig = make_subplots(rows=2, cols=2, subplot_titles=["Histogram of Region & City", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace_1 = px.histogram(X_train_prepared, x="region").data[0]
histogram_trace_2 = px.histogram(X_train_prepared, x="city").data[0]
fig.add_trace(histogram_trace_1, row=1, col=1)
fig.add_trace(histogram_trace_2, row=2, col=1)

scatter_trace_1 = px.scatter(X_train_prepared, x="region", y="price_usd").data[0]
scatter_trace_2 = px.scatter(X_train_prepared, x="city", y="price_usd").data[0]
fig.add_trace(scatter_trace_1, row=1, col=2)
fig.add_trace(scatter_trace_2, row=2, col=2)

fig.update_yaxes(title_text="Price USD", row=1, col=2)
fig.update_yaxes(title_text="Price USD", row=2, col=2)
fig.update_xaxes(title_text="region", row=1, col=2)
fig.update_xaxes(title_text="city", row=2, col=2)
fig.update_xaxes(title_text="region", row=1, col=1)
fig.update_xaxes(title_text="city", row=2, col=1)

fig.update_layout(height=700)

fig.show()

#### Restore values in columns host_response_time & host_response_rate & host_acceptance_rate

In [2]:
print(f"Number of null host_response_time value: {X_public_prepared['host_response_time'].isna().sum()}")
print(f"Number of null host_response_rate value: {X_public_prepared['host_response_rate'].isna().sum()}")
print(f"Number of null host_acceptance_rate value: {X_public_prepared['host_acceptance_rate'].isna().sum()}")

Number of null host_response_time value: 14360
Number of null host_response_rate value: 14360
Number of null host_acceptance_rate value: 12006


In [3]:
selected_columns = ["host_response_time", "host_response_rate", "host_acceptance_rate", "host_is_superhost", "host_listings_count", "number_of_reviews"]
df_result = X_public_prepared[selected_columns]

filtered_rows = df_result.notna().all(axis=1)
df_result = df_result[filtered_rows]

# df_host_response_rate = pd.get_dummies(df_result["host_response_time"], drop_first=True)
df_host_is_superhost = pd.get_dummies(df_result["host_is_superhost"], drop_first=True)

# df_result = df_result.drop(["host_response_time"], axis=1)
df_result = df_result.drop(["host_is_superhost"], axis=1)
df_result = df_result.drop(["host_response_time", "host_is_superhost"], axis=1)
df_result = pd.concat([df_host_is_superhost, df_result], axis=1)

# df_result["host_is_superhost"].replace({"f":False, "t":True}, inplace=True)
df_result["host_response_time"].replace({"within a day":1, "within a few hours":2, "within an hour":3, "a few days or more":4}, inplace=True)

print(df_result.shape)
df_result.head()

(72590, 6)


Unnamed: 0,t,host_response_time,host_response_rate,host_acceptance_rate,host_listings_count,number_of_reviews
0,False,3,100.0,100.0,5.0,43
1,False,3,96.0,99.0,12.0,93
2,True,3,100.0,100.0,3.0,14
3,True,3,100.0,90.0,14.0,39
4,True,3,100.0,90.0,14.0,160


In [80]:
def corr_for_num(df, x, y):
    print(f"Нулевая гипотеза (H0): Нет линейной связи между двумя переменными {x} & {y}")
    print(f"Альтернативная гипотеза (H1): Существует линейная связь между двумя переменными {x} & {y}")

    correlation_coefficient, p_value = scipy.stats.pearsonr(df[x], df[y])
    alpha = 0.05

    print(f"Коэффициент корреляции: {correlation_coefficient}")
    print(f"P-value: {p_value}")

    if p_value < alpha:
        print("Отклоняем нулевую гипотезу: Есть статистически значимая линейная связь.")
    else:
        print("Не отклоняем нулевую гипотезу: Нет статистически значимой линейной связи.")
    
    fig = px.scatter(df, x=x, y=y)
    fig.update_layout(height=400, width=700)
    fig.show()
    

def corr_for_cat(df, x, y, y_0, y_1, y_2, y_3):
    print(f"Нулевая гипотеза (H0): Нет линейной связи между двумя переменными {x} & {y}")
    print(f"Альтернативная гипотеза (H1): Существует линейная связь между двумя переменными {x} & {y}")

    f_statistic, p_value = scipy.stats.f_oneway(df[f"{x}"][df[f"{y}"] == y_0],
                                                df[f"{x}"][df[f"{y}"] == y_1],
                                                df[f"{x}"][df[f"{y}"] == y_2],
                                                df[f"{x}"][df[f"{y}"] == y_3])

    alpha = 0.05

    print(f"Статистика теста: {f_statistic}")
    print(f"P-value: {p_value}")

    if p_value < alpha:
        print("Отклоняем нулевую гипотезу: Есть статистически значимая линейная связь.")
    else:
        print("Не отклоняем нулевую гипотезу: Нет статистически значимой линейной связи.")
    

In [None]:
corr_for_num(df_result, x="host_response_rate", y="host_acceptance_rate")

In [None]:
corr_for_num(df_result, x="host_response_rate", y="host_listings_count")

In [None]:
corr_for_num(df_result, x="host_response_rate", y="number_of_reviews")

In [81]:
y_list = ["within an hour", "a few days or more", "within a few hours", "within a day"]
corr_for_cat(df_result, x="host_response_rate", y="host_response_time", y_0="within an hour", y_1="a few days or more", y_2="within a few hours", y_3="within a day")

Нулевая гипотеза (H0): Нет линейной связи между двумя переменными host_response_rate & host_response_time
Альтернативная гипотеза (H1): Существует линейная связь между двумя переменными host_response_rate & host_response_time
Статистика теста: 73308.88366811286
P-value: 0.0
Отклоняем нулевую гипотезу: Есть статистически значимая линейная связь.


In [82]:
y_list = ["within an hour", "a few days or more", "within a few hours", "within a day"]
corr_for_cat(df_result, x="host_acceptance_rate", y="host_response_time", y_0="within an hour", y_1="a few days or more", y_2="within a few hours", y_3="within a day")

Нулевая гипотеза (H0): Нет линейной связи между двумя переменными host_acceptance_rate & host_response_time
Альтернативная гипотеза (H1): Существует линейная связь между двумя переменными host_acceptance_rate & host_response_time
Статистика теста: 8694.813615098745
P-value: 0.0
Отклоняем нулевую гипотезу: Есть статистически значимая линейная связь.


In [151]:
def restore_num_var(df, target):
    models = [LinearRegression(), Ridge(), Lasso()]
    models_name = ["LinearReg", "Ridge", "Lasse"]
        
    best_mae = float('inf')
    best_y_pred = None
    X_train, X_test, y_train, y_test = train_test_split(df.drop(target, axis=1), df[target],test_size=0.2, random_state=20)
    for model, model_name in zip(models, models_name):
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        mae_train = mean_absolute_error(y_train, y_pred_train)
        mae_test = mean_absolute_error(y_test, y_pred_test)
        if best_mae > mae_test:
            best_mae = mae_test
            best_y_pred = y_pred_test
        
        print(model_name)
        print(f"mae_train = {mae_train}")
        print(f"mae_test = {mae_test}")
    
    n = 150
    a = pd.DataFrame({'x': range(len(y_test[:n])), 'y_test': y_test[:n], 'best_y_pred': best_y_pred[:n]})
    fig = px.line(a, x='x', y=['y_test', 'best_y_pred'], color_discrete_map={'y_test': 'blue', 'best_y_pred': 'red'})
    fig.update_layout(margin=dict(l=15, r=15, t=15, b=15))
    fig.show()
        
    return best_mae, y_test, best_y_pred


def restore_cat_var(df, target, binary=True):
    models = [LogisticRegression()]
    models_name = ["LogisticReg"]
        
    best_y_pred = None
    X_train, X_test, y_train, y_test = train_test_split(df.drop(target, axis=1), df[target],test_size=0.2, random_state=20)
    for model, model_name in zip(models, models_name):
        model.fit(X_train, y_train)
        y_pred_train = model.predict(X_train)
        y_pred_test = model.predict(X_test)
        accuracy_train = accuracy_score(y_train, y_pred_train)
        accuracy_test = accuracy_score(y_test, y_pred_test)
        
        if binary:
            f1_score_train = f1_score(y_train, y_pred_train)
            f1_score_test = f1_score(y_test, y_pred_test)
        else:
            f1_score_train = f1_score(y_train, y_pred_train, average='weighted')
            f1_score_test = f1_score(y_test, y_pred_test, average='weighted')
            
        best_y_pred = y_pred_test
        
        print(model_name)
        print(f"accuracy_score = {accuracy_train}")
        print(f"accuracy_test = {accuracy_test}")
        print(f"f1_score_train = {f1_score_train}")
        print(f"f1_score_test = {f1_score_test}")
        
    return best_y_pred

In [None]:
best_mae, y_test, best_y_pred = restore_num_var(df_result, target="host_response_rate")

In [152]:
best_y_pred = restore_cat_var(df_result, target="host_response_time", binary=False)

LogisticReg
accuracy_score = 0.7558203609312577
accuracy_test = 0.7558892409422785
f1_score_train = 0.6908075638240082
f1_score_test = 0.6910031343805625


### Feature Engeenering

#### Exploring distance_to_centre_city & cat_distance

In [None]:
print(X_train_prepared["distance_to_centre_city"].describe())

fig = make_subplots(rows=2, cols=2, subplot_titles=["Histogram of Distance to centre city", "Scatter Plot"], vertical_spacing=0.3)

histogram_trace_1 = px.histogram(X_train_prepared, x="distance_to_centre_city").data[0]
histogram_trace_2 = px.histogram(X_train_prepared, x="cat_distance").data[0]
fig.add_trace(histogram_trace_1, row=1, col=1)
fig.add_trace(histogram_trace_2, row=2, col=1)

scatter_trace_1 = px.scatter(X_train_prepared, x="distance_to_centre_city", y="price_usd").data[0]
bax_trace_2 = px.box(X_train_prepared, x="cat_distance", y="price_usd", points="all").data[0]
fig.add_trace(scatter_trace_1, row=1, col=2)
fig.add_trace(bax_trace_2, row=2, col=2)

fig.update_yaxes(title_text="Price USD", row=1, col=2)
fig.update_yaxes(title_text="Price USD", row=2, col=2)
fig.update_xaxes(title_text="Distance to centre city", row=1, col=1)
fig.update_xaxes(title_text="Cat distance", row=2, col=1)

fig.update_layout(height=700)

fig.show()

### Data Preparation

In [2]:
X_train, X_test, y_train, y_test = train_test_split(X_public_encoded.drop("price_cat", axis=1), X_public_encoded["price_cat"], test_size=0.2, random_state=20)

In [None]:
a = X_public_encoded["price_cat"].value_counts()
unique_values = [0, 1]
class_frequencies = [a.loc[0], a.loc[1]]
class_weights = [sum(class_frequencies) / (len(unique_values) * freq) for freq in class_frequencies]
class_weights

In [4]:
cat_features = [0, 6, 21, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 37, 40, 42, 44]
text_features = [47, 48]

In [None]:
def cat_eval(iterations, depth, learning_rate, l2_leaf_reg, min_data_in_leaf, class_weights):
    params = {
        'iterations': int(iterations),
        'depth': int(depth),
        'learning_rate': learning_rate,
        'l2_leaf_reg': l2_leaf_reg,
        'min_data_in_leaf': int(min_data_in_leaf),
        'cat_features': cat_features,
        'text_features': text_features,
        "class_weights": class_weights,
        'verbose': False
    }
    
    cv_scores = cross_val_score(catboost.CatBoostClassifier(**params), 
                                X_train, 
                                y_train, 
                                cv=3)
    return cv_scores.mean()

pbounds = {
    'iterations': (100, 5000),
    'depth': (4, 12),
    'learning_rate': (0.001, 0.2),
    'l2_leaf_reg': (1, 10),
    'min_data_in_leaf': (1, 40),
    "class_weights": class_weights
}

cat_opt = BayesianOptimization(
    f=cat_eval,
    pbounds=pbounds
)

cat_opt.maximize(n_iter=4, init_points=3)

In [None]:
iterations = int(692.6)
depth = int(4.604)
learning_rate = 0.1865
l2_leaf_reg = 2.65
min_data_in_leaf = 19.76

clf = catboost.CatBoostClassifier(cat_features=cat_features, 
                                  text_features=text_features,
                                  iterations=iterations,
                                  depth=depth,
                                  learning_rate=learning_rate,
                                  l2_leaf_reg=l2_leaf_reg,
                                  min_data_in_leaf=min_data_in_leaf,
                                  class_weights=class_weights,
                                  verbose=100)
clf.fit(X_train, y_train)

In [None]:
feature_importance = clf.feature_importances_
feature_names = clf.feature_names_
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": feature_importance
})
importance_df = importance_df.sort_values(by='Importance', ascending=False)
fig = px.bar(
    importance_df,
    x='Importance',
    y='Feature',
    title='Feature Importance',
    labels={'Importance': 'SHAP Values'},
    height=1000
)
fig.show()

In [5]:
y_probs = clf.predict_proba(X_test)[:, 1]

In [6]:
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

fig = go.Figure()

fig.add_trace(go.Scatter(x=fpr, y=tpr, mode='lines', name=f'ROC Curve (AUC={roc_auc:.2f})'))
fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode='lines', name='Random', line={'dash': 'dash'}))

fig.update_layout(
    title='Receiver Operating Characteristic (ROC) Curve',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    showlegend=True
)

fig.update_layout(height=500, width=700)

fig.show()

AUC = 0.92

In [None]:
iterations = int(692.6)
depth = int(4.604)
learning_rate = 0.1865
l2_leaf_reg = 2.65
min_data_in_leaf = 19.76
clf = catboost.CatBoostClassifier(cat_features=cat_features, 
                                  text_features=text_features,
                                  iterations=iterations,
                                  depth=depth,
                                  learning_rate=learning_rate,
                                  l2_leaf_reg=l2_leaf_reg,
                                  min_data_in_leaf=min_data_in_leaf,
                                  class_weights=class_weights,
                                  verbose=100)
train_pool = catboost.Pool(data=X_public_encoded.drop("price_cat", axis=1), label=X_public_encoded["price_cat"], cat_features=cat_features, text_features=text_features)
clf.fit(train_pool)

In [7]:
y_pred = clf.predict(X_privat_encoded)
X_privat_encoded["price_cat"] = y_pred

In [8]:
X_public_encoded["price_usd"] = X_public_prepared["price_usd"]

In [9]:
X_public_encoded_null = X_public_encoded[X_public_encoded["price_cat"] == 0]
X_public_encoded_one = X_public_encoded[X_public_encoded["price_cat"] == 1]
X_privat_encoded_null = X_privat_encoded[X_privat_encoded["price_cat"] == 0]
X_privat_encoded_one = X_privat_encoded[X_privat_encoded["price_cat"] == 1]

X_public_encoded_null = X_public_encoded_null.drop("price_cat", axis=1)
X_public_encoded_one = X_public_encoded_one.drop("price_cat", axis=1)
X_privat_encoded_null = X_privat_encoded_null.drop("price_cat", axis=1)
X_privat_encoded_one = X_privat_encoded_one.drop("price_cat", axis=1)

y_public_null = np.log(X_public_encoded_null["price_usd"])
y_public_one = np.log(X_public_encoded_one["price_usd"])

In [None]:
def cat_eval(iterations, depth, learning_rate, l2_leaf_reg, min_data_in_leaf, cat_features, text_features):
    params = {
        'iterations': int(iterations),
        'depth': int(depth),
        'learning_rate': learning_rate,
        'l2_leaf_reg': l2_leaf_reg,
        'min_data_in_leaf': int(min_data_in_leaf),
        'loss_function': 'RMSE',
        'eval_metric': 'MAE',
        'cat_features': cat_features,
        'text_features': text_features,
        'verbose': False
    }
    
    cv_scores = cross_val_score(catboost.CatBoostRegressor(**params), X_public_encoded_null.drop("price_usd", axis=1), y_public_null, cv=4, scoring='neg_mean_squared_error')
    return cv_scores.mean()

pbounds = {
    'iterations': (100, 2000),
    'depth': (4, 10),
    'learning_rate': (0.001, 0.1),
    'l2_leaf_reg': (1, 10),
    'min_data_in_leaf': (1, 40),
    "cat_features": cat_features,
    "text_features": text_features
}

cat_opt = BayesianOptimization(
    f=cat_eval,
    pbounds=pbounds
)

cat_opt.maximize(n_iter=4, init_points=3)

In [None]:
def cat_eval(iterations, depth, learning_rate, l2_leaf_reg, min_data_in_leaf, cat_features, text_features):
    params = {
        'iterations': int(iterations),
        'depth': int(depth),
        'learning_rate': learning_rate,
        'l2_leaf_reg': l2_leaf_reg,
        'min_data_in_leaf': int(min_data_in_leaf),
        'loss_function': 'RMSE',
        'eval_metric': 'MAE',
        'cat_features': cat_features,
        'text_features': text_features,
        'verbose': False
    }
    
    cv_scores = cross_val_score(catboost.CatBoostRegressor(**params), X_public_encoded_one.drop("price_usd", axis=1), y_public_one, cv=2, scoring='neg_mean_squared_error')
    return cv_scores.mean()

pbounds = {
    'iterations': (100, 2000),
    'depth': (4, 10),
    'learning_rate': (0.001, 0.1),
    'l2_leaf_reg': (1, 10),
    'min_data_in_leaf': (1, 40),
    "cat_features": cat_features,
    "text_features": text_features
}

cat_opt = BayesianOptimization(
    f=cat_eval,
    pbounds=pbounds
)

cat_opt.maximize(n_iter=4, init_points=3) 

In [None]:
depth = 6.806
iter = 321.4
l2 = 6.973
lr = 0.08957
md = 31.98 
cgb_null = catboost.CatBoostRegressor(
    cat_features=cat_features, 
    text_features=text_features,
    depth=round(depth),
    iterations=round(iter),
    l2_leaf_reg=l2,
    learning_rate=lr,
    min_data_in_leaf=md,
    random_state=20,
    verbose=100
)
train_pool = catboost.Pool(data=X_public_encoded_null.drop("price_usd", axis=1), label=y_public_null, cat_features=cat_features, text_features=text_features)
cgb_null.fit(train_pool)

depth = 6.599 
iter = 310.5
l2 = 4.699
lr = 0.04998
md = 18.5
cgb_one = catboost.CatBoostRegressor(
    cat_features=cat_features, 
    text_features=text_features,
    depth=round(depth),
    iterations=round(iter),
    l2_leaf_reg=l2,
    learning_rate=lr,
    min_data_in_leaf=md,
    random_state=20,
    verbose=100
)
train_pool = catboost.Pool(data=X_public_encoded_one.drop("price_usd", axis=1), label=y_public_one, cat_features=cat_features, text_features=text_features)
cgb_one.fit(train_pool)

In [16]:
y_pred_null = np.exp(cgb_null.predict(X_privat_encoded_null))
y_pred_one = np.exp(cgb_one.predict(X_privat_encoded_one))

In [17]:
id_privat = X_privat_prepared["id"]
X_privat_encoded["id"] = id_privat
X_privat_encoded.head()

id_privat_null = X_privat_encoded[X_privat_encoded["price_cat"] == 0]["id"]
id_privat_one = X_privat_encoded[X_privat_encoded["price_cat"] == 1]["id"]
id_privat_one.head()

2410    826134279509
3333     48656919301
3472    187540898068
3517    176879173440
5319    891949567697
Name: id, dtype: object

In [18]:
df_privat_preds_one = pd.DataFrame({
    "id": id_privat_one,
    "price": y_pred_one
})
df_privat_preds_null = pd.DataFrame({
    "id": id_privat_null,
    "price": y_pred_null
})
df_sub = pd.DataFrame({
    "id": X_privat_encoded["id"]
})

In [20]:
merged_df = pd.merge(df_privat_preds_one, df_privat_preds_null, on='id', how='outer')
final_df = pd.merge(merged_df, df_sub, on='id', how='outer')
final_df['price_x'].fillna(final_df['price_y'], inplace=True)

In [24]:
final_df = final_df.drop("price_y", axis=1)
result_df_df = pd.merge(df_sub, final_df, on='id')
result_df_df = result_df_df.sort_values(by='id')
result_df_df = result_df_df.sort_index()

In [26]:
result_df_df = result_df_df.rename(columns={"price_x": "price"})
result_df_df.head()

(91199, 2)


Unnamed: 0,id,price
0,390928174037,124.693935
1,588144551007,68.548676
2,922917834874,85.732756
3,126940038965,78.419018
4,440821704407,66.411448


In [27]:
result_df_df.to_csv("sub.csv", index=False)