In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

# import ensemble methods
from sklearn.ensemble import (
    BaggingClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
    StackingClassifier,
    RandomForestClassifier,
    AdaBoostRegressor,
)
from xgboost import (
    XGBClassifier,
    XGBRegressor,
)

# import base estimators
from sklearn.tree import (
    DecisionTreeClassifier,
    DecisionTreeRegressor
)
from sklearn.linear_model import (
    LogisticRegression,
    Ridge
)
from sklearn.svm import (
    SVC,
    SVR
)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    r2_score,
)
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.figure_factory as ff

import ast

import warnings

warnings.filterwarnings(
    "ignore", category=DeprecationWarning
)  # to avoid deprecation warnings


In [289]:
url = "https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Machine+Learning+Supervis%C3%A9/Boosting/listings.csv"
dataset = pd.read_csv(url)

dataset.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,241032,https://www.airbnb.com/rooms/241032,20160104002432,2016-01-04,Stylish Queen Anne Apartment,,Make your self at home in this charming one-be...,Make your self at home in this charming one-be...,none,,...,10.0,f,,WASHINGTON,f,moderate,f,f,2,4.07
1,953595,https://www.airbnb.com/rooms/953595,20160104002432,2016-01-04,Bright & Airy Queen Anne Apartment,Chemically sensitive? We've removed the irrita...,"Beautiful, hypoallergenic apartment in an extr...",Chemically sensitive? We've removed the irrita...,none,"Queen Anne is a wonderful, truly functional vi...",...,10.0,f,,WASHINGTON,f,strict,t,t,6,1.48
2,3308979,https://www.airbnb.com/rooms/3308979,20160104002432,2016-01-04,New Modern House-Amazing water view,New modern house built in 2013. Spectacular s...,"Our house is modern, light and fresh with a wa...",New modern house built in 2013. Spectacular s...,none,Upper Queen Anne is a charming neighborhood fu...,...,10.0,f,,WASHINGTON,f,strict,f,f,2,1.15
3,7421966,https://www.airbnb.com/rooms/7421966,20160104002432,2016-01-04,Queen Anne Chateau,A charming apartment that sits atop Queen Anne...,,A charming apartment that sits atop Queen Anne...,none,,...,,f,,WASHINGTON,f,flexible,f,f,1,
4,278830,https://www.airbnb.com/rooms/278830,20160104002432,2016-01-04,Charming craftsman 3 bdm house,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,Cozy family craftman house in beautiful neighb...,none,We are in the beautiful neighborhood of Queen ...,...,9.0,f,,WASHINGTON,f,strict,f,f,1,0.89


There are a lot of columns in this dataset. Display the dataset info.

In [290]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 92 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   id                                3818 non-null   int64  
 1   listing_url                       3818 non-null   object 
 2   scrape_id                         3818 non-null   int64  
 3   last_scraped                      3818 non-null   object 
 4   name                              3818 non-null   object 
 5   summary                           3641 non-null   object 
 6   space                             3249 non-null   object 
 7   description                       3818 non-null   object 
 8   experiences_offered               3818 non-null   object 
 9   neighborhood_overview             2786 non-null   object 
 10  notes                             2212 non-null   object 
 11  transit                           2884 non-null   object 
 12  thumbn

In [291]:
print(f'the number of rows is: {dataset.shape[0]}')
print(f'the number of columns is: {dataset.shape[1]}')

the number of rows is: 3818
the number of columns is: 92


Let's proceed to some visualization, first display the distribution of the price variable. You will have to preprocess it as it is not in a numerical format.

In [292]:
# Percentage of missing values for each column
missing_values = (dataset.isnull().sum() / dataset.shape[0]) * 100

# Filtrer pour afficher uniquement les colonnes avec des valeurs manquantes (> 1%)
missing_values = missing_values[missing_values > 5]

# Affichage des colonnes concernées
print("Colonnes avec des valeurs manquantes (> 15%):")
print(f'Nb de colonnes avec valeurs manquantes: {len(missing_values)}')
print()
print(missing_values)

Colonnes avec des valeurs manquantes (> 15%):
Nb de colonnes avec valeurs manquantes: 29

space                           14.903091
neighborhood_overview           27.029859
notes                           42.063908
transit                         24.463070
thumbnail_url                    8.381351
medium_url                       8.381351
xl_picture_url                   8.381351
host_about                      22.498690
host_response_time              13.698271
host_response_rate              13.698271
host_acceptance_rate            20.246202
host_neighbourhood               7.857517
neighbourhood                   10.895757
square_feet                     97.459403
weekly_price                    47.380828
monthly_price                   60.267156
security_deposit                51.126244
cleaning_fee                    26.977475
first_review                    16.422211
last_review                     16.422211
review_scores_rating            16.946045
review_scores_accuracy      

In [293]:
type(missing_values)

pandas.core.series.Series

In [294]:
print(f'the datatype of price column is: {dataset["price"].dtype}')

the datatype of price column is: object


In [295]:
dataset["price"]

0        $85.00
1       $150.00
2       $975.00
3       $100.00
4       $450.00
         ...   
3813    $359.00
3814     $79.00
3815     $93.00
3816     $99.00
3817     $87.00
Name: price, Length: 3818, dtype: object

In [296]:
dataset["price"][0:5]

0     $85.00
1    $150.00
2    $975.00
3    $100.00
4    $450.00
Name: price, dtype: object

The distribution of the target variable is skewed towards high values (this is a very usual situation when working with prices, many items are around the average price range and the higher the price, the fewer items there are). A standard way of working with such variables is to change the scale using the log function so the distribution becomes evenly distributed. Create a price_log variable that's equal to log(price)

In [297]:
# convertir le type de la cellule
dataset['price'] = dataset['price'].str.replace('$', '', regex=False)  # Remove the dollar sign
dataset['price'] = dataset['price'].str.replace(',', '', regex=False)  # Remove commas
dataset['price'] = dataset['price'].astype(float)  # Convert to float
dataset["price"][0:5]

0     85.0
1    150.0
2    975.0
3    100.0
4    450.0
Name: price, dtype: float64

In [298]:
dataset["price_log"] = np.log10(dataset["price"])
dataset[["price", "price_log"]][0:5]

Unnamed: 0,price,price_log
0,85.0,1.929419
1,150.0,2.176091
2,975.0,2.989005
3,100.0,2.0
4,450.0,2.653213


In [299]:
# basics stats
dataset.dtypes

id                                    int64
listing_url                          object
scrape_id                             int64
last_scraped                         object
name                                 object
                                     ...   
require_guest_profile_picture        object
require_guest_phone_verification     object
calculated_host_listings_count        int64
reviews_per_month                   float64
price_log                           float64
Length: 93, dtype: object

In [300]:
dataset.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,calculated_host_listings_count,reviews_per_month,price_log
count,3818.0,3818.0,3818.0,3816.0,3816.0,3818.0,3818.0,3818.0,3802.0,3812.0,...,3160.0,3165.0,3160.0,3167.0,3163.0,3162.0,0.0,3818.0,3191.0,3818.0
mean,5550111.0,20160100000000.0,15785560.0,7.157757,7.157757,47.628961,-122.333103,3.349398,1.259469,1.307712,...,9.636392,9.556398,9.786709,9.809599,9.608916,9.452245,,2.946307,2.078919,2.032163
std,2962660.0,0.0,14583820.0,28.628149,28.628149,0.043052,0.031745,1.977599,0.590369,0.883395,...,0.698031,0.797274,0.595499,0.568211,0.629053,0.750259,,5.893029,1.822348,0.245605
min,3335.0,20160100000000.0,4193.0,1.0,1.0,47.505088,-122.417219,1.0,0.0,0.0,...,2.0,3.0,2.0,2.0,4.0,2.0,,1.0,0.02,1.30103
25%,3258256.0,20160100000000.0,3275204.0,1.0,1.0,47.609418,-122.35432,2.0,1.0,1.0,...,9.0,9.0,10.0,10.0,9.0,9.0,,1.0,0.695,1.875061
50%,6118244.0,20160100000000.0,10558140.0,1.0,1.0,47.623601,-122.328874,3.0,1.0,1.0,...,10.0,10.0,10.0,10.0,10.0,10.0,,1.0,1.54,2.0
75%,8035127.0,20160100000000.0,25903090.0,3.0,3.0,47.662694,-122.3108,4.0,1.0,2.0,...,10.0,10.0,10.0,10.0,10.0,10.0,,2.0,3.0,2.176091
max,10340160.0,20160100000000.0,53208610.0,502.0,502.0,47.733358,-122.240607,16.0,8.0,7.0,...,10.0,10.0,10.0,10.0,10.0,10.0,,37.0,12.15,3.0


The distribution looks a lot better for prediction purposes after the log transformation!
    
Visualize the price against the following variables :

room type

beds

property type

In [301]:
px.histogram(dataset, "price_log")

In [302]:
# Create a box plot with customizations
fig = px.box(
    dataset,
    x="room_type",
    y="price_log",
    title="Logarithmic Prices by Room Type",
    labels={"room_type": "Room Type", "price_log": "Log of Price"},
    width=800,  # Width of the plot
    height=600  # Height of the plot
)

# Show the plot
fig.show()

In [303]:
# Create a box plot with customizations
fig = px.box(
    dataset,
    x="beds",
    y="price_log",
    title="Logarithmic Prices by Room Type",
    labels={"room_type": "Room Type", "price_log": "Log of Price"},
    width=800,  # Width of the plot
    height=600  # Height of the plot
)

# Show the plot
fig.show()

In [304]:
# Create a box plot with customizations
fig = px.box(
    dataset,
    x="property_type",
    y="price_log",
    title="Logarithmic Prices by Room Type",
    labels={"room_type": "Room Type", "price_log": "Log of Price"},
    width=800,  # Width of the plot
    height=600  # Height of the plot
)

# Show the plot
fig.show()

In [305]:
dataset[dataset['beds']==8][["price_log", "beds"]]

Unnamed: 0,price_log,beds
935,2.628389,8.0
997,2.544068,8.0
1069,1.579784,8.0
1073,1.612784,8.0


In [306]:
dataset[dataset['property_type']=="Boat"][["price_log", "property_type"]]

Unnamed: 0,price_log,property_type
212,1.929419,Boat
319,2.832509,Boat
561,2.176091,Boat
565,2.889302,Boat
567,2.396199,Boat
1917,1.977724,Boat
3263,2.176091,Boat
3303,1.875061,Boat


In [307]:
dataset[dataset['property_type']=="Treehouse"][["price_log", "property_type"]]

Unnamed: 0,price_log,property_type
1733,1.740363,Treehouse
2190,1.681241,Treehouse
2569,2.30103,Treehouse


Isolate the target variable in an object y and the other variables in an object X

In [308]:
# Separate target variable Y from features X
print("Separating labels from features...")

X = dataset.drop(columns=["price_log", "price"], axis= 1)
Y = dataset["price_log"]

print("Y (Target variable):")
print(Y.head())
print("\nX (Features):")
print(X.head())

Separating labels from features...
Y (Target variable):
0    1.929419
1    2.176091
2    2.989005
3    2.000000
4    2.653213
Name: price_log, dtype: float64

X (Features):
        id                           listing_url       scrape_id last_scraped  \
0   241032   https://www.airbnb.com/rooms/241032  20160104002432   2016-01-04   
1   953595   https://www.airbnb.com/rooms/953595  20160104002432   2016-01-04   
2  3308979  https://www.airbnb.com/rooms/3308979  20160104002432   2016-01-04   
3  7421966  https://www.airbnb.com/rooms/7421966  20160104002432   2016-01-04   
4   278830   https://www.airbnb.com/rooms/278830  20160104002432   2016-01-04   

                                  name  \
0         Stylish Queen Anne Apartment   
1   Bright & Airy Queen Anne Apartment   
2  New Modern House-Amazing water view   
3                   Queen Anne Chateau   
4       Charming craftsman 3 bdm house   

                                             summary  \
0                              

We will have to remove a certain number of variables that we do not know how to use at this point. Start by removing the variables that could be interpreted as an id , we will also remove the variables that contain long texts as we haven't learned about text processing yet.

We also have to remove all variables related to price, as they represent a risk of leak because of their direct link to the target variable, like monthly price.

A certain number of variables contain a very high amount of missing values, in some cases these missing values correspond to an information we can exploit, sometimes not. Remove these not so useful variables from the dataset, strat by checking the proportion of missing values for all variables.

Your dataset should only contain categorical and numerical variables after this step. Check if your final dataset contains the following variables :



In [309]:
# Percentage of missing values for each column
missing_values = (dataset.isnull().sum() / dataset.shape[0]) * 100

"""
# Filtrer pour afficher uniquement les colonnes avec des valeurs manquantes (> 1%)
missing_values = missing_values[missing_values > 1]
"""

# Affichage des colonnes concernées
print(f'Nb de colonnes avec valeurs manquantes: {len(missing_values)}')
print(missing_values.sort_values(ascending=False))

Nb de colonnes avec valeurs manquantes: 93
license             100.000000
square_feet          97.459403
monthly_price        60.267156
security_deposit     51.126244
weekly_price         47.380828
                       ...    
accommodates          0.000000
bed_type              0.000000
amenities             0.000000
price                 0.000000
price_log             0.000000
Length: 93, dtype: float64


In [310]:
X_clean = X.drop(["host_location", 
                "experiences_offered",
                  "host_about", 
                  "host_thumbnail_url",
                  "host_picture_url",
                  "license",
                  "monthly_price",
                  "square_feet",
                  "street", 
                  "neighbourhood", 
                  "neighbourhood_cleansed", 
                  "city", 
                  "state", 
                  "market", 
                  "smart_location",
                  "country_code",
                  "country",
                  "amenities",
                  "jurisdiction_names",
                  "first_review",
                  "last_review",
                  "calendar_last_scraped",
                  "weekly_price",
                  "notes",
                  "neighborhood_overview",
                  "name",
                  "description",
                  "listing_url" ,
                  "host_id",
                  "host_url",
                  "scrape_id",
                  "space",
                  "picture_url",
                  "last_scraped",
                  "host_name",
                  "medium_url",
                  "summary",
                  "xl_picture_url",
                  "transit",
                  "thumbnail_url",
                  "host_neighbourhood",
                  "zipcode",
                  "calendar_updated",
                  "license",
                  "id"], axis=1)

Are there any remaining missing values ? Is there a relevant way to replace those missing values without using imputing methods ? Are all the variables in a numerical format ? If not run some preprocessing to create a clean dataset.

In [311]:
# Percentage of missing values for each column
missing_values = (X_clean.isnull().sum() / X_clean.shape[0]) * 100

"""
# Filtrer pour afficher uniquement les colonnes avec des valeurs manquantes (> 1%)
missing_values = missing_values[missing_values > 1]
"""

# Affichage des colonnes concernées
print(f'Nb de colonnes avec valeurs manquantes: {len(missing_values)}')
print(missing_values.sort_values(ascending=False))

Nb de colonnes avec valeurs manquantes: 47
security_deposit                    51.126244
cleaning_fee                        26.977475
host_acceptance_rate                20.246202
review_scores_checkin               17.234154
review_scores_accuracy              17.234154
review_scores_value                 17.181771
review_scores_location              17.155579
review_scores_cleanliness           17.103195
review_scores_communication         17.050812
review_scores_rating                16.946045
reviews_per_month                   16.422211
host_response_rate                  13.698271
host_response_time                  13.698271
bathrooms                            0.419068
bedrooms                             0.157150
host_since                           0.052383
host_is_superhost                    0.052383
host_listings_count                  0.052383
host_total_listings_count            0.052383
host_verifications                   0.052383
host_has_profile_pic                 

In [312]:
X_clean.dtypes

host_since                           object
host_response_time                   object
host_response_rate                   object
host_acceptance_rate                 object
host_is_superhost                    object
host_listings_count                 float64
host_total_listings_count           float64
host_verifications                   object
host_has_profile_pic                 object
host_identity_verified               object
neighbourhood_group_cleansed         object
latitude                            float64
longitude                           float64
is_location_exact                    object
property_type                        object
room_type                            object
accommodates                          int64
bathrooms                           float64
bedrooms                            float64
beds                                float64
bed_type                             object
security_deposit                     object
cleaning_fee                    

In [313]:
X_clean.head()

Unnamed: 0,host_since,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,...,review_scores_communication,review_scores_location,review_scores_value,requires_license,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,2011-08-11,within a few hours,96%,100%,f,3.0,3.0,"['email', 'phone', 'reviews', 'kba']",t,t,...,10.0,9.0,10.0,f,f,moderate,f,f,2,4.07
1,2013-02-21,within an hour,98%,100%,t,6.0,6.0,"['email', 'phone', 'facebook', 'linkedin', 're...",t,t,...,10.0,10.0,10.0,f,f,strict,t,t,6,1.48
2,2014-06-12,within a few hours,67%,100%,f,2.0,2.0,"['email', 'phone', 'google', 'reviews', 'jumio']",t,t,...,10.0,10.0,10.0,f,f,strict,f,f,2,1.15
3,2013-11-06,,,,f,1.0,1.0,"['email', 'phone', 'facebook', 'reviews', 'jum...",t,t,...,,,,f,f,flexible,f,f,1,
4,2011-11-29,within an hour,100%,,f,2.0,2.0,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,t,...,10.0,9.0,9.0,f,f,strict,f,f,1,0.89


In [314]:
print('Processing cleaning fee...')

# Vérifier les premières valeurs avant le traitement
print("Before processing:")
print(X['cleaning_fee'].head())

# Suppression du symbole "$" et conversion en float
X['cleaning_fee'] = X['cleaning_fee'].apply(
    lambda x: float(x.replace('$', '').replace(',', '').strip()) if isinstance(x, str) else x
)

# Remplacement des valeurs manquantes par 0
X['cleaning_fee'] = X['cleaning_fee'].fillna(0)

print("After processing:")
print(X['cleaning_fee'].head())
print('Done.')

Processing cleaning fee...
Before processing:
0        NaN
1     $40.00
2    $300.00
3        NaN
4    $125.00
Name: cleaning_fee, dtype: object
After processing:
0      0.0
1     40.0
2    300.0
3      0.0
4    125.0
Name: cleaning_fee, dtype: float64
Done.


In [315]:
print('Processing host_response_rate...')

# Vérifier les premières valeurs avant traitement
print("Before processing:")
print(X_clean['host_response_rate'].head())

# Conversion du taux de réponse en format numérique
X_clean['host_response_rate'] = X_clean['host_response_rate'].apply(
    lambda x: float(x.strip('%')) / 100 if isinstance(x, str) and x.endswith('%') else x
)

# Vérifier les premières valeurs après traitement
print("After processing:")
print(X_clean['host_response_rate'].head())

print('Done.')

Processing host_response_rate...
Before processing:
0     96%
1     98%
2     67%
3     NaN
4    100%
Name: host_response_rate, dtype: object
After processing:
0    0.96
1    0.98
2    0.67
3     NaN
4    1.00
Name: host_response_rate, dtype: float64
Done.


In [316]:
print('Processing host_acceptance_rate...')

# Vérifier les premières valeurs avant traitement
print("Before processing:")
print(X_clean['host_acceptance_rate'].head())

# Conversion du taux de réponse en format numérique
X_clean['host_acceptance_rate'] = X_clean['host_acceptance_rate'].apply(
    lambda x: float(x.strip('%')) / 100 if isinstance(x, str) and x.endswith('%') else x
)

# Vérifier les premières valeurs après traitement
print("After processing:")
print(X_clean['host_acceptance_rate'].head())

print('Done.')

Processing host_acceptance_rate...
Before processing:
0    100%
1    100%
2    100%
3     NaN
4     NaN
Name: host_acceptance_rate, dtype: object
After processing:
0    1.0
1    1.0
2    1.0
3    NaN
4    NaN
Name: host_acceptance_rate, dtype: float64
Done.


In [317]:
print('Processing security_deposit...')
# Vérifier les premières valeurs avant traitement
print("Before processing:")
print(X_clean['security_deposit'].head())

# Suppression du symbole "$" et conversion en float
X['security_deposit'] = X['security_deposit'].apply(
    lambda x: float(x.replace('$', '').replace(',', '').strip()) if isinstance(x, str) else x
)

# Remplacement des valeurs manquantes par 0
X['security_deposit'] = X['security_deposit'].fillna(0)

print("After processing:")
print(X['security_deposit'].head())
print('Done.')

Processing security_deposit...
Before processing:
0          NaN
1      $100.00
2    $1,000.00
3          NaN
4      $700.00
Name: security_deposit, dtype: object
After processing:
0       0.0
1     100.0
2    1000.0
3       0.0
4     700.0
Name: security_deposit, dtype: float64
Done.


In [318]:
print('Processing host_has_profile_pic...')
# Vérifier les premières valeurs avant traitement
print("Before processing:")
print(X_clean['host_has_profile_pic'].head())

# Remplacer les valeurs manquantes dans 'host_has_profile_pic' par "f"
X_clean['host_has_profile_pic'] = X_clean['host_has_profile_pic'].fillna("f")

# Vérification des premières valeurs après traitement
print("Processed 'host_has_profile_pic':")
print(X_clean['host_has_profile_pic'].head())

Processing host_has_profile_pic...
Before processing:
0    t
1    t
2    t
3    t
4    t
Name: host_has_profile_pic, dtype: object
Processed 'host_has_profile_pic':
0    t
1    t
2    t
3    t
4    t
Name: host_has_profile_pic, dtype: object


In [319]:
print(f"Valeurs uniques : {X_clean['host_has_profile_pic'].unique()}")

Valeurs uniques : ['t' 'f']


In [320]:
print('Processing host_identity_verified...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['host_identity_verified'].unique()}")

# Remplacer les valeurs manquantes dans 'host_identity_verified' par "f"
X_clean['host_identity_verified'] = X_clean['host_identity_verified'].fillna("f")

print('Processing host_identity_verified...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['host_identity_verified'].unique()}")


Processing host_identity_verified...
Valeurs uniques : ['t' 'f' nan]
Processing host_identity_verified...
Valeurs uniques : ['t' 'f']


In [321]:
print('Processing is_location_exact...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['is_location_exact'].unique()}")


Processing is_location_exact...
Valeurs uniques : ['t' 'f']


In [322]:
print('Processing host_response_time...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['host_response_time'].unique()}")

# Remplacer les valeurs manquantes dans 'host_response_time' par "unknown"
X_clean['host_response_time'] = X_clean['host_response_time'].fillna("unknown")

print('Processing host_response_time...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['host_response_time'].unique()}")

Processing host_response_time...
Valeurs uniques : ['within a few hours' 'within an hour' nan 'within a day'
 'a few days or more']
Processing host_response_time...
Valeurs uniques : ['within a few hours' 'within an hour' 'unknown' 'within a day'
 'a few days or more']


In [323]:
print('Processing host_is_superhost...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['host_is_superhost'].unique()}")

# Remplacer les valeurs manquantes dans 'host_response_time' par "unknown"
X_clean['host_is_superhost'] = X_clean['host_is_superhost'].fillna("unknown")

print('Processing host_is_superhost...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['host_is_superhost'].unique()}")

Processing host_is_superhost...
Valeurs uniques : ['f' 't' nan]
Processing host_is_superhost...
Valeurs uniques : ['f' 't' 'unknown']


In [324]:
print('Processing property_type...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['property_type'].unique()}")

# Remplacer les valeurs manquantes dans 'host_response_time' par "unknown"
X_clean['property_type'] = X_clean['property_type'].fillna("unknown")

print('Processing property_type...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['property_type'].unique()}")

Processing property_type...
Valeurs uniques : ['Apartment' 'House' 'Cabin' 'Condominium' 'Camper/RV' 'Bungalow'
 'Townhouse' 'Loft' 'Boat' 'Bed & Breakfast' 'Other' 'Dorm' 'Treehouse'
 'Yurt' 'Chalet' 'Tent' nan]
Processing property_type...
Valeurs uniques : ['Apartment' 'House' 'Cabin' 'Condominium' 'Camper/RV' 'Bungalow'
 'Townhouse' 'Loft' 'Boat' 'Bed & Breakfast' 'Other' 'Dorm' 'Treehouse'
 'Yurt' 'Chalet' 'Tent' 'unknown']


In [325]:
print('Processing extra_people...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['extra_people'].unique()}")

# Suppression du symbole "$" et conversion en float
X['extra_people'] = X['extra_people'].apply(
    lambda x: float(x.replace('$', '').replace(',', '').strip()) if isinstance(x, str) else x
)

# Remplacement des valeurs manquantes par 0
X['extra_people'] = X['extra_people'].fillna(0)

print("After processing:")
print(X['extra_people'].head())
print('Done.')

Processing extra_people...
Valeurs uniques : ['$5.00' '$0.00' '$25.00' '$15.00' '$30.00' '$10.00' '$20.00' '$50.00'
 '$60.00' '$75.00' '$100.00' '$35.00' '$40.00' '$45.00' '$7.00' '$14.00'
 '$55.00' '$18.00' '$29.00' '$12.00' '$19.00' '$8.00' '$21.00' '$26.00'
 '$17.00' '$44.00' '$9.00' '$80.00' '$200.00' '$28.00' '$85.00' '$250.00'
 '$13.00' '$16.00' '$300.00' '$33.00' '$49.00' '$22.00' '$27.00' '$68.00'
 '$71.00' '$48.00' '$6.00' '$36.00' '$175.00']
After processing:
0     5.0
1     0.0
2    25.0
3     0.0
4    15.0
Name: extra_people, dtype: float64
Done.


In [326]:
print('Processing host_verifications...')
# Vérifier les premières valeurs avant traitement
print(f"Valeurs uniques : {X_clean['host_verifications'].unique()}")

Processing host_verifications...
Valeurs uniques : ["['email', 'phone', 'reviews', 'kba']"
 "['email', 'phone', 'facebook', 'linkedin', 'reviews', 'jumio']"
 "['email', 'phone', 'google', 'reviews', 'jumio']"
 "['email', 'phone', 'facebook', 'reviews', 'jumio']"
 "['email', 'phone', 'facebook', 'reviews', 'kba']"
 "['email', 'phone', 'facebook', 'google', 'linkedin', 'reviews', 'jumio']"
 "['email', 'phone', 'facebook', 'linkedin', 'reviews', 'kba']"
 "['email', 'phone', 'linkedin', 'reviews', 'jumio']"
 "['email', 'reviews', 'kba']"
 "['email', 'phone', 'linkedin', 'reviews', 'kba']"
 "['email', 'phone', 'facebook', 'reviews']" "['phone', 'reviews']"
 "['phone']" "['email', 'phone', 'reviews']"
 "['email', 'phone', 'reviews', 'jumio', 'kba']"
 "['email', 'phone', 'facebook', 'google', 'reviews', 'jumio']"
 "['email', 'phone', 'facebook', 'google', 'linkedin', 'reviews', 'kba']"
 "['email', 'phone', 'reviews', 'jumio']"
 "['email', 'phone', 'google', 'linkedin', 'reviews', 'manual_offl

In [327]:
print('Processing host_verifications...')

# Vérifier les premières valeurs avant traitement
print("Before processing:")
print(X_clean['host_verifications'].head())

# Remplacer les valeurs manquantes par une liste vide "[]"
X_clean['host_verifications'] = X_clean['host_verifications'].fillna("[]")

# Remplacer "None" par une liste vide "[]"
X_clean['host_verifications'] = X_clean['host_verifications'].str.replace("None", "[]")

# Convertir chaque chaîne en liste et calculer le nombre de mots
X_clean['host_verifications'] = X_clean['host_verifications'].apply(
    lambda x: len(ast.literal_eval(x)) if isinstance(x, str) else 0
)

# Vérifier les premières valeurs après traitement
print("After processing:")
print(X_clean['host_verifications'].head())

Processing host_verifications...
Before processing:
0                 ['email', 'phone', 'reviews', 'kba']
1    ['email', 'phone', 'facebook', 'linkedin', 're...
2     ['email', 'phone', 'google', 'reviews', 'jumio']
3    ['email', 'phone', 'facebook', 'reviews', 'jum...
4     ['email', 'phone', 'facebook', 'reviews', 'kba']
Name: host_verifications, dtype: object
After processing:
0    4
1    6
2    5
3    5
4    5
Name: host_verifications, dtype: int64


In [328]:
print('Processing host_since...')

# Vérifier les premières valeurs avant traitement
print("Before processing:")
print(X_clean['host_since'].head())

# Convertir 'host_since' en durée (en jours) depuis aujourd'hui
X_clean['host_since'] = pd.to_datetime(X_clean['host_since'], format="%Y-%m-%d", errors="coerce")
X_clean['host_since'] = (pd.Timestamp("today") - X_clean['host_since']).dt.days

# Vérifier les premières valeurs après traitement
print("After processing:")
print(X_clean['host_since'].head())

Processing host_since...
Before processing:
0    2011-08-11
1    2013-02-21
2    2014-06-12
3    2013-11-06
4    2011-11-29
Name: host_since, dtype: object
After processing:
0    4849.0
1    4289.0
2    3813.0
3    4031.0
4    4739.0
Name: host_since, dtype: float64


Check that all variables that can can be converted are in numerical format, do not forget to check y as well.

In [329]:
X_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 47 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   host_since                        3816 non-null   float64
 1   host_response_time                3818 non-null   object 
 2   host_response_rate                3295 non-null   float64
 3   host_acceptance_rate              3045 non-null   float64
 4   host_is_superhost                 3818 non-null   object 
 5   host_listings_count               3816 non-null   float64
 6   host_total_listings_count         3816 non-null   float64
 7   host_verifications                3818 non-null   int64  
 8   host_has_profile_pic              3818 non-null   object 
 9   host_identity_verified            3818 non-null   object 
 10  neighbourhood_group_cleansed      3818 non-null   object 
 11  latitude                          3818 non-null   float64
 12  longit

In [330]:
Y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 3818 entries, 0 to 3817
Series name: price_log
Non-Null Count  Dtype  
--------------  -----  
3818 non-null   float64
dtypes: float64(1)
memory usage: 30.0 KB


In [331]:
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    Y,
    test_size=0.2,
    random_state=1
)

Separate the variables into two groups, one for the numerical variables and one for the categorical variables. And apply preprocessings to each subgroup of variables properly.

In [332]:
numerical_features = X_clean.select_dtypes(include=["float64", "int64"]).columns.tolist()
categorical_features = X_clean.select_dtypes(exclude=["float64", "int64"]).columns.tolist()

print(f'here you are with numerical variables: \n{numerical_features}')
print(f'\nhere you are with categorical variables: \n{categorical_features}')

here you are with numerical variables: 
['host_since', 'host_response_rate', 'host_acceptance_rate', 'host_listings_count', 'host_total_listings_count', 'host_verifications', 'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'guests_included', 'minimum_nights', 'maximum_nights', 'availability_30', 'availability_60', 'availability_90', 'availability_365', 'number_of_reviews', 'review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'calculated_host_listings_count', 'reviews_per_month']

here you are with categorical variables: 
['host_response_time', 'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified', 'neighbourhood_group_cleansed', 'is_location_exact', 'property_type', 'room_type', 'bed_type', 'security_deposit', 'cleaning_fee', 'extra_people', 'has_availability', 'requires_license', 'instant_bookable', 'cancellation_p

In [333]:
numeric_transformers = Pipeline(steps=[
    ("imputer", KNNImputer()),
    ("scaler", StandardScaler())
])

In [334]:
categorical_transformers = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

In [335]:
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformers, numerical_features),
    ("cat", categorical_transformers, categorical_features)
])

In [336]:
# Preprocessing on train set
print('performing preprocessing on train set')
print(X_train.head())

X_train = preprocessor.fit_transform(X_train)
print('preprocessor on train set done')
X_train[0:5]
print()

# Preprocessing on test set
print('performing preprocessing on test set')
print(X_test.head())

X_test = preprocessor.transform(X_test)
print('preprocessor on test set done')
X_test[0:5]


performing preprocessing on train set
      host_since  host_response_time  host_response_rate  \
2313      3864.0      within an hour                1.00   
3690      3935.0             unknown                 NaN   
2225      4565.0  within a few hours                1.00   
180       4298.0  a few days or more                0.33   
823       3451.0      within an hour                0.90   

      host_acceptance_rate host_is_superhost  host_listings_count  \
2313                   1.0                 f                  1.0   
3690                   NaN                 f                  1.0   
2225                   1.0                 f                  1.0   
180                    1.0                 f                  2.0   
823                    1.0                 f                  1.0   

      host_total_listings_count  host_verifications host_has_profile_pic  \
2313                        1.0                   6                    t   
3690                        1.0   

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 235 stored elements and shape (5, 297)>

What score would you expect for a model that would always predict the average price?

In [339]:
# Calcul de la baseline (prédiction constante égale à la moyenne de y_train)
baseline_value = y_train.mean()  # La baseline est la moyenne de y_train
baseline_train = np.full_like(y_train, baseline_value)  # Tableau de la taille de y_train rempli avec la moyenne
baseline_test = np.full_like(y_test, baseline_value)    # Tableau de la taille de y_test rempli avec la moyenne

# Calcul des scores R^2 pour la baseline
r2_train = r2_score(y_train, baseline_train)
r2_test = r2_score(y_test, baseline_test)

# Affichage des résultats
print(f"R^2 baseline (train): {r2_train:.4f}")
print(f"R^2 baseline (test): {r2_test:.4f}")

R^2 baseline (train): 0.0000
R^2 baseline (test): -0.0001


Train an Adaboost model with all its default parameters, what's the score ?

In [344]:
# Initialisation du modèle AdaBoost
regressor_ada = AdaBoostRegressor(random_state=42)  # Ajout d'une graine pour reproductibilité

# Entraînement du modèle
regressor_ada.fit(X_train, y_train)

# Évaluation des performances
r2_train = regressor_ada.score(X_train, y_train)
r2_test = regressor_ada.score(X_test, y_test)

# Affichage des résultats
print(f"R^2 AdaBoost (train): {r2_train:.4f}")
print(f"R^2 AdaBoost (test): {r2_test:.4f}")

R^2 AdaBoost (train): 0.6394
R^2 AdaBoost (test): 0.6443


Train an XGBoost model with all its default parameters except max_depth=3 (the same as adaboost default), what's the score ?

In [341]:
# Initialisation du modèle XGBoost
regressor_xgb = XGBRegressor(max_depth=3, random_state=42)  # Ajout d'une graine pour reproductibilité

# Entraînement du modèle
regressor_xgb.fit(X_train, y_train)

# Évaluation des performances
r2_train = regressor_xgb.score(X_train, y_train)
r2_test = regressor_xgb.score(X_test, y_test)

# Affichage des résultats
print(f"R^2 XGBoost (train): {r2_train:.4f}")
print(f"R^2 XGBoost (test): {r2_test:.4f}")

R^2 XGBoost (train): 0.8500
R^2 XGBoost (test): 0.7452


Adaboost does not seem to be performing as well as XGBoost, however it does not seem to overfit the data as much, try and improve it by playing with its parameters learning rate & n_estimators thanks to a grid search

In [345]:
# Initialisation du modèle de base et de l'algorithme d'ensemble
regressor_ada = AdaBoostRegressor()

# Define parameter grid for hyperparameter tuning
params = {
    'n_estimators':[50, 100, 150, 200],
    "learning_rate":[1.0, 0.5, 0.1]
}

print(params)

# initialisation la GridSearch avec validation croisée:
gridsearch = GridSearchCV(
    estimator= regressor_ada,
    param_grid= params,
    cv=3,
)
gridsearch.fit(X_train, y_train)

# Affichage des meilleurs paramètres et performances
print(f'Meilleurs hyperparamètres: {gridsearch.best_params_}')
print(f'Meilleure cross-validation accuracy: {gridsearch.best_score_:.4f}')

# Evaluation des meilleures performances sur les ensembles d'entrainement et de test
train_accuracy = gridsearch.score(X_train, y_train)
test_accuracy = gridsearch.score(X_test, y_test)
print(f'Meilleur accuracy score sur le train set: {train_accuracy:.4f}')
print(f'Meilleur accuracy score sur le test set: {test_accuracy:.4f}')

{'n_estimators': [50, 100, 150, 200], 'learning_rate': [1.0, 0.5, 0.1]}
Meilleurs hyperparamètres: {'learning_rate': 1.0, 'n_estimators': 100}
Meilleure cross-validation accuracy: 0.6146
Meilleur accuracy score sur le train set: 0.6425
Meilleur accuracy score sur le test set: 0.6543


We don't seem to be able to reach XGBoost performance using Adaboost in this case

Let's now run a sanity check to make sure that Adaboost and XGBoost actually improved the performance of their base models which are regression trees in this case. Train a regression tree model with max_depth = 3 (the default for Adaboost)

In [347]:
# Initialisation du modèle de base et de l'algorithme d'ensemble
tree_regressor = DecisionTreeRegressor(max_depth=3)

# Train the regression model
tree_regressor.fit(X_train, y_train)

# Evaluation des meilleures performances sur les ensembles d'entrainement et de test
train_accuracy = tree_regressor.score(X_train, y_train)
test_accuracy = tree_regressor.score(X_test, y_test)
print(f'Meilleur accuracy score sur le train set: {train_accuracy:.4f}')
print(f'Meilleur accuracy score sur le test set: {test_accuracy:.4f}')

Meilleur accuracy score sur le train set: 0.5856
Meilleur accuracy score sur le test set: 0.6098


We conclude here that both boosting algorithms have fulfilled their missions, they both were able to improve performance on the test set compared to the base model! However XGBoost seems to have superior performance in this case despite higher levels of over fitting.

Train separately three independent models, and then implement a voting. Do you get better results?

In [349]:
# Initialisation du modèle Ridge
linreg = Ridge(random_state=42)  # Ajout de `random_state` pour reproductibilité

# Grille d'hyperparamètres
param_grid = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]
}

# Initialisation de GridSearchCV
linreg_opt = GridSearchCV(
    estimator=linreg,
    param_grid=param_grid,
    cv=3,  # Nombre de plis pour la validation croisée
    scoring='r2',  # Score basé sur R2
    n_jobs=-1,  # Utilise tous les cœurs disponibles pour accélérer la recherche
    verbose=1  # Affiche les détails du processus
)

# Recherche par grille
print("Grid search in progress...")
linreg_opt.fit(X_train, y_train)
print("Grid search completed.")

# Résultats
print(f"Best hyperparameters: {linreg_opt.best_params_}")
print(f"Best cross-validated R2: {linreg_opt.best_score_:.4f}")
print()
print(f"R2 on training set: {linreg_opt.score(X_train, y_train):.4f}")
print(f"R2 on test set: {linreg_opt.score(X_test, y_test):.4f}")

Grid search in progress...
Fitting 3 folds for each of 9 candidates, totalling 27 fits
Grid search completed.
Best hyperparameters: {'alpha': 50.0}
Best cross-validated R2: 0.6810

R2 on training set: 0.7158
R2 on test set: 0.7023


In [350]:
# Initialisation du modèle Decision Tree Regressor
decision_tree_r = DecisionTreeRegressor()

# Grille d'hyperparamètres
param_grid = {
    'max_depth': [1, 2, 3], 
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4]
}

# Initialisation de GridSearchCV
decision_tree_r_opt = GridSearchCV(
    estimator=decision_tree_r,
    param_grid=param_grid,
    cv=3,  # Nombre de plis pour la validation croisée
)

# Recherche par grille
print("Grid search in progress...")
decision_tree_r_opt.fit(X_train, y_train)
print("Grid search completed.")

# Résultats
print(f"Best hyperparameters: {decision_tree_r_opt.best_params_}")
print(f"Best cross-validated R2: {decision_tree_r_opt.best_score_:.4f}")
print()
print(f"R2 on training set: {decision_tree_r_opt.score(X_train, y_train):.4f}")
print(f"R2 on test set: {decision_tree_r_opt.score(X_test, y_test):.4f}")

Grid search in progress...
Grid search completed.
Best hyperparameters: {'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best cross-validated R2: 0.5749

R2 on training set: 0.5856
R2 on test set: 0.6098


In [None]:
# Initialisation du modèle SVM
svm = SVR()

# Grille d'hyperparamètres
param_grid = {
    'max_depth': [1, 2, 3], 
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4]
}

# Initialisation de GridSearchCV
decision_tree_r_opt = GridSearchCV(
    estimator=decision_tree_r,
    param_grid=param_grid,
    cv=3,  # Nombre de plis pour la validation croisée
)

# Recherche par grille
print("Grid search in progress...")
decision_tree_r_opt.fit(X_train, y_train)
print("Grid search completed.")

# Résultats
print(f"Best hyperparameters: {decision_tree_r_opt.best_params_}")
print(f"Best cross-validated R2: {decision_tree_r_opt.best_score_:.4f}")
print()
print(f"R2 on training set: {decision_tree_r_opt.score(X_train, y_train):.4f}")
print(f"R2 on test set: {decision_tree_r_opt.score(X_test, y_test):.4f}")