In [None]:
# Loading Airbnb NY Dataset

import numpy as np
import pandas as pd
# Column groups used by the listings below. Names must match the CSV header
# exactly (pandas column lookup is case-sensitive).
excluding_list = ['price', 'id', 'latitude', 'longitude', 'host_id', 'last_review', 
                  'name', 'host_name'] #A 
categorical = ['neighbourhood_group', 'neighbourhood', 'room_type'] #B
# Subset of the categorical columns with few distinct levels; this is the list
# the listings one-hot encode ('neighbourhood' is left out because it has many levels).
low_card_categorical = ['neighbourhood_group', 'room_type']
continuous = ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 
              'calculated_host_listings_count'] #C

url = "https://raw.githubusercontent.com/lmassaron/tabular_datasets/master/AB_NYC_2019.csv"
data = pd.read_csv(url)

# Fail fast on a misspelled column name instead of deep inside a pipeline fit.
_missing_columns = [name for name in ['price'] + categorical + continuous
                    if name not in data.columns]
assert not _missing_columns, f"columns missing from dataset: {_missing_columns}"

# 1 when the listing price is strictly above the median price, 0 otherwise.
target_median = (data["price"] > data["price"].median()).astype(int) #D

#A list of column names to be excluded from the analysis
#B list of names of columns that likely represent categorical variables in the dataset
#C list of names of columns that represent continuous numerical variables in the dataset
#D a binary target flagging listings priced above the median price (roughly balanced classes)

In [None]:
# Listing B.1 K-nearest neighbors classifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import KFold, cross_validate

# Categories unseen during fit are encoded as all-zero rows instead of raising.
categorical_onehot_encoding = OneHotEncoder(handle_unknown='ignore')

# Numeric branch: KBinsDiscretizer does not accept NaN, so missing values are
# filled with 0 first (the same imputation Listing B.2 applies before scaling),
# then each feature is cut into quantile bins that are one-hot encoded.
numeric_discretizing = Pipeline([
    ("imputation", SimpleImputer(strategy="constant", fill_value=0)),
    ("binning", KBinsDiscretizer(n_bins=10,
                                 encode="onehot-dense",
                                 strategy="quantile"))
    ])

accuracy = make_scorer(accuracy_score) #A
cv = KFold(5, shuffle=True, random_state=0) #B
model = KNeighborsClassifier(n_neighbors=30, 
                             weights="uniform",
                             algorithm="auto", 
                             n_jobs=-1) #C

# sparse_threshold=0.0 forces a dense output, which the PCA step requires.
column_transform = ColumnTransformer(
    [('categories', categorical_onehot_encoding, low_card_categorical),
     ('numeric', numeric_discretizing, continuous)],
    remainder='drop',
    verbose_feature_names_out=False,
    sparse_threshold=0.0) #D

model_pipeline = Pipeline(
    [('processing', column_transform),
     ('pca', PCA(n_components="mle")),
     ('modeling', model)]) #E

# The whole pipeline is refit inside every fold, so preprocessing and PCA
# never see the held-out rows.
cv_scores = cross_validate(estimator=model_pipeline, 
                           X=data, 
                           y=target_median,
                           scoring=accuracy,
                           cv=cv, 
                           return_train_score=True,
                           return_estimator=True) #F

# Summarize held-out accuracy and the average fit / scoring time across folds.
mean_cv = np.mean(cv_scores['test_score'])
std_cv = np.std(cv_scores['test_score'])
fit_time = np.mean(cv_scores['fit_time'])
score_time = np.mean(cv_scores['score_time'])
print(f"{mean_cv:0.3f} ({std_cv:0.3f})", 
      f"fit: {fit_time:0.2f} secs pred: {score_time:0.2f} secs") #G

#A creating a scoring function using the accuracy_score metric
#B creating a five-fold cross-validation iterator with shuffling and a fixed random state
#C creating an instance of the KNeighborsClassifier with specified hyperparameters
#D defining a ColumnTransformer to preprocess features, applying one-hot encoding to categorical features with low cardinality and discretization to numerical features
#E creating a pipeline that sequentially applies the column transformation, performs PCA dimensionality reduction, and then fits the k-nearest neighbors model to the data
#F performing cross-validation on the data using the defined pipeline, with accuracy scoring
#G printing the mean and standard deviation of the test scores, followed by the average fit and prediction times

In [None]:
# Listing B.2 Support vector classifier

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values with the constant 0, then standardize.
numeric_standardization = Pipeline(steps=[
    ("imputation", SimpleImputer(fill_value=0, strategy="constant")),
    ("standardizing", StandardScaler()),
])

accuracy = make_scorer(accuracy_score) #A
cv = KFold(n_splits=5, random_state=0, shuffle=True) #B
model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=False) #C

# One-hot encode the low-cardinality categoricals, standardize the continuous
# columns, drop every other column, and always return a dense array.
column_transform = ColumnTransformer(
    transformers=[
        ('categories', categorical_onehot_encoding, low_card_categorical),
        ('numeric', numeric_standardization, continuous),
    ],
    remainder='drop',
    sparse_threshold=0.0,
    verbose_feature_names_out=False,
) #D

model_pipeline = Pipeline(steps=[
    ('processing', column_transform),
    ('modeling', model),
]) #E

cv_scores = cross_validate(
    model_pipeline,
    data,
    target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
) #F

# Collapse the per-fold arrays returned by cross_validate into summary figures.
mean_cv = cv_scores['test_score'].mean()
std_cv = cv_scores['test_score'].std()
fit_time = cv_scores['fit_time'].mean()
score_time = cv_scores['score_time'].mean()
print(
    f"{mean_cv:0.3f} ({std_cv:0.3f})",
    f"fit: {fit_time:0.2f} secs pred: {score_time:0.2f} secs",
) #G

#A creating a scoring function using the accuracy_score metric
#B creating a five-fold cross-validation iterator with shuffling and a fixed random state
#C creating an instance of the Support Vector Classifier with specified hyperparameters
#D defining a ColumnTransformer to preprocess features, applying one-hot encoding to categorical features with low cardinality and standardization to numerical features
#E creating a pipeline that sequentially applies the column transformation and the model to the data
#F performing cross-validation on the data using the defined pipeline, with accuracy scoring
#G printing the mean and standard deviation of the test scores, followed by the average fit and prediction times

In [None]:
# Listing B.3 Support vector classifier from RAPIDS cuML 

from cuml.svm import SVC
from sklearn.metrics import accuracy_score

accuracy = make_scorer(accuracy_score) #A
cv = KFold(n_splits=5, random_state=0, shuffle=True) #B
# SVC here is the class imported from cuml.svm at the top of this cell.
model = SVC(kernel='rbf', C=1.0, gamma='scale', probability=False) #C

# Same preprocessing as the scikit-learn SVC listing: one-hot encoding for the
# low-cardinality categoricals, imputation + standardization for the numerics.
column_transform = ColumnTransformer(
    transformers=[
        ('categories', categorical_onehot_encoding, low_card_categorical),
        ('numeric', numeric_standardization, continuous),
    ],
    remainder='drop',
    sparse_threshold=0.0,
    verbose_feature_names_out=False,
) #D

model_pipeline = Pipeline(steps=[
    ('processing', column_transform),
    ('modeling', model),
]) #E

cv_scores = cross_validate(
    model_pipeline,
    data,
    target_median,
    scoring=accuracy,
    cv=cv,
    return_train_score=True,
    return_estimator=True,
) #F

# Collapse the per-fold arrays returned by cross_validate into summary figures.
mean_cv = cv_scores['test_score'].mean()
std_cv = cv_scores['test_score'].std()
fit_time = cv_scores['fit_time'].mean()
score_time = cv_scores['score_time'].mean()
print(
    f"{mean_cv:0.3f} ({std_cv:0.3f})",
    f"fit: {fit_time:0.2f} secs pred: {score_time:0.2f} secs",
) #G

#A creating a scoring function using the accuracy_score metric
#B creating a 5-fold cross-validation iterator with shuffling and a fixed random state
#C creating an instance of a Support Vector Classifier from the GPU-accelerated cuML library with specified hyperparameters
#D defining a ColumnTransformer to preprocess features, applying one-hot encoding to categorical features with low cardinality and standardization to numerical features
#E creating a pipeline that sequentially applies the column transformation and the model to the data
#F performing cross-validation on the data using the defined pipeline, with accuracy scoring
#G printing the mean and standard deviation of the test scores, followed by the average fit and prediction times