# AIRBNB - PIPELINE

**Our unit of analysis is an AIRBNB LISTING**

We will see how we can transform the input variables. We won't do any predictions in this notebook!

# Setup

In [1]:
# Common imports: NumPy for numerical arrays, pandas for tabular data
import numpy as np
import pandas as pd

# Fix the seed of NumPy's global random number generator so that steps
# drawing from it give the same result on every Restart & Run All
np.random.seed(42)


# Get the data

In [2]:
# Load the Airbnb listings (one row per listing). The candidate target
# columns are "price", "price_gte_150" and "price_category"; nothing is
# predicted in this notebook, the input variables are only transformed.

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_$75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_$75-$150


In [3]:
# Dimensions of the full data set as (number of rows, number of columns)

airbnb.shape

(3555, 23)

# Split the data into train and test

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings as the test set. random_state is passed
# explicitly so the split is reproducible by itself, without depending on the
# state of NumPy's global random number generator at the time this cell runs.
train, test = train_test_split(airbnb, test_size=0.3, random_state=42)

### Be careful: we haven't separated the target column yet

## Check the missing values

In [5]:
# Number of missing values in each column of the training split
train.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          2
room_type                              0
accommodates                           0
bathrooms                             10
bedrooms                               8
beds                                   6
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 556
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

In [6]:
# Number of missing values in each column of the test split
test.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          1
room_type                              0
accommodates                           0
bathrooms                              4
bedrooms                               2
beds                                   3
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 244
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

# Data Prep

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

## Separate the POTENTIAL target variables (we don't want to transform them)

In [8]:
# The three candidate target columns are listed once, so the target frames and
# the input frames below can never get out of sync with each other.
TARGET_COLUMNS = ['price', 'price_gte_150', 'price_category']

# Targets: kept as they are (they are not passed through any transformer)
train_targets = train[TARGET_COLUMNS]
test_targets = test[TARGET_COLUMNS]

# Inputs: every column except the candidate targets
train_inputs = train.drop(columns=TARGET_COLUMNS)
test_inputs = test.drop(columns=TARGET_COLUMNS)

##  Identify the numerical and categorical columns

### Option 1: Manually

In [9]:
# Columns holding numeric values (imputed and scaled later on)
numeric_columns = [
    # location
    'latitude',
    'longitude',
    # size of the listing
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'Number of amenities',
    # booking terms
    'guests_included',
    'price_per_extra_person',
    'minimum_nights',
    # review history
    'number_of_reviews',
    'number_days_btw_first_last_review',
    'review_scores_rating',
]

# Columns holding a two-valued flag
binary_columns = [
    'host_is_superhost',
    'host_identity_verified',
]

# Columns holding text categories (one-hot encoded later on)
categorical_columns = [
    'neighbourhood_cleansed',
    'property_type',
    'room_type',
    'bed_type',
    'cancellation_policy',
]

### Option 2: Programmatically

In [10]:
# Show the names of the binary columns
binary_columns

['host_is_superhost', 'host_identity_verified']

In [11]:
# Show the names of the numeric columns
numeric_columns

['latitude',
 'longitude',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'Number of amenities',
 'guests_included',
 'price_per_extra_person',
 'minimum_nights',
 'number_of_reviews',
 'number_days_btw_first_last_review',
 'review_scores_rating']

In [12]:
# Show the names of the categorical columns
categorical_columns

['neighbourhood_cleansed',
 'property_type',
 'room_type',
 'bed_type',
 'cancellation_policy']

# Pipeline

In [23]:
# Tip: type `Pipeline?` in a scratch cell to read the Pipeline docstring.
# The help query is kept out of the notebook flow so that Run All stays clean.


In [13]:
# Numeric columns: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [14]:
# Categorical columns: fill missing values with the literal 'unknown', then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [15]:
# Binary columns: only fill missing values, using the most frequent value.
binary_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ]
)

In [30]:
# Send each group of columns through its own transformer pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns),
        ('binary', binary_transformer, binary_columns),
    ],
    remainder='passthrough',
)

#remainder='passthrough' is optional (it keeps any columns not listed above as they are). You don't have to use it.

In [40]:
from sklearn import set_config  ## optional self practice

# Ask scikit-learn to show estimators as a diagram
set_config(display="diagram")
# Show the preprocessor defined earlier in the notebook
display(preprocessor)


In [41]:
# Pipeline expects its steps as a list of (name, estimator) tuples, so the
# preprocessor is wrapped in a single named step instead of being passed directly.
pipe = Pipeline(steps=[('preprocessor', preprocessor)])  ## self practice
pipe

# Transform: fit_transform() for TRAIN

In [17]:
# Fit the preprocessor on the training inputs and transform them in one step
train_x = preprocessor.fit_transform(train_inputs)

train_x

array([[ 0.82254842,  0.69215829,  0.54753414, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.55146572,  0.15729058,  0.54753414, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.07311286, -1.97951247, -0.59100739, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.61093878, -0.07631528,  3.96315871, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.17819153, -0.94575177, -1.16027815, ...,  0.        ,
         0.        ,  0.        ],
       [-0.33618088,  1.03587419, -0.59100739, ...,  0.        ,
         0.        ,  1.        ]])

In [18]:
# (rows, columns) of the transformed training inputs
train_x.shape

(2488, 66)

# Transform: transform() for TEST

In [19]:
# Transform the test data with the preprocessor that was fitted on the
# training inputs: transform() is called here, not fit_transform()
test_x = preprocessor.transform(test_inputs)

test_x

array([[-1.21269719, -1.20324989,  0.54753414, ...,  0.        ,
         0.        ,  1.        ],
       [-2.86419979, -2.67831359, -0.59100739, ...,  0.        ,
         0.        ,  0.        ],
       [-0.11443035,  1.26295963, -0.59100739, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [ 0.47803436, -1.63486781, -0.59100739, ...,  0.        ,
         1.        ,  1.        ],
       [ 0.59928397,  0.34795157,  2.82461719, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.19953968,  0.22845713, -0.59100739, ...,  0.        ,
         0.        ,  1.        ]])

In [20]:
# (rows, columns) of the transformed test inputs
test_x.shape

(1067, 66)

# Feature Names (optional)

You can get the feature names if you are running the new version of Scikit-Learn.

First, check your version of scikit-learn. If it is below 1.1.1, you might have to update it to the latest version as described below:

In [21]:
# Check if your scikit-learn version is below 1.1.1

import sklearn

# Version string of the installed scikit-learn
sklearn.__version__

'1.2.1'

In [22]:
# Now we can retrieve the column names of the transformed data.
# Each name is prefixed with the name given to its transformer in the
# ColumnTransformer ('num', 'cat' or 'binary') followed by two underscores.

preprocessor.get_feature_names_out()

array(['num__latitude', 'num__longitude', 'num__accommodates',
       'num__bathrooms', 'num__bedrooms', 'num__beds',
       'num__Number of amenities', 'num__guests_included',
       'num__price_per_extra_person', 'num__minimum_nights',
       'num__number_of_reviews', 'num__number_days_btw_first_last_review',
       'num__review_scores_rating', 'cat__neighbourhood_cleansed_Allston',
       'cat__neighbourhood_cleansed_Back Bay',
       'cat__neighbourhood_cleansed_Bay Village',
       'cat__neighbourhood_cleansed_Beacon Hill',
       'cat__neighbourhood_cleansed_Brighton',
       'cat__neighbourhood_cleansed_Charlestown',
       'cat__neighbourhood_cleansed_Chinatown',
       'cat__neighbourhood_cleansed_Dorchester',
       'cat__neighbourhood_cleansed_Downtown',
       'cat__neighbourhood_cleansed_East Boston',
       'cat__neighbourhood_cleansed_Fenway',
       'cat__neighbourhood_cleansed_Hyde Park',
       'cat__neighbourhood_cleansed_Jamaica Plain',
       'cat__neighbourhood_cl

In [23]:
# If you wanted, you can recreate the data set in the processed format:
# the transformed array becomes the values and the feature names become the
# column labels.
# NOTE: no index is passed, so the new frame gets a fresh 0..n-1 index rather
# than the row labels of train_inputs.

airbnb_train_processed = pd.DataFrame(train_x, columns=preprocessor.get_feature_names_out())

airbnb_train_processed

Unnamed: 0,num__latitude,num__longitude,num__accommodates,num__bathrooms,num__bedrooms,num__beds,num__Number of amenities,num__guests_included,num__price_per_extra_person,num__minimum_nights,...,cat__bed_type_Couch,cat__bed_type_Futon,cat__bed_type_Pull-out Sofa,cat__bed_type_Real Bed,cat__cancellation_policy_flexible,cat__cancellation_policy_moderate,cat__cancellation_policy_strict,cat__cancellation_policy_super_strict_30,binary__host_is_superhost,binary__host_identity_verified
0,0.822548,0.692158,0.547534,1.606727,1.014067,0.421337,1.048032,-0.403158,-0.559031,-0.140022,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
1,0.551466,0.157291,0.547534,-0.432312,-0.327240,-0.610441,0.636851,-0.403158,-0.559031,-0.140022,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.073113,-1.979512,-0.591007,0.587207,-0.327240,-0.610441,-1.624644,-0.403158,-0.559031,0.282250,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.183786,0.305172,-1.160278,-0.432312,-0.327240,-0.610441,0.020080,-0.403158,0.201644,-0.280779,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-1.244238,-0.580193,0.547534,-0.432312,1.014067,0.421337,-0.185511,0.530787,1.215877,-0.140022,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2483,0.246338,0.313836,-1.160278,-0.432312,-0.327240,-0.610441,-2.241416,-0.403158,0.049509,-0.280779,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2484,0.373761,0.196639,0.547534,-0.432312,-0.327240,0.421337,0.225670,0.530787,1.469436,0.282250,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2485,-0.610939,-0.076315,3.963159,0.587207,2.355374,2.484891,0.636851,6.134455,-0.559031,-0.280779,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2486,1.178192,-0.945752,-1.160278,-0.432312,-0.327240,-0.610441,0.020080,-0.403158,1.469436,-0.280779,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
