# Tutorial 1 - AIRBNB - CORE STEPS

**Our unit of analysis is an AIRBNB LISTING**

We will see how we can transform the input variables. We won't do any predictions in this notebook!

# Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd

# Seed NumPy's global random number generator so that steps relying on it
# give the same results each time the notebook is run from the top.
np.random.seed(42)


# Get the data

In [40]:
# Load the Airbnb listings data set. The potential target columns
# ("price", "price_gte_150", "price_category") are separated from the inputs further below.

airbnb = pd.read_csv("airbnb.csv")
airbnb.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
0,0,0,Roslindale,42.282619,-71.133068,House,Entire home/apt,4,1.5,2.0,...,1,0,2,0,0,,moderate,250,1,gte_226
1,0,1,Roslindale,42.286241,-71.134374,Apartment,Private room,2,1.0,1.0,...,0,0,2,36,804,94.0,moderate,65,0,lte_$75
2,1,1,Roslindale,42.292438,-71.135765,Apartment,Private room,2,1.0,1.0,...,1,20,3,41,2574,98.0,moderate,65,0,lte_$75
3,0,0,Roslindale,42.281106,-71.121021,House,Private room,4,1.0,1.0,...,2,25,1,1,0,100.0,moderate,75,0,lte_$75
4,1,1,Roslindale,42.284512,-71.136258,House,Private room,2,1.5,1.0,...,1,0,2,29,380,99.0,flexible,79,0,btw_$75-$150


In [41]:
# Find the total number of rows and columns

airbnb.shape

(3555, 23)

In [42]:
# Count the missing values in each column

airbnb.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          3
room_type                              0
accommodates                           0
bathrooms                             14
bedrooms                              10
beds                                   9
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 800
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

### Should we remove these rows or not???

In [43]:
# If we want to remove them, use the following code (before splitting the data):

# airbnb.dropna(axis=0, inplace=True)

# Split data (train/test)

In [44]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the listings as the test set. Passing random_state fixes the
# shuffle, so re-running this cell on its own always produces the same split
# (without it, the split depends on how much of NumPy's global random state
# has already been consumed by earlier cell executions).
train, test = train_test_split(airbnb, test_size=0.3, random_state=42)

In [45]:
# Size of the training split as (rows, columns)
train.shape

(2488, 23)

In [46]:
# Size of the test split as (rows, columns)
test.shape

(1067, 23)

In [47]:
# Preview the first rows of the training split
train.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
2085,0,1,Downtown,42.356167,-71.061084,Apartment,Private room,2,1.0,1.0,...,1,0,1,0,0,,flexible,400,1,gte_226
3544,0,0,West Roxbury,42.303305,-71.16194,Townhouse,Private room,1,1.0,1.0,...,1,35,1,1,0,80.0,moderate,68,0,lte_$75
2407,0,1,Fenway,42.351109,-71.089654,Apartment,Entire home/apt,7,1.0,2.0,...,3,50,1,38,314,88.0,strict,115,0,btw_$75-$150
2988,0,1,Dorchester,42.285509,-71.068632,Apartment,Private room,2,1.0,1.0,...,1,10,1,4,65,80.0,flexible,80,0,btw_$75-$150
936,0,1,South End,42.345767,-71.063189,Apartment,Entire home/apt,4,1.0,1.0,...,2,10,3,20,367,90.0,strict,189,1,btw_$151-$225


In [48]:
# Preview the first rows of the test split
test.head()

Unnamed: 0,host_is_superhost,host_identity_verified,neighbourhood_cleansed,latitude,longitude,property_type,room_type,accommodates,bathrooms,bedrooms,...,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,cancellation_policy,price,price_gte_150,price_category
960,0,1,South End,42.340245,-71.076636,Apartment,Private room,2,1.0,1.0,...,1,0,2,7,168,97.0,strict,125,0,btw_$75-$150
132,0,1,Jamaica Plain,42.30905,-71.112539,Apartment,Entire home/apt,4,1.0,2.0,...,2,10,2,32,715,98.0,moderate,200,1,btw_$151-$225
2430,0,0,Fenway,42.35087,-71.101978,Bed & Breakfast,Private room,2,1.0,1.0,...,2,25,2,0,0,,strict,175,1,btw_$151-$225
2229,0,1,Fenway,42.344332,-71.100976,Apartment,Entire home/apt,3,1.0,1.0,...,1,0,3,3,372,100.0,strict,239,1,gte_226
646,0,1,North End,42.364244,-71.056175,Apartment,Entire home/apt,3,1.0,2.0,...,2,10,3,10,55,98.0,strict,182,1,btw_$151-$225


# Prepare the data

In [49]:
# Descriptive statistics of the numerical variables (training split only)

train.describe()

Unnamed: 0,host_is_superhost,host_identity_verified,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating,price,price_gte_150
count,2488.0,2488.0,2488.0,2488.0,2488.0,2476.0,2479.0,2482.0,2488.0,2488.0,2488.0,2488.0,2488.0,2488.0,1932.0,2488.0,2488.0
mean,0.118569,0.726688,42.339925,-71.084874,3.034164,1.213651,1.242436,1.599517,14.883039,1.432476,10.865756,3.258441,19.729502,289.036576,92.165114,165.615354,0.49799
std,0.323346,0.445749,0.024073,0.031694,1.751568,0.479469,0.742815,1.0026,4.824835,1.070993,18.631348,9.597254,37.230063,421.217226,9.20806,103.222584,0.500096
min,0.0,0.0,42.235942,-71.171789,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,20.0,10.0,0.0
25%,0.0,0.0,42.329549,-71.105454,2.0,1.0,1.0,1.0,12.0,1.0,0.0,1.0,1.0,0.0,89.0,85.0,0.0
50%,0.0,1.0,42.345132,-71.078509,2.0,1.0,1.0,1.0,15.0,1.0,0.0,2.0,5.0,93.5,95.0,149.0,0.0
75%,0.0,1.0,42.354418,-71.061993,4.0,1.0,1.5,2.0,18.0,2.0,20.0,3.0,21.25,420.25,99.0,218.0,1.0
max,1.0,1.0,42.389982,-71.0001,16.0,6.0,5.0,16.0,30.0,14.0,200.0,300.0,404.0,2680.0,100.0,650.0,1.0


In [50]:
# Total missing values in each column of the training split

train.isna().sum()

host_is_superhost                      0
host_identity_verified                 0
neighbourhood_cleansed                 0
latitude                               0
longitude                              0
property_type                          2
room_type                              0
accommodates                           0
bathrooms                             12
bedrooms                               9
beds                                   6
bed_type                               0
Number of amenities                    0
guests_included                        0
price_per_extra_person                 0
minimum_nights                         0
number_of_reviews                      0
number_days_btw_first_last_review      0
review_scores_rating                 556
cancellation_policy                    0
price                                  0
price_gte_150                          0
price_category                         0
dtype: int64

## Separate the POTENTIAL target columns. Separate numerical and categorical inputs

In [51]:
# Column groups: potential targets, numeric inputs, binary flags and text categories.
target_names = ['price', 'price_gte_150', 'price_category']

numeric_names = [
    'latitude', 'longitude', 'accommodates',
    'bathrooms', 'bedrooms', 'beds', 'Number of amenities',
    'guests_included', 'price_per_extra_person', 'minimum_nights',
    'number_of_reviews', 'number_days_btw_first_last_review',
    'review_scores_rating',
]

binary_names = ['host_is_superhost', 'host_identity_verified']

categorical_names = [
    'neighbourhood_cleansed', 'property_type',
    'room_type', 'bed_type', 'cancellation_policy',
]

# Slice the training split into one DataFrame per column group.
train_targets = train[target_names]
train_numeric_columns = train[numeric_names]
train_binary_columns = train[binary_names]
train_categorical_columns = train[categorical_names]

In [52]:
# Preview the numeric inputs of the training split
train_numeric_columns.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
2085,42.356167,-71.061084,2,1.0,1.0,1.0,16,1,0,1,0,0,
3544,42.303305,-71.16194,1,1.0,1.0,1.0,14,1,35,1,1,0,80.0
2407,42.351109,-71.089654,7,1.0,2.0,3.0,16,3,50,1,38,314,88.0
2988,42.285509,-71.068632,2,1.0,1.0,1.0,18,1,10,1,4,65,80.0
936,42.345767,-71.063189,4,1.0,1.0,2.0,24,2,10,3,20,367,90.0


In [53]:
# Preview the binary inputs of the training split
train_binary_columns.head()

Unnamed: 0,host_is_superhost,host_identity_verified
2085,0,1
3544,0,0
2407,0,1
2988,0,1
936,0,1


In [54]:
# Preview the categorical inputs of the training split
train_categorical_columns.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
2085,Downtown,Apartment,Private room,Real Bed,flexible
3544,West Roxbury,Townhouse,Private room,Real Bed,moderate
2407,Fenway,Apartment,Entire home/apt,Real Bed,strict
2988,Dorchester,Apartment,Private room,Real Bed,flexible
936,South End,Apartment,Entire home/apt,Real Bed,strict


## Process the numerical variables

### Imputation 

In [55]:
from sklearn.impute import SimpleImputer

# Imputer that fills missing numeric values using the "median" strategy
imputer = SimpleImputer(strategy="median")

In [56]:
# Fit the imputer on the training data and fill in its missing values (the result is a NumPy array)
train_numeric_columns_imputed = imputer.fit_transform(train_numeric_columns)

In [57]:
# Inspect the imputed training values
train_numeric_columns_imputed

array([[ 42.35616708, -71.06108437,   2.        , ...,   0.        ,
          0.        ,  95.        ],
       [ 42.30330536, -71.16193984,   1.        , ...,   1.        ,
          0.        ,  80.        ],
       [ 42.35110882, -71.08965359,   7.        , ...,  38.        ,
        314.        ,  88.        ],
       ...,
       [ 42.36042354, -71.06149584,   2.        , ...,   0.        ,
          0.        ,  95.        ],
       [ 42.33064668, -71.10584252,   2.        , ...,  23.        ,
        400.        ,  93.        ],
       [ 42.35172558, -71.12104524,   1.        , ...,   0.        ,
          0.        ,  95.        ]])

### Standardize the values


In [58]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit the scaler on the imputed training data and standardize it in one step
train_numeric_columns_std = scaler.fit_transform(train_numeric_columns_imputed)

train_numeric_columns_std

array([[ 0.67480896,  0.75076516, -0.59054037, ..., -0.53004121,
        -0.68633157,  0.2685372 ],
       [-1.52148303, -2.43208662, -1.16157203, ..., -0.5031758 ,
        -0.68633157, -1.56126167],
       [ 0.46464902, -0.15083782,  2.26461791, ...,  0.49084449,
         0.05927688, -0.58536894],
       ...,
       [ 0.85165582,  0.73777977, -0.59054037, ..., -0.53004121,
        -0.68633157,  0.2685372 ],
       [-0.38550936, -0.66173688, -0.59054037, ...,  0.08786329,
         0.26348811,  0.02456402],
       [ 0.49027409, -1.14151259, -1.16157203, ..., -0.53004121,
        -0.68633157,  0.2685372 ]])

### Convert back to Pandas

In [59]:
# Wrap the standardized array in a DataFrame again, restoring the numeric
# column names that were lost when scikit-learn returned a plain array.
train_numeric_columns_std_df = (
    pd.DataFrame(data=train_numeric_columns_std,
                 columns=train_numeric_columns.columns)
    .reset_index(drop=True)
)

train_numeric_columns_std_df.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
0,0.674809,0.750765,-0.59054,-0.444401,-0.325787,-0.597103,0.231549,-0.403889,-0.583315,-0.235369,-0.530041,-0.686332,0.268537
1,-1.521483,-2.432087,-1.161572,-0.444401,-0.325787,-0.597103,-0.183056,-0.403889,1.295617,-0.235369,-0.503176,-0.686332,-1.561262
2,0.464649,-0.150838,2.264618,-0.444401,1.022897,1.399661,0.231549,1.463911,2.100874,-0.235369,0.490844,0.059277,-0.585369
3,-2.260874,0.512573,-0.59054,-0.444401,-0.325787,-0.597103,0.646154,-0.403889,-0.046477,-0.235369,-0.42258,-0.531986,-1.561262
4,0.242697,0.68434,0.551523,-0.444401,-0.325787,0.401279,1.88997,0.530011,-0.046477,-0.026934,0.007267,0.185128,-0.341396


In [60]:
# Confirm that no missing values remain in the processed numeric columns
train_numeric_columns_std_df.isna().sum()

latitude                             0
longitude                            0
accommodates                         0
bathrooms                            0
bedrooms                             0
beds                                 0
Number of amenities                  0
guests_included                      0
price_per_extra_person               0
minimum_nights                       0
number_of_reviews                    0
number_days_btw_first_last_review    0
review_scores_rating                 0
dtype: int64

## Process the categorical variables

In [78]:
# Count the missing values in each categorical column (isna() and isnull() are aliases, so either works)
train_categorical_columns.isna().sum()

neighbourhood_cleansed    0
property_type             2
room_type                 0
bed_type                  0
cancellation_policy       0
dtype: int64

In [79]:
# Frequency of each property type in the training split
train_categorical_columns['property_type'].value_counts()

Apartment          1821
House               390
Condominium         157
Townhouse            34
Loft                 31
Bed & Breakfast      25
Other                11
Boat                  8
Entire Floor          3
Dorm                  2
Villa                 2
Camper/RV             1
Guesthouse            1
Name: property_type, dtype: int64

In [80]:
# Show only the rows in which at least one categorical column is missing
train_categorical_columns.loc[train_categorical_columns.isna().any(axis="columns")]

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
3363,Allston,,Private room,Real Bed,moderate
2042,Downtown,,Entire home/apt,Real Bed,strict


In [81]:
# Impute the constant label 'UNKNOWN' for missing categorical text values

categorical_imputer = SimpleImputer(strategy="constant", fill_value='UNKNOWN')

train_categorical_columns_imputed = categorical_imputer.fit_transform(train_categorical_columns)

### Convert back to Pandas

In [82]:
# Rebuild a DataFrame from the imputed array, reusing the original
# categorical column names.
train_categorical_columns_imputed_df = (
    pd.DataFrame(data=train_categorical_columns_imputed,
                 columns=train_categorical_columns.columns)
    .reset_index(drop=True)
)

train_categorical_columns_imputed_df.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
0,Downtown,Apartment,Private room,Real Bed,flexible
1,West Roxbury,Townhouse,Private room,Real Bed,moderate
2,Fenway,Apartment,Entire home/apt,Real Bed,strict
3,Dorchester,Apartment,Private room,Real Bed,flexible
4,South End,Apartment,Entire home/apt,Real Bed,strict


In [83]:
# After imputation the previously missing property types are counted under 'UNKNOWN'
train_categorical_columns_imputed_df['property_type'].value_counts()

Apartment          1821
House               390
Condominium         157
Townhouse            34
Loft                 31
Bed & Breakfast      25
Other                11
Boat                  8
Entire Floor          3
Dorm                  2
UNKNOWN               2
Villa                 2
Camper/RV             1
Guesthouse            1
Name: property_type, dtype: int64

### One-hot-encoding
Now let's preprocess the categorical variables using one-hot encoding

In [84]:
from sklearn.preprocessing import OneHotEncoder

# handle_unknown="ignore": a category that shows up only in the test data
# (some property types occur just once in the training split) is encoded as
# all zeros for that feature instead of making transform() raise an error.
cat_encoder = OneHotEncoder(handle_unknown="ignore")

# Learn the categories from the training data and encode it in one step.
train_categorical_columns_1hot = cat_encoder.fit_transform(train_categorical_columns_imputed_df)

train_categorical_columns_1hot

<2488x51 sparse matrix of type '<class 'numpy.float64'>'
	with 12440 stored elements in Compressed Sparse Row format>

By default, the `OneHotEncoder` class returns a sparse matrix, but we can convert it to a dense array if needed by calling the `toarray()` method:

In [85]:
# Dense view of the one-hot encoded training data
train_categorical_columns_1hot.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [86]:
# Categories the encoder learned, one array per categorical column
cat_encoder.categories_

[array(['Allston', 'Back Bay', 'Bay Village', 'Beacon Hill', 'Brighton',
        'Charlestown', 'Chinatown', 'Dorchester', 'Downtown',
        'East Boston', 'Fenway', 'Hyde Park', 'Jamaica Plain',
        'Leather District', 'Longwood Medical Area', 'Mattapan',
        'Mission Hill', 'North End', 'Roslindale', 'Roxbury',
        'South Boston', 'South Boston Waterfront', 'South End', 'West End',
        'West Roxbury'], dtype=object),
 array(['Apartment', 'Bed & Breakfast', 'Boat', 'Camper/RV', 'Condominium',
        'Dorm', 'Entire Floor', 'Guesthouse', 'House', 'Loft', 'Other',
        'Townhouse', 'UNKNOWN', 'Villa'], dtype=object),
 array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object),
 array(['Airbed', 'Couch', 'Futon', 'Pull-out Sofa', 'Real Bed'],
       dtype=object),
 array(['flexible', 'moderate', 'strict', 'super_strict_30'], dtype=object)]

In [87]:
# Flatten the encoder's list of per-column category arrays into a single flat
# list of names: the outer loop walks the list, extend() walks each array.
onehot_column_names = []
for feature_categories in cat_encoder.categories_:
    onehot_column_names.extend(feature_categories)

onehot_column_names

['Allston',
 'Back Bay',
 'Bay Village',
 'Beacon Hill',
 'Brighton',
 'Charlestown',
 'Chinatown',
 'Dorchester',
 'Downtown',
 'East Boston',
 'Fenway',
 'Hyde Park',
 'Jamaica Plain',
 'Leather District',
 'Longwood Medical Area',
 'Mattapan',
 'Mission Hill',
 'North End',
 'Roslindale',
 'Roxbury',
 'South Boston',
 'South Boston Waterfront',
 'South End',
 'West End',
 'West Roxbury',
 'Apartment',
 'Bed & Breakfast',
 'Boat',
 'Camper/RV',
 'Condominium',
 'Dorm',
 'Entire Floor',
 'Guesthouse',
 'House',
 'Loft',
 'Other',
 'Townhouse',
 'UNKNOWN',
 'Villa',
 'Entire home/apt',
 'Private room',
 'Shared room',
 'Airbed',
 'Couch',
 'Futon',
 'Pull-out Sofa',
 'Real Bed',
 'flexible',
 'moderate',
 'strict',
 'super_strict_30']

### Convert back to Pandas

In [88]:
# Wrap the dense one-hot matrix in a DataFrame, labelled with the flattened
# category names.
train_categorical_columns_1hot_df = (
    pd.DataFrame(data=train_categorical_columns_1hot.toarray(),
                 columns=onehot_column_names)
    .reset_index(drop=True)
)

train_categorical_columns_1hot_df.head()

Unnamed: 0,Allston,Back Bay,Bay Village,Beacon Hill,Brighton,Charlestown,Chinatown,Dorchester,Downtown,East Boston,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Do not process the binary variables

## Concatenate all variables

In [89]:
# Concatenate the processed pieces side by side into one training table.
# Every piece gets a fresh 0..n-1 index first: the binary columns still carry
# the original row labels, and concat aligns on the index, so mismatched
# labels would otherwise add rows filled with NaN.
train_pieces = [
    train_numeric_columns_std_df.reset_index(drop=True),
    train_categorical_columns_1hot_df.reset_index(drop=True),
    train_binary_columns.reset_index(drop=True),
]
train_prepared = pd.concat(train_pieces, axis="columns")

# if you want to create a separate column for missing values, use dummy_na=True:
# pd.get_dummies(df,dummy_na=True)

train_prepared.shape

(2488, 66)

In [90]:
# Preview the fully prepared training inputs
train_prepared.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,...,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,host_is_superhost,host_identity_verified
0,0.674809,0.750765,-0.59054,-0.444401,-0.325787,-0.597103,0.231549,-0.403889,-0.583315,-0.235369,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1
1,-1.521483,-2.432087,-1.161572,-0.444401,-0.325787,-0.597103,-0.183056,-0.403889,1.295617,-0.235369,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0
2,0.464649,-0.150838,2.264618,-0.444401,1.022897,1.399661,0.231549,1.463911,2.100874,-0.235369,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1
3,-2.260874,0.512573,-0.59054,-0.444401,-0.325787,-0.597103,0.646154,-0.403889,-0.046477,-0.235369,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0,1
4,0.242697,0.68434,0.551523,-0.444401,-0.325787,0.401279,1.88997,0.530011,-0.046477,-0.026934,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1


# Process the Test data using "Transform" only

In [91]:
# Select the same column groups, in the same order, as were selected from the
# training split, so the fitted transformers see identically laid-out data.
test_targets = test[list(train_targets.columns)]

test_numeric_columns = test[list(train_numeric_columns.columns)]

test_binary_columns = test[list(train_binary_columns.columns)]

test_categorical_columns = test[list(train_categorical_columns.columns)]

## Process numerical variables - test

### Imputation 

In [92]:
# Transform only: reuse the imputer fitted on the training data (no re-fitting on the test data)

test_numeric_columns_imputed = imputer.transform(test_numeric_columns)

In [93]:
# Inspect the imputed test values
test_numeric_columns_imputed

array([[ 42.3402452 , -71.07663628,   2.        , ...,   7.        ,
        168.        ,  97.        ],
       [ 42.30904986, -71.11253859,   4.        , ...,  32.        ,
        715.        ,  98.        ],
       [ 42.35087002, -71.10197771,   2.        , ...,   0.        ,
          0.        ,  95.        ],
       ...,
       [ 42.35485704, -71.06122845,   2.        , ...,  28.        ,
        161.        ,  97.        ],
       [ 42.34306172, -71.07275114,   2.        , ...,  48.        ,
        474.        ,  98.        ],
       [ 42.2637184 , -71.12021697,   2.        , ...,   0.        ,
          0.        ,  95.        ]])

### Standardize the values


In [94]:
# Transform only: standardize the test data with the scaler fitted on the training data
test_numeric_columns_std = scaler.transform(test_numeric_columns_imputed)

test_numeric_columns_std

array([[ 0.01328872,  0.25996953, -0.59054037, ..., -0.34198332,
        -0.2874073 ,  0.51251039],
       [-1.28281128, -0.8730551 ,  0.55152294, ...,  0.32965201,
         1.01147111,  0.63449698],
       [ 0.45472739, -0.5397691 , -0.59054037, ..., -0.53004121,
        -0.68633157,  0.2685372 ],
       ...,
       [ 0.62037959,  0.74621821, -0.59054037, ...,  0.22219036,
        -0.30402915,  0.51251039],
       [ 0.13030914,  0.38257889, -0.59054037, ...,  0.75949862,
         0.43920475,  0.63449698],
       [-3.166237  , -1.11537359, -0.59054037, ..., -0.53004121,
        -0.68633157,  0.2685372 ]])

### Convert back to Pandas

In [95]:
# Put the standardized test array back into a DataFrame with the numeric
# column names.
test_numeric_columns_std_df = (
    pd.DataFrame(data=test_numeric_columns_std,
                 columns=test_numeric_columns.columns)
    .reset_index(drop=True)
)

test_numeric_columns_std_df.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,number_of_reviews,number_days_btw_first_last_review,review_scores_rating
0,0.013289,0.25997,-0.59054,-0.444401,-0.325787,-0.597103,-0.183056,-0.403889,-0.583315,-0.131151,-0.341983,-0.287407,0.51251
1,-1.282811,-0.873055,0.551523,-0.444401,1.022897,0.401279,1.06076,0.530011,-0.046477,-0.131151,0.329652,1.011471,0.634497
2,0.454727,-0.539769,-0.59054,-0.444401,-0.325787,-0.597103,0.231549,0.530011,0.75878,-0.131151,-0.530041,-0.686332,0.268537
3,0.183075,-0.508159,-0.019509,-0.444401,-0.325787,-0.597103,0.438852,-0.403889,-0.583315,-0.026934,-0.449445,0.197001,0.87847
4,1.010367,0.905686,-0.019509,-0.444401,1.022897,-0.597103,0.646154,0.530011,-0.046477,-0.026934,-0.261387,-0.555731,0.634497


In [96]:
# Confirm that no missing values remain in the processed numeric test columns
test_numeric_columns_std_df.isna().sum()

latitude                             0
longitude                            0
accommodates                         0
bathrooms                            0
bedrooms                             0
beds                                 0
Number of amenities                  0
guests_included                      0
price_per_extra_person               0
minimum_nights                       0
number_of_reviews                    0
number_days_btw_first_last_review    0
review_scores_rating                 0
dtype: int64

## Process the categorical variables - test

In [97]:
# Count the missing values in each categorical column of the test split
test_categorical_columns.isna().sum()

neighbourhood_cleansed    0
property_type             1
room_type                 0
bed_type                  0
cancellation_policy       0
dtype: int64

In [98]:
# Transform only: fill missing categorical text values using the imputer fitted on the training data

test_categorical_columns_imputed = categorical_imputer.transform(test_categorical_columns)

### Convert back to Pandas

In [99]:
# Rebuild a DataFrame from the imputed test array, reusing the categorical
# column names.
test_categorical_columns_imputed_df = (
    pd.DataFrame(data=test_categorical_columns_imputed,
                 columns=test_categorical_columns.columns)
    .reset_index(drop=True)
)

test_categorical_columns_imputed_df.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,bed_type,cancellation_policy
0,South End,Apartment,Private room,Real Bed,strict
1,Jamaica Plain,Apartment,Entire home/apt,Real Bed,moderate
2,Fenway,Bed & Breakfast,Private room,Real Bed,strict
3,Fenway,Apartment,Entire home/apt,Real Bed,strict
4,North End,Apartment,Entire home/apt,Real Bed,strict


In [100]:
# Frequency of each property type in the test split after imputation
test_categorical_columns_imputed_df['property_type'].value_counts()

Apartment          772
House              165
Condominium         71
Townhouse           19
Bed & Breakfast     16
Loft                 8
Other                6
Villa                4
Boat                 4
UNKNOWN              1
Entire Floor         1
Name: property_type, dtype: int64

### One-hot-encoding
Now let's preprocess the categorical variables using one-hot encoding

In [101]:
# Transform only: encode the test data with the encoder fitted on the training data
test_categorical_columns_1hot = cat_encoder.transform(test_categorical_columns_imputed_df)

test_categorical_columns_1hot

<1067x51 sparse matrix of type '<class 'numpy.float64'>'
	with 5335 stored elements in Compressed Sparse Row format>

By default, the `OneHotEncoder` class returns a sparse matrix, but we can convert it to a dense array if needed by calling the `toarray()` method:

In [102]:
# Dense view of the one-hot encoded test data
test_categorical_columns_1hot.toarray()

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [103]:
# The one-hot column names are still the same: this list was built earlier from the fitted encoder

onehot_column_names

['Allston',
 'Back Bay',
 'Bay Village',
 'Beacon Hill',
 'Brighton',
 'Charlestown',
 'Chinatown',
 'Dorchester',
 'Downtown',
 'East Boston',
 'Fenway',
 'Hyde Park',
 'Jamaica Plain',
 'Leather District',
 'Longwood Medical Area',
 'Mattapan',
 'Mission Hill',
 'North End',
 'Roslindale',
 'Roxbury',
 'South Boston',
 'South Boston Waterfront',
 'South End',
 'West End',
 'West Roxbury',
 'Apartment',
 'Bed & Breakfast',
 'Boat',
 'Camper/RV',
 'Condominium',
 'Dorm',
 'Entire Floor',
 'Guesthouse',
 'House',
 'Loft',
 'Other',
 'Townhouse',
 'UNKNOWN',
 'Villa',
 'Entire home/apt',
 'Private room',
 'Shared room',
 'Airbed',
 'Couch',
 'Futon',
 'Pull-out Sofa',
 'Real Bed',
 'flexible',
 'moderate',
 'strict',
 'super_strict_30']

### Convert back to Pandas

In [104]:
# Wrap the dense one-hot test matrix in a DataFrame, labelled with the
# category names collected from the fitted encoder.
test_categorical_columns_1hot_df = (
    pd.DataFrame(data=test_categorical_columns_1hot.toarray(),
                 columns=onehot_column_names)
    .reset_index(drop=True)
)

test_categorical_columns_1hot_df.head()

Unnamed: 0,Allston,Back Bay,Bay Village,Beacon Hill,Brighton,Charlestown,Chinatown,Dorchester,Downtown,East Boston,...,Shared room,Airbed,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


## Do not transform the binary variables - test

## Concatenate all variables - test

In [105]:
# Concatenate the processed test pieces side by side into one table.
# Every piece gets a fresh 0..n-1 index first: the binary columns still carry
# the original row labels, and concat aligns on the index, so mismatched
# labels would otherwise add rows filled with NaN.
test_pieces = [
    test_numeric_columns_std_df.reset_index(drop=True),
    test_categorical_columns_1hot_df.reset_index(drop=True),
    test_binary_columns.reset_index(drop=True),
]
test_prepared = pd.concat(test_pieces, axis="columns")

# if you want to create a separate column for missing values, use dummy_na=True:
# pd.get_dummies(df,dummy_na=True)

test_prepared.shape

(1067, 66)

In [106]:
# Preview the fully prepared test inputs
test_prepared.head()

Unnamed: 0,latitude,longitude,accommodates,bathrooms,bedrooms,beds,Number of amenities,guests_included,price_per_extra_person,minimum_nights,...,Couch,Futon,Pull-out Sofa,Real Bed,flexible,moderate,strict,super_strict_30,host_is_superhost,host_identity_verified
0,0.013289,0.25997,-0.59054,-0.444401,-0.325787,-0.597103,-0.183056,-0.403889,-0.583315,-0.131151,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1
1,-1.282811,-0.873055,0.551523,-0.444401,1.022897,0.401279,1.06076,0.530011,-0.046477,-0.131151,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,1
2,0.454727,-0.539769,-0.59054,-0.444401,-0.325787,-0.597103,0.231549,0.530011,0.75878,-0.131151,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,0
3,0.183075,-0.508159,-0.019509,-0.444401,-0.325787,-0.597103,0.438852,-0.403889,-0.583315,-0.026934,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1
4,1.010367,0.905686,-0.019509,-0.444401,1.022897,-0.597103,0.646154,0.530011,-0.046477,-0.026934,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0,1


## What we didn't do:

Visualization<br>
Feature engineering<br>
Modeling