# Dataset Creation

## Environment

In [1]:
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns


# Display configuration: show every column of wide frames and render floats
# with six significant digits.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.6g}'.format

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

## Dataset

In [2]:
# Load the pre-selected Airbnb columns produced upstream.
df = pd.read_csv(filepath_or_buffer="data/airbnb-sel1-cols.csv")

In [3]:
# Preview the first ten listings.
df.head(10)

Unnamed: 0,Neighbourhood Group Cleansed,Property Type,Room Type,Cancellation Policy,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Guests Included,Extra People,Minimum Nights,Availability 30,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Reviews per Month,Security Deposit,Cleaning Fee,Calculated host listings count,Num_Host_Verifications,Name_Len,Summary_Len,Space_Len,Description_Len,Summary_Lang,is_Thumbnail,is_HostAbout,is_ResponseInHours,is_EntireHome,is_Bed,is_UpdatedToday,Price
0,Centro,Apartment,PrivateRoom,flexible,40.4101,-3.71475,2,1,1,1,1,0,2,1,1,80,98.0,10.0,10.0,10.0,10.0,2.33,,,1,4,34,67,488,1000,True,True,True,False,False,True,False,36
1,Centro,Apartment,EntireHome,strict,40.417,-3.70944,2,1,1,1,1,0,3,12,320,90,95.0,9.0,10.0,10.0,10.0,2.69,100.0,30.0,2,6,32,241,1000,1000,True,True,True,False,True,True,False,85
2,Centro,Apartment,PrivateRoom,flexible,40.4097,-3.71356,2,1,1,1,1,10,3,7,158,20,91.0,9.0,9.0,9.0,10.0,0.6,100.0,,4,4,27,232,469,479,True,True,True,False,False,True,False,20
3,Centro,Apartment,EntireHome,strict,40.4232,-3.71125,4,1,1,2,2,5,3,7,248,55,99.0,10.0,10.0,10.0,10.0,3.56,250.0,15.0,9,6,35,249,1000,1000,False,True,True,True,True,True,True,112
4,Centro,Apartment,EntireHome,moderate,40.4165,-3.71784,8,3,3,4,6,40,3,6,296,46,96.0,10.0,10.0,10.0,10.0,1.13,200.0,20.0,1,4,33,271,939,1000,True,True,True,True,True,True,False,190
5,Centro,Apartment,PrivateRoom,strict,40.4168,-3.71024,2,2,1,1,1,5,10,6,341,3,67.0,5.0,10.0,10.0,10.0,0.14,,15.0,97,4,24,370,187,923,False,True,True,True,False,True,False,30
6,Centro,Apartment,PrivateRoom,strict,40.4139,-3.70564,1,1,1,1,1,0,10,0,281,0,,,,,,,,15.0,97,4,24,341,129,991,False,True,True,True,False,True,False,18
7,Centro,Apartment,PrivateRoom,moderate,40.4092,-3.70405,2,1,1,1,1,10,2,12,347,78,93.0,10.0,9.0,10.0,9.0,2.2,70.0,12.0,3,5,29,148,0,1000,True,True,True,True,False,True,False,38
8,Centro,Apartment,EntireHome,flexible,40.412,-3.70669,2,1,1,1,1,0,14,1,174,8,90.0,10.0,10.0,10.0,10.0,0.72,350.0,20.0,6,5,32,237,0,237,False,True,False,True,True,True,False,71
9,Centro,Apartment,EntireHome,moderate,40.4099,-3.69552,2,1,0,1,1,0,3,4,321,49,90.0,9.0,10.0,10.0,10.0,2.72,,,1,3,34,248,145,899,True,True,False,True,True,False,False,48


In [4]:
# (rows, columns) of the raw dataset.
df.shape

(13207, 38)

In [5]:
# Column dtypes and non-null counts of the raw dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13207 entries, 0 to 13206
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Neighbourhood Group Cleansed    13207 non-null  object 
 1   Property Type                   13207 non-null  object 
 2   Room Type                       13207 non-null  object 
 3   Cancellation Policy             13207 non-null  object 
 4   Latitude                        13207 non-null  float64
 5   Longitude                       13207 non-null  float64
 6   Accommodates                    13207 non-null  int64  
 7   Bathrooms                       13158 non-null  float64
 8   Bedrooms                        13184 non-null  float64
 9   Beds                            13158 non-null  float64
 10  Guests Included                 13207 non-null  int64  
 11  Extra People                    13207 non-null  int64  
 12  Minimum Nights                  

In [6]:
# Number of missing values per column before any cleaning.
df.isna().sum()

Neighbourhood Group Cleansed         0
Property Type                        0
Room Type                            0
Cancellation Policy                  0
Latitude                             0
Longitude                            0
Accommodates                         0
Bathrooms                           49
Bedrooms                            23
Beds                                49
Guests Included                      0
Extra People                         0
Minimum Nights                       0
Availability 30                      0
Availability 365                     0
Number of Reviews                    0
Review Scores Rating              2838
Review Scores Cleanliness         2850
Review Scores Checkin             2866
Review Scores Communication       2850
Review Scores Location            2868
Reviews per Month                 2713
Security Deposit                  7572
Cleaning Fee                      5387
Calculated host listings count       0
Num_Host_Verifications   

## NaN imputation

### Price (target) - Drop rows with NaN

In [7]:
# The target cannot be imputed: discard listings that have no price.
df = df.dropna(subset=['Price'])

In [8]:
# Missing values per column after removing listings without a price.
df.isna().sum()

Neighbourhood Group Cleansed         0
Property Type                        0
Room Type                            0
Cancellation Policy                  0
Latitude                             0
Longitude                            0
Accommodates                         0
Bathrooms                           48
Bedrooms                            23
Beds                                48
Guests Included                      0
Extra People                         0
Minimum Nights                       0
Availability 30                      0
Availability 365                     0
Number of Reviews                    0
Review Scores Rating              2832
Review Scores Cleanliness         2844
Review Scores Checkin             2860
Review Scores Communication       2844
Review Scores Location            2862
Reviews per Month                 2707
Security Deposit                  7565
Cleaning Fee                      5383
Calculated host listings count       0
Num_Host_Verifications   

### Security Deposit - NaN=0

In [9]:
# A missing security deposit is treated as "no deposit", i.e. 0.
df = df.fillna({'Security Deposit': 0})

In [10]:
# Missing values per column after filling the security deposit.
df.isna().sum()

Neighbourhood Group Cleansed         0
Property Type                        0
Room Type                            0
Cancellation Policy                  0
Latitude                             0
Longitude                            0
Accommodates                         0
Bathrooms                           48
Bedrooms                            23
Beds                                48
Guests Included                      0
Extra People                         0
Minimum Nights                       0
Availability 30                      0
Availability 365                     0
Number of Reviews                    0
Review Scores Rating              2832
Review Scores Cleanliness         2844
Review Scores Checkin             2860
Review Scores Communication       2844
Review Scores Location            2862
Reviews per Month                 2707
Security Deposit                     0
Cleaning Fee                      5383
Calculated host listings count       0
Num_Host_Verifications   

### Cleaning Fee - NaN=0

In [11]:
# A missing cleaning fee is treated as "no fee", i.e. 0.
df = df.fillna({'Cleaning Fee': 0})

In [12]:
# Missing values per column after filling the cleaning fee.
df.isna().sum()

Neighbourhood Group Cleansed         0
Property Type                        0
Room Type                            0
Cancellation Policy                  0
Latitude                             0
Longitude                            0
Accommodates                         0
Bathrooms                           48
Bedrooms                            23
Beds                                48
Guests Included                      0
Extra People                         0
Minimum Nights                       0
Availability 30                      0
Availability 365                     0
Number of Reviews                    0
Review Scores Rating              2832
Review Scores Cleanliness         2844
Review Scores Checkin             2860
Review Scores Communication       2844
Review Scores Location            2862
Reviews per Month                 2707
Security Deposit                     0
Cleaning Fee                         0
Calculated host listings count       0
Num_Host_Verifications   

### Bathrooms / Bedrooms / Beds -> Drop

In [13]:
# Listings that lack a value in at least one of the room-count columns.
room_count_columns = ['Bathrooms', 'Bedrooms', 'Beds']
df[df[room_count_columns].isnull().any(axis=1)]

Unnamed: 0,Neighbourhood Group Cleansed,Property Type,Room Type,Cancellation Policy,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Guests Included,Extra People,Minimum Nights,Availability 30,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Reviews per Month,Security Deposit,Cleaning Fee,Calculated host listings count,Num_Host_Verifications,Name_Len,Summary_Len,Space_Len,Description_Len,Summary_Lang,is_Thumbnail,is_HostAbout,is_ResponseInHours,is_EntireHome,is_Bed,is_UpdatedToday,Price
10,Centro,Apartment,EntireHome,flexible,40.4081,-3.70032,2,,1,,1,6,6,2,189,49,88,9,10,9,9,1.01,0,0,4,3,16,99,420,656,False,True,True,True,True,True,False,34
450,Centro,Apartment,PrivateRoom,flexible,40.4238,-3.6989,1,1,,,1,0,1,0,74,1,100,10,10,10,10,0.61,0,0,1,3,32,103,0,103,False,True,False,True,False,True,False,68
457,Villaverde,Apartment,PrivateRoom,flexible,40.3517,-3.69362,2,,1,1,1,0,1,30,365,0,,,,,,,0,0,1,2,34,0,252,252,False,True,False,False,False,True,False,190
524,Moncloa,Apartment,EntireHome,moderate,40.4303,-3.7178,5,,1,1,2,10,2,6,341,10,78,7,9,9,9,0.17,150,30,9,3,17,0,1000,1000,False,True,True,True,True,True,False,65
602,Centro,Apartment,EntireHome,flexible,40.4211,-3.70247,3,1,1,,1,0,1,11,295,6,100,10,10,10,10,0.13,200,0,10,4,37,0,1000,1000,False,False,True,True,True,True,False,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12537,Centro,Apartment,EntireHome,strict,40.4279,-3.70999,5,,1,3,1,11,3,3,331,16,91,9,10,9,9,0.24,300,42,11,4,22,0,1000,1000,False,True,True,True,True,True,False,102
12579,Arganzuela,House,PrivateRoom,moderate,40.4028,-3.70256,1,1,,,1,0,2,21,81,4,95,10,10,10,9,1.32,0,0,1,2,27,285,0,532,False,True,False,False,False,True,True,20
12692,Moratalaz,Other,PrivateRoom,flexible,40.4027,-3.63608,2,,1,,1,0,1,30,365,0,,,,,,,0,0,1,0,27,0,209,209,False,False,False,False,False,True,False,75
12738,Barajas,Other,PrivateRoom,strict,40.4523,-3.59439,10,,1,,1,0,3,30,365,0,,,,,,,0,0,1,1,18,0,182,182,False,False,True,False,False,True,False,100


In [14]:
# How many listings lack at least one of the room-count values.
room_count_columns = ['Bathrooms', 'Bedrooms', 'Beds']
df[df[room_count_columns].isnull().any(axis=1)].shape

(83, 38)

In [15]:
# Remove listings with an unknown number of bathrooms.
df = df.dropna(subset=['Bathrooms'])

In [16]:
# Remove listings with an unknown number of bedrooms.
df = df.dropna(subset=['Bedrooms'])

In [17]:
# Remove listings with an unknown number of beds.
df = df.dropna(subset=['Beds'])

In [18]:
# Missing values per column after dropping rows without room counts;
# only the review-related columns still contain NaN at this point.
df.isna().sum()

Neighbourhood Group Cleansed         0
Property Type                        0
Room Type                            0
Cancellation Policy                  0
Latitude                             0
Longitude                            0
Accommodates                         0
Bathrooms                            0
Bedrooms                             0
Beds                                 0
Guests Included                      0
Extra People                         0
Minimum Nights                       0
Availability 30                      0
Availability 365                     0
Number of Reviews                    0
Review Scores Rating              2800
Review Scores Cleanliness         2812
Review Scores Checkin             2828
Review Scores Communication       2812
Review Scores Location            2830
Reviews per Month                 2676
Security Deposit                     0
Cleaning Fee                         0
Calculated host listings count       0
Num_Host_Verifications   

## Export Datasets

### Dataset Numeric all

In [19]:
# Keep only numeric and boolean columns (drops the four object/categorical ones).
# Uses the public select_dtypes API instead of the private
# DataFrame._get_numeric_data(), which is not part of pandas' supported
# interface; 'bool' is listed explicitly so the is_* flag columns are kept,
# matching what _get_numeric_data() returned.
df_numeric = df.select_dtypes(include=['number', 'bool'])

In [20]:
# Column dtypes and non-null counts of the numeric-only dataset.
df_numeric.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Latitude                        13115 non-null  float64
 1   Longitude                       13115 non-null  float64
 2   Accommodates                    13115 non-null  int64  
 3   Bathrooms                       13115 non-null  float64
 4   Bedrooms                        13115 non-null  float64
 5   Beds                            13115 non-null  float64
 6   Guests Included                 13115 non-null  int64  
 7   Extra People                    13115 non-null  int64  
 8   Minimum Nights                  13115 non-null  int64  
 9   Availability 30                 13115 non-null  int64  
 10  Availability 365                13115 non-null  int64  
 11  Number of Reviews               13115 non-null  int64  
 12  Review Scores Rating            

In [21]:
# Preview the numeric-only dataset.
df_numeric.head()

Unnamed: 0,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Guests Included,Extra People,Minimum Nights,Availability 30,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Reviews per Month,Security Deposit,Cleaning Fee,Calculated host listings count,Num_Host_Verifications,Name_Len,Summary_Len,Space_Len,Description_Len,Summary_Lang,is_Thumbnail,is_HostAbout,is_ResponseInHours,is_EntireHome,is_Bed,is_UpdatedToday,Price
0,40.4101,-3.71475,2,1,1,1,1,0,2,1,1,80,98,10,10,10,10,2.33,0,0,1,4,34,67,488,1000,True,True,True,False,False,True,False,36
1,40.417,-3.70944,2,1,1,1,1,0,3,12,320,90,95,9,10,10,10,2.69,100,30,2,6,32,241,1000,1000,True,True,True,False,True,True,False,85
2,40.4097,-3.71356,2,1,1,1,1,10,3,7,158,20,91,9,9,9,10,0.6,100,0,4,4,27,232,469,479,True,True,True,False,False,True,False,20
3,40.4232,-3.71125,4,1,1,2,2,5,3,7,248,55,99,10,10,10,10,3.56,250,15,9,6,35,249,1000,1000,False,True,True,True,True,True,True,112
4,40.4165,-3.71784,8,3,3,4,6,40,3,6,296,46,96,10,10,10,10,1.13,200,20,1,4,33,271,939,1000,True,True,True,True,True,True,False,190


In [22]:
# Persist the numeric dataset; the row index is not written.
df_numeric.to_csv(path_or_buf="data/airbnb-numeric.csv", index=False)

### Dataset Numeric noReviews

In [23]:
# Start from the numeric and boolean columns of the cleaned frame.
# Uses the public select_dtypes API instead of the private
# DataFrame._get_numeric_data(); 'bool' is listed explicitly so the is_* flag
# columns are kept, matching what _get_numeric_data() returned.
df_numeric_noReviews = df.select_dtypes(include=['number', 'bool'])

In [24]:
# Review-related columns (the ones that still contain NaN) to exclude.
drop_columns = [
    'Review Scores Rating',
    'Review Scores Cleanliness',
    'Review Scores Checkin',
    'Review Scores Communication',
    'Review Scores Location',
    'Reviews per Month',
    'Number of Reviews',
]

In [25]:
# Remove the review-related columns from the numeric dataset.
df_numeric_noReviews = df_numeric_noReviews.drop(columns=drop_columns)

In [26]:
# Column dtypes and non-null counts after removing the review columns.
df_numeric_noReviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Latitude                        13115 non-null  float64
 1   Longitude                       13115 non-null  float64
 2   Accommodates                    13115 non-null  int64  
 3   Bathrooms                       13115 non-null  float64
 4   Bedrooms                        13115 non-null  float64
 5   Beds                            13115 non-null  float64
 6   Guests Included                 13115 non-null  int64  
 7   Extra People                    13115 non-null  int64  
 8   Minimum Nights                  13115 non-null  int64  
 9   Availability 30                 13115 non-null  int64  
 10  Availability 365                13115 non-null  int64  
 11  Security Deposit                13115 non-null  float64
 12  Cleaning Fee                    

In [27]:
# Preview the numeric dataset without review columns.
df_numeric_noReviews.head()

Unnamed: 0,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Guests Included,Extra People,Minimum Nights,Availability 30,Availability 365,Security Deposit,Cleaning Fee,Calculated host listings count,Num_Host_Verifications,Name_Len,Summary_Len,Space_Len,Description_Len,Summary_Lang,is_Thumbnail,is_HostAbout,is_ResponseInHours,is_EntireHome,is_Bed,is_UpdatedToday,Price
0,40.4101,-3.71475,2,1,1,1,1,0,2,1,1,0,0,1,4,34,67,488,1000,True,True,True,False,False,True,False,36
1,40.417,-3.70944,2,1,1,1,1,0,3,12,320,100,30,2,6,32,241,1000,1000,True,True,True,False,True,True,False,85
2,40.4097,-3.71356,2,1,1,1,1,10,3,7,158,100,0,4,4,27,232,469,479,True,True,True,False,False,True,False,20
3,40.4232,-3.71125,4,1,1,2,2,5,3,7,248,250,15,9,6,35,249,1000,1000,False,True,True,True,True,True,True,112
4,40.4165,-3.71784,8,3,3,4,6,40,3,6,296,200,20,1,4,33,271,939,1000,True,True,True,True,True,True,False,190


In [28]:
# Persist the numeric dataset without review columns; the row index is not written.
df_numeric_noReviews.to_csv(path_or_buf="data/airbnb-numeric-noReviews.csv", index=False)

#### Reduced Numeric

In [29]:
# Hand-picked subset of features plus the target ('Price') for the reduced dataset.
reduced_columns = [
    'Latitude',
    'Accommodates',
    'Bathrooms',
    'Bedrooms',
    'Beds',
    'Guests Included',
    'Extra People',
    'Availability 30',
    'Cleaning Fee',
    'Name_Len',
    'Space_Len',
    'Description_Len',
    'is_Thumbnail',
    'is_HostAbout',
    'is_ResponseInHours',
    'is_EntireHome',
    'Price',
]
model = df_numeric_noReviews[reduced_columns]

In [30]:
# Persist the reduced dataset; the row index is not written.
model.to_csv(path_or_buf="data/model.csv", index=False)

### Dataset All

In [31]:
# Independent copy of the cleaned frame, categorical columns included.
df_all = df.copy(deep=True)

In [32]:
# Column dtypes and non-null counts of the full cleaned dataset.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Neighbourhood Group Cleansed    13115 non-null  object 
 1   Property Type                   13115 non-null  object 
 2   Room Type                       13115 non-null  object 
 3   Cancellation Policy             13115 non-null  object 
 4   Latitude                        13115 non-null  float64
 5   Longitude                       13115 non-null  float64
 6   Accommodates                    13115 non-null  int64  
 7   Bathrooms                       13115 non-null  float64
 8   Bedrooms                        13115 non-null  float64
 9   Beds                            13115 non-null  float64
 10  Guests Included                 13115 non-null  int64  
 11  Extra People                    13115 non-null  int64  
 12  Minimum Nights                  

In [33]:
# Persist the full cleaned dataset; the row index is not written.
df_all.to_csv(path_or_buf="data/airbnb-all.csv", index=False)

### Dataset All noReviews

In [34]:
# Independent copy of the cleaned frame to strip the review columns from.
df_all_noReviews = df.copy(deep=True)

In [35]:
# Review-related columns to exclude from the full dataset.
drop_columns = [
    'Review Scores Rating',
    'Review Scores Cleanliness',
    'Review Scores Checkin',
    'Review Scores Communication',
    'Review Scores Location',
    'Reviews per Month',
    'Number of Reviews',
]

In [36]:
# Remove the review-related columns from the full dataset.
df_all_noReviews = df_all_noReviews.drop(columns=drop_columns)

In [37]:
# Preview the full dataset without review columns.
df_all_noReviews.head()

Unnamed: 0,Neighbourhood Group Cleansed,Property Type,Room Type,Cancellation Policy,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Guests Included,Extra People,Minimum Nights,Availability 30,Availability 365,Security Deposit,Cleaning Fee,Calculated host listings count,Num_Host_Verifications,Name_Len,Summary_Len,Space_Len,Description_Len,Summary_Lang,is_Thumbnail,is_HostAbout,is_ResponseInHours,is_EntireHome,is_Bed,is_UpdatedToday,Price
0,Centro,Apartment,PrivateRoom,flexible,40.4101,-3.71475,2,1,1,1,1,0,2,1,1,0,0,1,4,34,67,488,1000,True,True,True,False,False,True,False,36
1,Centro,Apartment,EntireHome,strict,40.417,-3.70944,2,1,1,1,1,0,3,12,320,100,30,2,6,32,241,1000,1000,True,True,True,False,True,True,False,85
2,Centro,Apartment,PrivateRoom,flexible,40.4097,-3.71356,2,1,1,1,1,10,3,7,158,100,0,4,4,27,232,469,479,True,True,True,False,False,True,False,20
3,Centro,Apartment,EntireHome,strict,40.4232,-3.71125,4,1,1,2,2,5,3,7,248,250,15,9,6,35,249,1000,1000,False,True,True,True,True,True,True,112
4,Centro,Apartment,EntireHome,moderate,40.4165,-3.71784,8,3,3,4,6,40,3,6,296,200,20,1,4,33,271,939,1000,True,True,True,True,True,True,False,190


In [38]:
# Column dtypes and non-null counts after removing the review columns.
df_all_noReviews.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 31 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Neighbourhood Group Cleansed    13115 non-null  object 
 1   Property Type                   13115 non-null  object 
 2   Room Type                       13115 non-null  object 
 3   Cancellation Policy             13115 non-null  object 
 4   Latitude                        13115 non-null  float64
 5   Longitude                       13115 non-null  float64
 6   Accommodates                    13115 non-null  int64  
 7   Bathrooms                       13115 non-null  float64
 8   Bedrooms                        13115 non-null  float64
 9   Beds                            13115 non-null  float64
 10  Guests Included                 13115 non-null  int64  
 11  Extra People                    13115 non-null  int64  
 12  Minimum Nights                  

In [39]:
# Persist the full dataset without review columns; the row index is not written.
# NOTE(review): the file name is cased "noREviews" here but "noReviews" in the
# numeric export ("data/airbnb-numeric-noReviews.csv"). Left unchanged because
# other notebooks may load this exact path - confirm before renaming.
df_all_noReviews.to_csv("data/airbnb-all-noREviews.csv", index=False)

### Dataset All Encoded

In [40]:
# Preview the full dataset that is about to be one-hot encoded.
df_all.head()

Unnamed: 0,Neighbourhood Group Cleansed,Property Type,Room Type,Cancellation Policy,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Guests Included,Extra People,Minimum Nights,Availability 30,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Reviews per Month,Security Deposit,Cleaning Fee,Calculated host listings count,Num_Host_Verifications,Name_Len,Summary_Len,Space_Len,Description_Len,Summary_Lang,is_Thumbnail,is_HostAbout,is_ResponseInHours,is_EntireHome,is_Bed,is_UpdatedToday,Price
0,Centro,Apartment,PrivateRoom,flexible,40.4101,-3.71475,2,1,1,1,1,0,2,1,1,80,98,10,10,10,10,2.33,0,0,1,4,34,67,488,1000,True,True,True,False,False,True,False,36
1,Centro,Apartment,EntireHome,strict,40.417,-3.70944,2,1,1,1,1,0,3,12,320,90,95,9,10,10,10,2.69,100,30,2,6,32,241,1000,1000,True,True,True,False,True,True,False,85
2,Centro,Apartment,PrivateRoom,flexible,40.4097,-3.71356,2,1,1,1,1,10,3,7,158,20,91,9,9,9,10,0.6,100,0,4,4,27,232,469,479,True,True,True,False,False,True,False,20
3,Centro,Apartment,EntireHome,strict,40.4232,-3.71125,4,1,1,2,2,5,3,7,248,55,99,10,10,10,10,3.56,250,15,9,6,35,249,1000,1000,False,True,True,True,True,True,True,112
4,Centro,Apartment,EntireHome,moderate,40.4165,-3.71784,8,3,3,4,6,40,3,6,296,46,96,10,10,10,10,1.13,200,20,1,4,33,271,939,1000,True,True,True,True,True,True,False,190


In [41]:
# Column dtypes before encoding; the object columns are the categorical ones.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Neighbourhood Group Cleansed    13115 non-null  object 
 1   Property Type                   13115 non-null  object 
 2   Room Type                       13115 non-null  object 
 3   Cancellation Policy             13115 non-null  object 
 4   Latitude                        13115 non-null  float64
 5   Longitude                       13115 non-null  float64
 6   Accommodates                    13115 non-null  int64  
 7   Bathrooms                       13115 non-null  float64
 8   Bedrooms                        13115 non-null  float64
 9   Beds                            13115 non-null  float64
 10  Guests Included                 13115 non-null  int64  
 11  Extra People                    13115 non-null  int64  
 12  Minimum Nights                  

In [42]:
# Isolate the four categorical (object) columns that will be one-hot encoded.
categorical_columns = [
    'Neighbourhood Group Cleansed',
    'Property Type',
    'Room Type',
    'Cancellation Policy',
]
cats_df = df_all[categorical_columns]

In [43]:
# Preview the categorical columns.
cats_df.head()

Unnamed: 0,Neighbourhood Group Cleansed,Property Type,Room Type,Cancellation Policy
0,Centro,Apartment,PrivateRoom,flexible
1,Centro,Apartment,EntireHome,strict
2,Centro,Apartment,PrivateRoom,flexible
3,Centro,Apartment,EntireHome,strict
4,Centro,Apartment,EntireHome,moderate


In [44]:
# Confirm the categorical frame has no missing values.
cats_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Neighbourhood Group Cleansed  13115 non-null  object
 1   Property Type                 13115 non-null  object
 2   Room Type                     13115 non-null  object
 3   Cancellation Policy           13115 non-null  object
dtypes: object(4)
memory usage: 512.3+ KB


In [45]:
# One-hot encode the categorical columns, dropping the first level of each to
# avoid redundant dummy columns; any other column is passed through untouched.
one_hot_spec = (
    OneHotEncoder(drop='first'),
    ['Neighbourhood Group Cleansed', 'Property Type', 'Room Type', 'Cancellation Policy'],
)
transformer = make_column_transformer(one_hot_spec, remainder='passthrough')

In [46]:
# Fit the encoder on the categorical columns, then densify the result so it
# can be wrapped in a DataFrame.
encoded_matrix = transformer.fit_transform(cats_df)
transformed = encoded_matrix.toarray()

In [47]:
# ColumnTransformer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so calling it breaks the notebook on current installs. Its
# replacement, get_feature_names_out(), uses a different naming scheme that the
# prefix-renaming cells below do not expect. Rebuild the legacy
# 'onehotencoder__x<i>_<category>' names from the fitted encoder instead; this
# works on both old and new scikit-learn versions.
# Assumes the one-hot encoder is the only transformer that contributes output
# columns (cats_df contains only the encoded columns, so nothing is passed through).
fitted_encoder = transformer.named_transformers_['onehotencoder']
dropped_positions = fitted_encoder.drop_idx_
colums_names = [
    'onehotencoder__x{}_{}'.format(feature_pos, category)
    for feature_pos, categories in enumerate(fitted_encoder.categories_)
    for category_pos, category in enumerate(categories)
    # Skip the level removed by the encoder's `drop` setting, if any.
    if dropped_positions is None or category_pos != dropped_positions[feature_pos]
]



In [48]:
# Raw encoder-generated names (x0..x3 are the positions of the categorical columns).
colums_names

['onehotencoder__x0_Barajas',
 'onehotencoder__x0_Carabanchel',
 'onehotencoder__x0_Centro',
 'onehotencoder__x0_Chamartin',
 'onehotencoder__x0_Chamberi',
 'onehotencoder__x0_CiudadLineal',
 'onehotencoder__x0_Fuencarral',
 'onehotencoder__x0_Hortaleza',
 'onehotencoder__x0_Latina',
 'onehotencoder__x0_Moncloa',
 'onehotencoder__x0_Moratalaz',
 'onehotencoder__x0_PuenteVallecas',
 'onehotencoder__x0_Retiro',
 'onehotencoder__x0_Salamanca',
 'onehotencoder__x0_SanBlas',
 'onehotencoder__x0_Tetuan',
 'onehotencoder__x0_Usera',
 'onehotencoder__x0_Vicalvaro',
 'onehotencoder__x0_VillaVallecas',
 'onehotencoder__x0_Villaverde',
 'onehotencoder__x1_BedAndBreakfast',
 'onehotencoder__x1_Condominium',
 'onehotencoder__x1_House',
 'onehotencoder__x1_Loft',
 'onehotencoder__x1_Other',
 'onehotencoder__x2_PrivateRoom',
 'onehotencoder__x2_SharedRoom',
 'onehotencoder__x3_moderate',
 'onehotencoder__x3_strict']

In [49]:
# x0 is the first encoded column ('Neighbourhood Group Cleansed').
old_prefix, new_prefix = 'onehotencoder__x0', 'Neighbourhood'
colums_names = [name.replace(old_prefix, new_prefix) for name in colums_names]

In [50]:
# x1 is the second encoded column ('Property Type').
old_prefix, new_prefix = 'onehotencoder__x1', 'PropertyType'
colums_names = [name.replace(old_prefix, new_prefix) for name in colums_names]

In [51]:
# x2 is the third encoded column ('Room Type').
# NOTE(review): the replacement label is 'BedType' although the values
# (PrivateRoom / SharedRoom) are room types. Kept as-is because the label is
# part of the exported CSV schema - confirm with consumers before renaming.
old_prefix, new_prefix = 'onehotencoder__x2', 'BedType'
colums_names = [name.replace(old_prefix, new_prefix) for name in colums_names]

In [52]:
# x3 is the fourth encoded column ('Cancellation Policy').
old_prefix, new_prefix = 'onehotencoder__x3', 'Cancellation'
colums_names = [name.replace(old_prefix, new_prefix) for name in colums_names]

In [53]:
# Final, human-readable dummy column names.
colums_names 

['Neighbourhood_Barajas',
 'Neighbourhood_Carabanchel',
 'Neighbourhood_Centro',
 'Neighbourhood_Chamartin',
 'Neighbourhood_Chamberi',
 'Neighbourhood_CiudadLineal',
 'Neighbourhood_Fuencarral',
 'Neighbourhood_Hortaleza',
 'Neighbourhood_Latina',
 'Neighbourhood_Moncloa',
 'Neighbourhood_Moratalaz',
 'Neighbourhood_PuenteVallecas',
 'Neighbourhood_Retiro',
 'Neighbourhood_Salamanca',
 'Neighbourhood_SanBlas',
 'Neighbourhood_Tetuan',
 'Neighbourhood_Usera',
 'Neighbourhood_Vicalvaro',
 'Neighbourhood_VillaVallecas',
 'Neighbourhood_Villaverde',
 'PropertyType_BedAndBreakfast',
 'PropertyType_Condominium',
 'PropertyType_House',
 'PropertyType_Loft',
 'PropertyType_Other',
 'BedType_PrivateRoom',
 'BedType_SharedRoom',
 'Cancellation_moderate',
 'Cancellation_strict']

In [54]:
# Wrap the dense one-hot matrix in a DataFrame with the readable column names.
# No index is passed, so the frame gets a default 0..n-1 RangeIndex.
cats_df_trans = pd.DataFrame(data=transformed, columns=colums_names)

In [55]:
# Inspect the one-hot encoded frame.
cats_df_trans

Unnamed: 0,Neighbourhood_Barajas,Neighbourhood_Carabanchel,Neighbourhood_Centro,Neighbourhood_Chamartin,Neighbourhood_Chamberi,Neighbourhood_CiudadLineal,Neighbourhood_Fuencarral,Neighbourhood_Hortaleza,Neighbourhood_Latina,Neighbourhood_Moncloa,Neighbourhood_Moratalaz,Neighbourhood_PuenteVallecas,Neighbourhood_Retiro,Neighbourhood_Salamanca,Neighbourhood_SanBlas,Neighbourhood_Tetuan,Neighbourhood_Usera,Neighbourhood_Vicalvaro,Neighbourhood_VillaVallecas,Neighbourhood_Villaverde,PropertyType_BedAndBreakfast,PropertyType_Condominium,PropertyType_House,PropertyType_Loft,PropertyType_Other,BedType_PrivateRoom,BedType_SharedRoom,Cancellation_moderate,Cancellation_strict
0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13110,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13111,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
13112,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13113,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [56]:
# Column dtypes and index of the one-hot encoded frame.
cats_df_trans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13115 entries, 0 to 13114
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Neighbourhood_Barajas         13115 non-null  float64
 1   Neighbourhood_Carabanchel     13115 non-null  float64
 2   Neighbourhood_Centro          13115 non-null  float64
 3   Neighbourhood_Chamartin       13115 non-null  float64
 4   Neighbourhood_Chamberi        13115 non-null  float64
 5   Neighbourhood_CiudadLineal    13115 non-null  float64
 6   Neighbourhood_Fuencarral      13115 non-null  float64
 7   Neighbourhood_Hortaleza       13115 non-null  float64
 8   Neighbourhood_Latina          13115 non-null  float64
 9   Neighbourhood_Moncloa         13115 non-null  float64
 10  Neighbourhood_Moratalaz       13115 non-null  float64
 11  Neighbourhood_PuenteVallecas  13115 non-null  float64
 12  Neighbourhood_Retiro          13115 non-null  float64
 13  N

In [57]:
# Independent copy of the full dataset that will receive the dummy columns.
df_all_ecoded = df_all.copy(deep=True)

In [58]:
# Column dtypes before the categorical columns are replaced.
df_all_ecoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Neighbourhood Group Cleansed    13115 non-null  object 
 1   Property Type                   13115 non-null  object 
 2   Room Type                       13115 non-null  object 
 3   Cancellation Policy             13115 non-null  object 
 4   Latitude                        13115 non-null  float64
 5   Longitude                       13115 non-null  float64
 6   Accommodates                    13115 non-null  int64  
 7   Bathrooms                       13115 non-null  float64
 8   Bedrooms                        13115 non-null  float64
 9   Beds                            13115 non-null  float64
 10  Guests Included                 13115 non-null  int64  
 11  Extra People                    13115 non-null  int64  
 12  Minimum Nights                  

In [59]:
# Remove the original categorical columns; their one-hot versions replace them.
df_all_ecoded = df_all_ecoded.drop(
    columns=['Neighbourhood Group Cleansed', 'Property Type', 'Room Type', 'Cancellation Policy']
)

In [60]:
# Column dtypes after removing the categorical columns.
df_all_ecoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 34 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Latitude                        13115 non-null  float64
 1   Longitude                       13115 non-null  float64
 2   Accommodates                    13115 non-null  int64  
 3   Bathrooms                       13115 non-null  float64
 4   Bedrooms                        13115 non-null  float64
 5   Beds                            13115 non-null  float64
 6   Guests Included                 13115 non-null  int64  
 7   Extra People                    13115 non-null  int64  
 8   Minimum Nights                  13115 non-null  int64  
 9   Availability 30                 13115 non-null  int64  
 10  Availability 365                13115 non-null  int64  
 11  Number of Reviews               13115 non-null  int64  
 12  Review Scores Rating            

In [61]:
# Attach the one-hot columns to the encoded dataset.
#
# cats_df_trans was built from a bare array, so it carries a fresh 0..n-1
# RangeIndex, while df_all_ecoded keeps the original row labels (with gaps left
# by the rows dropped during cleaning). Assigning a Series aligns on index
# labels, which would shift dummy values onto the wrong rows after the first
# gap and leave NaN for labels beyond n-1. Both frames hold the same rows in
# the same order, so assign by position instead.
assert len(cats_df_trans) == len(df_all_ecoded), "encoded rows must match dataset rows"
for col in cats_df_trans:
    df_all_ecoded[col] = cats_df_trans[col].to_numpy()

In [62]:
# Column dtypes and non-null counts after attaching the dummy columns.
df_all_ecoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 63 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Latitude                        13115 non-null  float64
 1   Longitude                       13115 non-null  float64
 2   Accommodates                    13115 non-null  int64  
 3   Bathrooms                       13115 non-null  float64
 4   Bedrooms                        13115 non-null  float64
 5   Beds                            13115 non-null  float64
 6   Guests Included                 13115 non-null  int64  
 7   Extra People                    13115 non-null  int64  
 8   Minimum Nights                  13115 non-null  int64  
 9   Availability 30                 13115 non-null  int64  
 10  Availability 365                13115 non-null  int64  
 11  Number of Reviews               13115 non-null  int64  
 12  Review Scores Rating            

In [63]:
# Preview the encoded dataset.
df_all_ecoded.head()

Unnamed: 0,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Guests Included,Extra People,Minimum Nights,Availability 30,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Reviews per Month,Security Deposit,Cleaning Fee,Calculated host listings count,Num_Host_Verifications,Name_Len,Summary_Len,Space_Len,Description_Len,Summary_Lang,is_Thumbnail,is_HostAbout,is_ResponseInHours,is_EntireHome,is_Bed,is_UpdatedToday,Price,Neighbourhood_Barajas,Neighbourhood_Carabanchel,Neighbourhood_Centro,Neighbourhood_Chamartin,Neighbourhood_Chamberi,Neighbourhood_CiudadLineal,Neighbourhood_Fuencarral,Neighbourhood_Hortaleza,Neighbourhood_Latina,Neighbourhood_Moncloa,Neighbourhood_Moratalaz,Neighbourhood_PuenteVallecas,Neighbourhood_Retiro,Neighbourhood_Salamanca,Neighbourhood_SanBlas,Neighbourhood_Tetuan,Neighbourhood_Usera,Neighbourhood_Vicalvaro,Neighbourhood_VillaVallecas,Neighbourhood_Villaverde,PropertyType_BedAndBreakfast,PropertyType_Condominium,PropertyType_House,PropertyType_Loft,PropertyType_Other,BedType_PrivateRoom,BedType_SharedRoom,Cancellation_moderate,Cancellation_strict
0,40.4101,-3.71475,2,1,1,1,1,0,2,1,1,80,98,10,10,10,10,2.33,0,0,1,4,34,67,488,1000,True,True,True,False,False,True,False,36,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,40.417,-3.70944,2,1,1,1,1,0,3,12,320,90,95,9,10,10,10,2.69,100,30,2,6,32,241,1000,1000,True,True,True,False,True,True,False,85,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,40.4097,-3.71356,2,1,1,1,1,10,3,7,158,20,91,9,9,9,10,0.6,100,0,4,4,27,232,469,479,True,True,True,False,False,True,False,20,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,40.4232,-3.71125,4,1,1,2,2,5,3,7,248,55,99,10,10,10,10,3.56,250,15,9,6,35,249,1000,1000,False,True,True,True,True,True,True,112,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,40.4165,-3.71784,8,3,3,4,6,40,3,6,296,46,96,10,10,10,10,1.13,200,20,1,4,33,271,939,1000,True,True,True,True,True,True,False,190,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [64]:
# Final check of the encoded dataset before export.
df_all_ecoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13115 entries, 0 to 13206
Data columns (total 63 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Latitude                        13115 non-null  float64
 1   Longitude                       13115 non-null  float64
 2   Accommodates                    13115 non-null  int64  
 3   Bathrooms                       13115 non-null  float64
 4   Bedrooms                        13115 non-null  float64
 5   Beds                            13115 non-null  float64
 6   Guests Included                 13115 non-null  int64  
 7   Extra People                    13115 non-null  int64  
 8   Minimum Nights                  13115 non-null  int64  
 9   Availability 30                 13115 non-null  int64  
 10  Availability 365                13115 non-null  int64  
 11  Number of Reviews               13115 non-null  int64  
 12  Review Scores Rating            

In [65]:
# Persist the one-hot encoded dataset; the row index is not written.
df_all_ecoded.to_csv(path_or_buf="data/airbnb-all-encoded.csv", index=False)