# Import library

In [81]:
# Dataframe manipulation
import numpy as np
import pandas as pd
import math

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# sns.set(style = 'whitegrid',context = 'notebook')

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer, LabelEncoder

# Modelling Helpers:
from sklearn.preprocessing import Imputer, Normalizer, scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, ShuffleSplit, cross_validate
from sklearn import model_selection
from sklearn.model_selection import train_test_split

# Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
# Evaluation metrics for Classification
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score

# Regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV,ElasticNet,LogisticRegression
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
# Evaluation metrics for Regression 
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error

# Configuration: raise pandas' display limits so wide/long frames are shown in full
for option, value in (
    ('display.max_rows', 500),
    ('display.max_columns', None),
    ('display.width', 500),
):
    pd.set_option(option, value)

# Suppress warnings (note: this silences every warning category for the whole session)
import warnings
warnings.filterwarnings("ignore")

print("Setup complete...")

Setup complete...


# Common function

In [61]:
def Check_Missing_Data(df):
    """Summarise the missing values of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        'Total'   - number of null entries in that column,
        'Percent' - null entries divided by the number of rows
                    (a fraction between 0 and 1, not multiplied by 100).
        Both series are sorted in descending order before being joined.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask is simply the number of rows per column
    null_fraction = null_counts / null_mask.count()
    ranked = [series.sort_values(ascending=False) for series in (null_counts, null_fraction)]
    return pd.concat(ranked, axis=1, keys=['Total', 'Percent'])

# Import dataset

In [86]:
# load dataset
# The CSV path is relative to the notebook's working directory.
df = pd.read_csv('London_listings.csv')
# (rows, columns) of the raw listings table
print(df.shape)
# preview the first rows as the cell output
df.head()

(77096, 96)


Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,9554,https://www.airbnb.com/rooms/9554,20181207034825,2018-12-07,"Cozy, 3 minutes to Piccadilly Line",PLEASE CONTACT ME BEFORE BOOKING Homely apartm...,"Hello people, This is a bright, comfortable ro...",PLEASE CONTACT ME BEFORE BOOKING Homely apartm...,none,Details to follow..,,details to follow when i get a chance..,"Kitchen, small dining / smoking room (if you s...",Depends on what else I have on.. It is always ...,"If you smoke, please do so only in the little ...",,,https://a0.muscache.com/im/pictures/77912984/a...,,31655,https://www.airbnb.com/users/show/31655,Guy,2009-08-14,"London, England, United Kingdom",Please contact me before booking! Please see d...,within a few hours,100%,,t,https://a0.muscache.com/im/pictures/9cae5e2c-3...,https://a0.muscache.com/im/pictures/9cae5e2c-3...,LB of Haringey,4.0,4.0,"['email', 'phone', 'manual_online', 'reviews',...",t,f,"London, United Kingdom",LB of Haringey,Haringey,,London,,N8 0EY,London,"London, United Kingdom",GB,United Kingdom,51.587767,-0.105666,f,Apartment,Private room,2,,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Kitchen,""Smoking allowed"",Br...",,$35.00,$198.00,$788.00,,$7.00,1,$15.00,1,730,a week ago,t,18,18,18,291,2018-12-07,133,2012-04-26,2018-08-13,97.0,10.0,9.0,10.0,10.0,9.0,10.0,f,,,f,f,strict_14_with_grace_period,t,f,4,1.65
1,11076,https://www.airbnb.com/rooms/11076,20181207034825,2018-12-07,The Sanctuary,The room has a double bed and a single foldawa...,This Listing is for The Sanctury The accommoda...,The room has a double bed and a single foldawa...,none,"Ealing Broadway, as short walk from our place ...",,extemely good transport links to central londo...,Huge family kitchen and good wifi,We give everyone a great welcome and make sure...,We don't allow alcohol or takeaways,,,https://a0.muscache.com/im/pictures/a0f4c78a-6...,,40471,https://www.airbnb.com/users/show/40471,Rosa,2009-09-22,"London, England, United Kingdom","Hi, I'm Rosa, I'm one of the owners of the At-...",within a day,67%,,f,https://a0.muscache.com/im/users/40471/profile...,https://a0.muscache.com/im/users/40471/profile...,LB of Ealing,6.0,6.0,"['email', 'phone', 'facebook', 'reviews', 'wor...",t,f,"Ealing, Greater London, United Kingdom",LB of Ealing,Ealing,,Ealing,Greater London,W13 8,London,"Ealing, United Kingdom",GB,United Kingdom,51.515645,-0.314508,t,Apartment,Private room,2,,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Breakfast,""Pets l...",,$70.00,,,,,2,$35.00,2,1125,4 months ago,t,0,0,0,0,2018-12-07,3,2016-04-05,2018-10-18,90.0,8.0,9.0,10.0,9.0,9.0,9.0,f,,,t,f,strict_14_with_grace_period,f,f,6,0.09
2,13913,https://www.airbnb.com/rooms/13913,20181207034825,2018-12-07,Holiday London DB Room Let-on going,My bright double bedroom with a large window h...,"Hello Everyone, I'm offering my lovely double ...",My bright double bedroom with a large window h...,business,Finsbury Park is a friendly melting pot commun...,For art lovers I can give guest my Tate Member...,The flat only a 10 minute walk to Finsbury Par...,Guest will have access to the self catering ki...,I like to have little chats with my guest over...,I'm an artist and have my artwork up on the wa...,,,https://a0.muscache.com/im/pictures/985879/b06...,,54730,https://www.airbnb.com/users/show/54730,Alina,2009-11-16,"London, England, United Kingdom",I am a Multi-Media Visual Artist and Creative ...,within a day,67%,,f,https://a0.muscache.com/im/users/54730/profile...,https://a0.muscache.com/im/users/54730/profile...,LB of Islington,4.0,4.0,"['email', 'phone', 'facebook', 'reviews']",t,f,"Islington, Greater London, United Kingdom",LB of Islington,Islington,,Islington,Greater London,N4 3,London,"Islington, United Kingdom",GB,United Kingdom,51.568017,-0.111208,t,Apartment,Private room,2,1.0,1.0,0.0,Real Bed,"{TV,""Cable TV"",Wifi,Kitchen,""Paid parking off ...",538.0,$65.00,$333.00,"$1,176.00",$100.00,$15.00,1,$15.00,1,29,2 weeks ago,t,29,59,89,364,2018-12-07,14,2010-08-18,2018-06-17,95.0,9.0,10.0,9.0,10.0,9.0,9.0,f,,,f,f,moderate,f,f,3,0.14
3,17402,https://www.airbnb.com/rooms/17402,20181207034825,2018-12-07,Superb 3-Bed/2 Bath & Wifi: Trendy W1,"Open from June 2018 after a 3-year break, we a...",Ready again from June 2018 for bookings after ...,"Open from June 2018 after a 3-year break, we a...",none,"Location, location, location! You won't find b...",This property has new flooring throughout. Gue...,You can walk to tourist London or take numerou...,Full use of whole independent apartment,"Always available by email or phone (before, du...",The apartment benefits from new flooring throu...,,,https://a0.muscache.com/im/pictures/5673eb4f-a...,,67564,https://www.airbnb.com/users/show/67564,Liz,2010-01-04,"London, England, United Kingdom",We are Liz and Jack. We manage a number of ho...,within an hour,91%,,t,https://a0.muscache.com/im/users/67564/profile...,https://a0.muscache.com/im/users/67564/profile...,Fitzrovia,15.0,15.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,"London, Fitzrovia, United Kingdom",Fitzrovia,Westminster,,London,Fitzrovia,W1T4BP,London,"London, United Kingdom",GB,United Kingdom,51.520982,-0.140024,t,Apartment,Entire home/apt,6,2.0,3.0,3.0,Real Bed,"{TV,Wifi,Kitchen,""Paid parking off premises"",E...",,$300.00,"$1,378.00",,$350.00,$65.00,4,$10.00,3,365,yesterday,t,26,56,86,360,2018-12-07,39,2011-03-21,2018-10-15,93.0,10.0,9.0,9.0,9.0,10.0,9.0,f,,,f,f,strict_14_with_grace_period,f,f,13,0.42
4,24328,https://www.airbnb.com/rooms/24328,20181207034825,2018-12-07,Battersea 2 bedroom house & parking,"Artist house, high ceiling bedrooms, private p...",- End of terrace two bedroom house close to So...,"Artist house, high ceiling bedrooms, private p...",family,"- Battersea is a quiet family area, easy acces...",- Please have a profile or tell us more about ...,"- 5 mins walk to Battersea Park, 15 mins walk ...",- there is a communal garden in our complex - ...,We rent out our house only when we are away. T...,A house manual will be emailed once a booking ...,,,https://a0.muscache.com/im/pictures/428381/d92...,,41759,https://www.airbnb.com/users/show/41759,Joe,2009-09-28,"Florence, Tuscany, Italy","I've been using Airbnb for a while now, both a...",,,,f,https://a0.muscache.com/im/users/41759/profile...,https://a0.muscache.com/im/users/41759/profile...,Battersea,2.0,2.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,t,"London, United Kingdom",Battersea,Wandsworth,,London,,SW11 5GX,London,"London, United Kingdom",GB,United Kingdom,51.472981,-0.163764,t,Townhouse,Entire home/apt,4,1.5,2.0,2.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,""Free par...",1001.0,$150.00,"$1,050.00","$3,500.00",$250.00,$70.00,2,$15.00,90,90,9 months ago,t,28,58,88,363,2018-12-07,92,2010-11-15,2016-09-07,98.0,10.0,10.0,10.0,10.0,9.0,9.0,f,,,f,f,strict_14_with_grace_period,t,t,1,0.94


### remove redundant features

In [87]:
# Columns dropped up front: URLs, scrape metadata, host identifiers, review dates
# and licence/booking-requirement flags.
remove_list = ['listing_url', 'scrape_id', 'last_scraped', 'name',
              'interaction', 'house_rules', 'thumbnail_url', 'medium_url',
              'picture_url', 'xl_picture_url', 'host_id', 'host_url',
              'host_thumbnail_url', 'host_picture_url',
              'host_has_profile_pic',
              'calendar_last_scraped',
              'first_review', 'last_review',
              'requires_license', 'license', 'jurisdiction_names',
              'is_business_travel_ready',
              'require_guest_profile_picture',
              'require_guest_phone_verification']
print(len(remove_list))        # 24
# Keep every remaining column, preserving the original column order
# (96 raw columns - 24 removed = 72).
selected_features = [col for col in df.columns if col not in remove_list]
df = df.loc[:, selected_features]
df.shape

24


(77096, 72)

# Fixing incorrect data

The incorrect data was found during the exploration stage.

### Converting format of price

In [88]:
# Currency columns are read as strings such as "$1,378.00". Strip every character
# that is not a digit, sign or decimal point, then cast to float.
# - regex=True is passed explicitly: the pattern is a regular expression, and
#   pandas >= 2.0 treats the pattern as a literal string by default.
# - Columns that are already numeric are skipped so the cell can be re-run safely
#   (the .str accessor is not available on numeric columns).
# NOTE(review): 'security_deposit' has the same "$" format but is not converted
# here, so it stays a string column downstream - confirm whether that is intended.
price_columns = ['price', 'monthly_price', 'weekly_price', 'extra_people', 'cleaning_fee']
for col in price_columns:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].str.replace(r'[^-+\d.]', '', regex=True).astype(float)

### remove rows with target variable as 0

In [89]:
# Shape of the zero-price rows versus the strictly positive ones
print(df.loc[df['price'] == 0].shape)
print(df.loc[df['price'] > 0].shape)

(33, 72)
(77063, 72)


In [90]:
# keep only listings with a strictly positive price
# (the comparison is False for missing prices, so those rows are dropped too)
df = df[df['price'] > 0]
df.shape

(77063, 72)

### Converting numeric value from object type to numeric type

In [91]:
# list every object-dtype (string) column; these are treated as categorical variables
categorical_list = df.select_dtypes(include=['object']).columns.tolist()

#### Investigate frequency of labels in each variable

In [92]:
def analyze_frequency_labels(df,var):
    """Return the share of rows that fall under each label of ``var``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'id' column (used for counting) and the column ``var``.
        The frame is only read, so no defensive copy is made.
    var : str
        Name of the column whose labels are analysed.

    Returns
    -------
    pandas.Series
        Indexed by label; each value is the count of non-null 'id' entries for
        that label divided by the total number of rows in ``df``.
    """
    return df.groupby(var)['id'].count() / len(df)

# Print the label frequencies of each categorical variable, separated by a blank line
for var in categorical_list:
    print(analyze_frequency_labels(df, var), end='\n\n')

summary
          Curious to experience what living in central London means? This is your ideal choice! Comfortable and big bedroom located in a 4 floors house where people with different nationalities and cultures magically mix and live together. 5 min/tube                                                                                                                                                                                                                                                            0.000013
        Modern, newly renovated 2 bedroom flat with West facing balcony & free secure off-street parking just by Newington Green.  With great transport links to trendy Shoreditch, King's Cross/ St Pancras and just about anywhere in Central London.                                                                                                                                                                                                                                          

description
          Curious to experience what living in central London means? This is your ideal choice! Comfortable and big bedroom located in a 4 floors house where people with different nationalities and cultures magically mix and live together. 5 min/tube    The house is very welcoming and not noisy. It is ideal either to meet new people or to relax.  The house is fully furnished and equipped, guests can use the kitchen, the washing machine, the garden, the living room at their discretion. I am a traveller as well and I lived in different countries and I know how important is to feel at home. I always make sure to establish a trustful relationship with my guest and to give him right tips and directions. Bike Maps and Map provided to fully experience the vibrant city of London. I am brazilian and I can speak Brasilian-Portugues, Spanish, English, German. My partner speaks Italian and French as well.  The house is located in a residential and quiet area in North London, zone 2. Th

notes
  After hosting so many different guest, I have found my house an ideal space for families as well as groups of friends. You can truely feel at home- whilst Central London is only 30 minutes by tube away.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

access
   FREE UNDERGROUND PARKING    INTERNET WITH WI-FI     PLAYSTATION 3 WITH GAMES    LINENS AND TOWELS     SHAMPOO, SHOWER GEL, SOAP    KITCHEN ESSENTIALS                                                                                                                                                                                                             0.000013
 Bathroom, Private access, kitchen facilities, inc Dishwasher, Fridge, Laundry Facilities, Modern Bathroom. Patio and Private Garden.                                                                                                                                                                                                                                0.000013
 For anyone less familiar with this area, Hoxton/Shoreditch it is on the doorstep of some of London's best galleries, great bars  and restaurants, good night live and weekend markets such as Columbia Road market, Hoxton market, Broadway market. There is also a 

host_verifications
None                                                                                                                          0.000091
['email', 'facebook', 'google', 'reviews', 'jumio', 'offline_government_id', 'government_id']                                 0.000013
['email', 'facebook', 'google', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']    0.000013
['email', 'facebook', 'google', 'reviews', 'manual_offline', 'jumio', 'government_id']                                        0.000311
['email', 'facebook', 'jumio', 'government_id']                                                                               0.000013
                                                                                                                                ...   
['reviews', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']                                            0.000026
['reviews', 'selfie']               

city
                               0.000013
 Imperial Wharf                0.000013
 Knightsbridge, London         0.000013
 London                        0.000208
 London Borough of Hackney     0.000013
                                 ...   
布伦特福德                          0.000013
赖斯利普                           0.000013
阿克斯布里奇                         0.000013
뉴 몰든                           0.000013
런던                             0.000052
Name: id, Length: 674, dtype: float64

state
                                                               0.000156
                                                               0.000013
-                                                              0.000324
.                                                              0.000714
.Poplar                                                        0.000078
?                                                              0.000013
Abbey Wood                                                     0.000013
Acton 

smart_location
 Imperial Wharf, United Kingdom                0.000013
 Knightsbridge, London, United Kingdom         0.000013
 London Borough of Hackney , United Kingdom    0.000013
 London, United Kingdom                        0.000208
 Morden, United Kingdom                        0.000013
                                                 ...   
布伦特福德, United Kingdom                          0.000013
赖斯利普, United Kingdom                           0.000013
阿克斯布里奇, United Kingdom                         0.000013
뉴 몰든, United Kingdom                           0.000013
런던, United Kingdom                             0.000052
Name: id, Length: 675, dtype: float64

country_code
ES    0.000013
GB    0.999987
Name: id, dtype: float64

country
Spain             0.000013
United Kingdom    0.999987
Name: id, dtype: float64

is_location_exact
f    0.375783
t    0.624217
Name: id, dtype: float64

property_type
Aparthotel                       0.000649
Apartment                        0.676356
Bar

cancellation_policy
flexible                       0.327381
moderate                       0.239427
strict                         0.000078
strict_14_with_grace_period    0.421966
super_strict_30                0.006242
super_strict_60                0.004905
Name: id, dtype: float64



----------------------------------------------------------------------------------------------------------------------

# 1. Separate dataset into train and test

Holding out a test set helps to detect over-fitting. The split is random, so the seed must be set to make it reproducible.

In [93]:
# Hold out 10% of the listings as a test set.
# The full frame (price column included) is passed as X; price is also returned as y.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['price'],
    test_size=0.1,
    random_state=0,  # fixed seed so the split is reproducible
)
X_train.shape, X_test.shape

((69356, 72), (7707, 72))

In [94]:
X_train.head()

Unnamed: 0,id,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,number_of_reviews,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,cancellation_policy,calculated_host_listings_count,reviews_per_month
60730,26581320,"Based in high quality student accommodation, o...",What To Expect: * Great 13 sqm room with moder...,"Based in high quality student accommodation, o...",none,Everybody is after a slice of Tottenham these ...,• Do you have free WiFi? Yes we have super fas...,Tottenham Hale Station provides both National ...,You will enjoy your own private room with an e...,Natasha,2018-07-03,GB,Hey Airbnb'ers!\r\n\r\nI'm Natasha and I work ...,,,,t,,35.0,35.0,['email'],f,"Greater London, England, United Kingdom",LB of Haringey,Haringey,,Greater London,England,N17 9NE,London,"Greater London, United Kingdom",GB,United Kingdom,51.588778,-0.057334,f,Serviced apartment,Entire home/apt,2,1.0,1.0,1.0,Real Bed,"{Wifi,Kitchen,Elevator,Heating,Washer,Dryer,""S...",,55.0,,,$76.00,35.0,1,0.0,1,1125,5 weeks ago,t,0,0,0,0,2,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,flexible,35,0.45
67037,28323093,Located in E2 the house is 5-7 min walk to Bet...,"The house comprises of 5 bedrooms, large fully...",Located in E2 the house is 5-7 min walk to Bet...,none,A must have property with very superb location...,,,,IRoom-UK,2013-08-28,"London, United Kingdom",Who we are?\r\n\r\nWe are young company who br...,,,,f,LB of Newham,6.0,6.0,"['email', 'phone', 'reviews', 'jumio', 'govern...",t,"Greater London, England, United Kingdom",Globe Town,Tower Hamlets,,Greater London,England,E2 9PW,London,"Greater London, United Kingdom",GB,United Kingdom,51.530209,-0.04947,f,House,Private room,1,1.5,1.0,1.0,Real Bed,"{Wifi,Kitchen,""Smoke detector"",""Fire extinguis...",,30.0,,,$85.00,20.0,1,5.0,3,1125,1 week ago,t,0,14,14,14,0,,,,,,,,f,strict_14_with_grace_period,3,
76987,30575629,My place is a very homely two bedroom apartment.,,My place is a very homely two bedroom apartment.,none,,,,,Krasimir,2017-05-07,GB,,within a few hours,100%,,f,Covent Garden,10.0,10.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",f,"Greater London, England, United Kingdom",Fitzrovia,Westminster,,Greater London,England,W1W,London,"Greater London, United Kingdom",GB,United Kingdom,51.517702,-0.138022,f,Apartment,Entire home/apt,4,2.0,2.0,2.0,Real Bed,"{TV,""Cable TV"",Wifi,""Air conditioning"",Kitchen...",,1000.0,,,$500.00,150.0,2,15.0,2,1125,today,t,21,51,81,126,0,,,,,,,,t,strict_14_with_grace_period,10,
40630,20313438,Modern and newly renovated apartment. Gives yo...,,Modern and newly renovated apartment. Gives yo...,none,,,,,Jamie,2017-08-08,"England, United Kingdom",,within a day,100%,,f,Fulham,1.0,1.0,"['email', 'phone']",f,"Greater London, England, United Kingdom",,Hammersmith and Fulham,,Greater London,England,SW6 4AA,London,"Greater London, United Kingdom",GB,United Kingdom,51.478065,-0.19161,t,Apartment,Entire home/apt,4,1.0,2.0,2.0,Real Bed,"{TV,Kitchen,Breakfast,""Indoor fireplace"",Heati...",,101.0,,,$200.00,30.0,1,0.0,2,1125,4 days ago,t,0,4,34,124,0,,,,,,,,f,moderate,1,
60204,26395171,This studio flat is brilliant for visiting the...,,This studio flat is brilliant for visiting the...,none,,,,The guest has access to the entire apartment,Rory,2014-11-18,"Everdon, England, United Kingdom",Keen traveller and welcoming host,within an hour,100%,,f,,1.0,1.0,"['email', 'phone', 'reviews', 'jumio', 'offlin...",t,"London, England, United Kingdom",Pimlico,Westminster,,London,England,SW1V 2QT,London,"London, United Kingdom",GB,United Kingdom,51.489238,-0.132231,f,Apartment,Entire home/apt,2,1.0,0.0,1.0,Real Bed,"{TV,Wifi,Kitchen,Elevator,""Buzzer/wireless int...",,110.0,,,,40.0,2,50.0,2,1125,4 weeks ago,t,13,13,13,13,14,94.0,10.0,9.0,9.0,10.0,10.0,9.0,t,moderate,1,2.71


----------------------------------------------------------------------------------------------------------------------

# 2. Handling missing values

### 2.1. Categorical variables

For categorical variables, fill missing information by adding an additional category: "missing"

In [95]:
# collect every object-dtype (categorical) column, then summarise its missing values
categorical_list = df.select_dtypes(include=['object']).columns.tolist()
missing_df = Check_Missing_Data(df[categorical_list])
missing_df.head()

Unnamed: 0,Total,Percent
notes,46730,0.606387
access,32724,0.42464
host_about,31948,0.41457
neighborhood_overview,30180,0.391628
transit,30072,0.390226


In [96]:
# keep only the categorical variables with less than 30% of their values missing
missing_df = missing_df[missing_df['Percent'] < 0.3]
categorical_list = missing_df.index.values
len(categorical_list)

29

In [97]:
# helper that labels missing categorical values explicitly
def fill_categorical_na(df, var_list):
    """Return a copy of ``df`` in which nulls in ``var_list`` become 'Missing'.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left untouched.
    var_list : list-like of str
        Names of the columns to fill. Columns not listed keep their nulls.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns filled.
    """
    replacements = df[var_list].fillna('Missing')
    filled = df.copy()
    filled[var_list] = replacements
    return filled
                                      
# label the missing categorical values of the training split as "Missing"
X_train = fill_categorical_na(X_train, categorical_list)
# sanity check: list the filled variables that still contain nulls (expected: none)
[var for var in missing_df.index.values if X_train[var].isnull().any()]

[]

In [98]:
# label the missing categorical values of the test split as "Missing"
X_test = fill_categorical_na(X_test, missing_df.index.values)
# sanity check: list the filled variables that still contain nulls (expected: none)
[var for var in missing_df.index.values if X_test[var].isnull().any()]

[]

### 2.2. Numerical variables

For numerical variables, replace the missing information in the original variable by the mode, or most frequent value. (An additional indicator variable capturing which values were missing would preserve that information, but it is not created in the cells below.)

In [99]:
# the identifier and the target are not candidate features
removed_list = ['id', 'price']
# every remaining non-object column is treated as a numerical variable
numerical_list = [
    col
    for col in df.select_dtypes(exclude=['object']).columns
    if col not in removed_list
]

# summarise the missing values of the numerical variables
missing_df = Check_Missing_Data(df[numerical_list])
missing_df.shape

(32, 2)

In [100]:
# keep only the numerical variables with less than 30% of their values missing
missing_df = missing_df[missing_df['Percent'] < 0.3]
numerical_list = missing_df.index.values
len(numerical_list)

27

In [101]:
# function to replace NA in numerical variables
def fill_numerical_na(df, var_list):
    """Return a copy of ``df`` with nulls in ``var_list`` replaced by each column's mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left untouched.
    var_list : iterable of str
        Names of the columns to impute.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every listed column has its nulls replaced by the
        most frequent value of that same column. When several values tie, the
        first one returned by ``Series.mode`` is used. A column that contains
        only nulls has no mode and is left unchanged.

    Notes
    -----
    The mode is computed from the frame passed in, so calling this separately on
    the train and test splits imputes each split with its own statistic.
    """
    X = df.copy()
    for var in var_list:
        modes = X[var].mode()
        if modes.empty:
            # all values are null: there is nothing to impute with
            continue
        # Assign the result back instead of calling fillna(inplace=True) on the
        # X[var] selection: that chained in-place write is not guaranteed to reach
        # X and is a no-op under pandas Copy-on-Write.
        X[var] = X[var].fillna(modes.iloc[0])
    return X

In [102]:
# replace the missing numerical values of the training split with each column's mode
X_train = fill_numerical_na(X_train, numerical_list)
# sanity check: list the imputed variables that still contain nulls (expected: none)
[var for var in missing_df.index.values if X_train[var].isnull().any()]

[]

In [103]:
# replace the missing numerical values of the test split with each column's mode
X_test = fill_numerical_na(X_test, missing_df.index.values)
# sanity check: list the imputed variables that still contain nulls (expected: none)
[var for var in missing_df.index.values if X_test[var].isnull().any()]


[]

----------------------------------------------------------------------------------------------------------------------

# Data Transformation 

## 4. Numerical variables

### 4.1. Log Transform Non-Gaussian distributed variables

We will log transform the numerical variables that do not contain zeros in order to get a more Gaussian-like distribution. This tends to help Linear machine learning models. 

In [104]:
# Add the natural log of price to both splits as 'log_price'
# (rows with price <= 0 were removed earlier, so the log is defined).
for frame in (X_train, X_test):
    frame['log_price'] = np.log(frame['price'])

## 5. Categorical variables

In [105]:
# The list is left empty here, so the loop below and the one-hot encoding that
# iterates over categorical_list process no variables.
categorical_list = []


def analyse_frequency_labels(df, var):
    """Return the share of rows that fall under each label of ``var``.

    Group sizes are used for counting, so the function does not depend on any
    other column being present (this dataset has no 'SalePrice' column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``var``; it is only read.
    var : str
        Name of the column whose labels are analysed.

    Returns
    -------
    pandas.Series
        Indexed by label; each value is the number of rows with that label
        divided by the total number of rows in ``df``.
    """
    return df.groupby(var).size() / len(df)


for var in categorical_list:
    print(analyse_frequency_labels(X_train, var))
    print()

In [106]:
# One-hot encode the variables in categorical_list.
# Train and test are stacked first so both splits get the same dummy columns.
cat_df = pd.DataFrame()
full_ds = pd.concat(objs=[X_train[categorical_list], X_test[categorical_list]], axis=0)
for var in categorical_list:
    # drop_first=True omits the first level of each variable
    var_dummies = pd.get_dummies(full_ds[var], prefix=var, prefix_sep='_',drop_first=True)
    cat_df = pd.concat([cat_df, var_dummies], axis=1)
    
# Split the stacked dummies back by position: the first l rows came from X_train,
# the rest from X_test. Indices are reset to positional 0..n-1.
l = X_train.shape[0]
train_cat_df = cat_df[:l].reset_index(drop=True)
test_cat_df = cat_df[l:].reset_index(drop=True)

### selected features

In [107]:
# The features used from here on are the numerical variables that passed the
# missing-value filter (numerical_list); this is an alias, not a copy.
selected_features = numerical_list
selected_features

array(['cleaning_fee', 'review_scores_value', 'review_scores_location',
       'review_scores_checkin', 'review_scores_accuracy',
       'review_scores_communication', 'review_scores_cleanliness',
       'review_scores_rating', 'reviews_per_month', 'bathrooms', 'beds',
       'bedrooms', 'host_listings_count', 'host_total_listings_count',
       'number_of_reviews', 'latitude', 'longitude', 'accommodates',
       'maximum_nights', 'minimum_nights', 'availability_365',
       'availability_90', 'availability_60', 'availability_30',
       'guests_included', 'calculated_host_listings_count',
       'extra_people'], dtype=object)

In [108]:
[var for var in selected_features if X_train[var].isnull().sum()>0]

[]

In [109]:
[var for var in selected_features if X_test[var].isnull().sum()>0]

[]

# Feature Scaling

In [110]:
# fit the scaler on the training split only, so the test split is scaled with the train min/max
scaler = MinMaxScaler().fit(X_train[selected_features])

# scale both splits; the resulting frames carry a fresh positional index (0..n-1)
train_num_df, test_num_df = (
    pd.DataFrame(scaler.transform(part[selected_features]), columns=selected_features)
    for part in (X_train, X_test)
)

In [111]:
train_num_df.head()

Unnamed: 0,cleaning_fee,review_scores_value,review_scores_location,review_scores_checkin,review_scores_accuracy,review_scores_communication,review_scores_cleanliness,review_scores_rating,reviews_per_month,bathrooms,beds,bedrooms,host_listings_count,host_total_listings_count,number_of_reviews,latitude,longitude,accommodates,maximum_nights,minimum_nights,availability_365,availability_90,availability_60,availability_30,guests_included,calculated_host_listings_count,extra_people
0,0.057003,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.026586,0.071429,0.043478,0.022727,0.023585,0.023585,0.00346,0.759846,0.564347,0.052632,5.234033e-07,0.0,0.0,0.0,0.0,0.0,0.0,0.031336,0.0
1,0.032573,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.059819,0.107143,0.043478,0.022727,0.004043,0.004043,0.0,0.608466,0.574343,0.0,5.234033e-07,0.001779,0.038356,0.155556,0.233333,0.0,0.0,0.001843,0.021277
2,0.2443,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.059819,0.142857,0.086957,0.045455,0.006739,0.006739,0.0,0.57614,0.461781,0.157895,5.234033e-07,0.00089,0.345205,0.9,0.85,0.7,0.043478,0.008295,0.06383
3,0.04886,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.059819,0.071429,0.086957,0.045455,0.000674,0.000674,0.0,0.473692,0.393664,0.157895,5.234033e-07,0.00089,0.339726,0.377778,0.066667,0.0,0.0,0.0,0.0
4,0.065147,0.875,1.0,0.875,1.0,1.0,0.875,0.925,0.163142,0.071429,0.043478,0.0,0.000674,0.000674,0.024221,0.50257,0.469141,0.052632,5.234033e-07,0.00089,0.035616,0.144444,0.216667,0.433333,0.043478,0.0,0.212766


----------------------------------------------------------------------------------------------------------------------

## Save preprocessed dataset

In [119]:
# Assemble each split column-wise as: id | scaled numerical features | dummies | log_price.
# The pieces taken from X_train / X_test get their index reset so that they line up
# positionally with the scaled and dummy frames.
train_ds = pd.concat(
    [
        X_train[['id']].reset_index(drop=True),
        train_num_df,
        train_cat_df,
        X_train[['log_price']].reset_index(drop=True),
    ],
    axis=1,
)
train_ds.to_csv('preprocessed_train_01.csv', index=False)

test_ds = pd.concat(
    [
        X_test[['id']].reset_index(drop=True),
        test_num_df,
        test_cat_df,
        X_test[['log_price']].reset_index(drop=True),
    ],
    axis=1,
)
test_ds.to_csv('preprocessed_test_01.csv', index=False)

In [120]:
# now we save the selected list of features
# Note: this writes every column name of train_ds, which was assembled with 'id'
# and 'log_price' alongside the feature columns.
pd.Series(train_ds.columns).to_csv('selected_features.csv', index=False)

In [121]:
pd.Series(train_num_df.columns)

0                       cleaning_fee
1                review_scores_value
2             review_scores_location
3              review_scores_checkin
4             review_scores_accuracy
5        review_scores_communication
6          review_scores_cleanliness
7               review_scores_rating
8                  reviews_per_month
9                          bathrooms
10                              beds
11                          bedrooms
12               host_listings_count
13         host_total_listings_count
14                 number_of_reviews
15                          latitude
16                         longitude
17                      accommodates
18                    maximum_nights
19                    minimum_nights
20                  availability_365
21                   availability_90
22                   availability_60
23                   availability_30
24                   guests_included
25    calculated_host_listings_count
26                      extra_people
d