In [None]:
# Standard Library
import json

# For Analysis
import numpy as np
import pandas as pd

# For Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# For Calculations
from math import floor

#For Feature Selection
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectFromModel
#from boruta import BorutaPy

#For Modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import LinearSVC, SVR

from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.naive_bayes import MultinomialNB as mnb
from sklearn.naive_bayes import BernoulliNB as bnb

# For Validation
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# For Storing Models
import pickle
%matplotlib inline

In [None]:
# Load the raw listings table; the path is relative to the notebook's working directory.
listings = pd.read_csv('listings.csv')

In [None]:
# Preview the first rows to sanity-check the load.
listings.head()

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,44077,https://www.airbnb.com/rooms/44077,20220911231053,2022-09-12,city scrape,cosy comfortable Irish cottage twin,Our house was built in 1937 when there was ple...,I like our neighbourhood as there is no shorta...,https://a0.muscache.com/pictures/525706/050a3a...,193005,...,4.93,4.66,4.82,,f,2,0,2,0,1.85
1,85156,https://www.airbnb.com/rooms/85156,20220911231053,2022-09-12,city scrape,Cosy Comfortable Irish Cottage 1 Double Bed,Our Cottage is a charming light filled cottage...,"I love Dundrum and its surrounding areas ,<br ...",https://a0.muscache.com/pictures/1749253/9ed2a...,193005,...,4.88,4.64,4.78,,f,2,0,2,0,1.53
2,159889,https://www.airbnb.com/rooms/159889,20220911231053,2022-09-12,city scrape,Friendly Single Room,Washing can be done at a cost of €5 per load....,Plenty of buses into the city and the area is ...,https://a0.muscache.com/pictures/3031697/a8259...,766611,...,4.9,4.63,4.74,,f,3,0,3,0,2.78
3,162809,https://www.airbnb.com/rooms/162809,20220911231053,2022-09-11,city scrape,5.5 miles south of Dublin City :^),A nice place to relax after the bustle of the ...,"Close to the sea, hill walks and the city - vi...",https://a0.muscache.com/pictures/86694529/c07b...,777681,...,4.97,4.77,4.85,,f,2,0,2,0,3.68
4,165828,https://www.airbnb.com/rooms/165828,20220911231053,2022-09-11,city scrape,Pádraig Pearse apt. Kilmainham,"Don't just visit Dublin, experience Dublin in ...","Enjoy a walk along the grand canal, or just ob...",https://a0.muscache.com/pictures/34311419/b3f6...,790601,...,4.75,4.34,4.55,,t,5,5,0,0,0.48


In [None]:
# Column dtypes and non-null counts, used to spot sparsely populated columns.
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7566 entries, 0 to 7565
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            7566 non-null   int64  
 1   listing_url                                   7566 non-null   object 
 2   scrape_id                                     7566 non-null   int64  
 3   last_scraped                                  7566 non-null   object 
 4   source                                        7566 non-null   object 
 5   name                                          7566 non-null   object 
 6   description                                   7411 non-null   object 
 7   neighborhood_overview                         4194 non-null   object 
 8   picture_url                                   7566 non-null   object 
 9   host_id                                       7566 non-null   i

In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
listings.describe()

Unnamed: 0,id,scrape_id,host_id,host_listings_count,host_total_listings_count,neighbourhood_group_cleansed,latitude,longitude,accommodates,bathrooms,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,license,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
count,7566.0,7566.0,7566.0,7566.0,7566.0,0.0,7566.0,7566.0,7566.0,0.0,...,6081.0,6085.0,6081.0,6079.0,0.0,7566.0,7566.0,7566.0,7566.0,6209.0
mean,1.265543e+17,20220910000000.0,116534600.0,24.575601,39.837959,,53.34553,-6.25483,3.00608,,...,4.828653,4.843747,4.732065,4.614547,,4.160983,2.362543,1.576394,0.183584,1.319006
std,2.581749e+17,0.0,125777400.0,202.623363,340.458134,,0.049009,0.063257,1.888488,,...,0.363138,0.341995,0.352586,0.441799,,9.654986,7.650292,6.266196,1.50733,1.988691
min,44077.0,20220910000000.0,43984.0,1.0,1.0,,53.18173,-6.52602,1.0,,...,0.0,0.0,1.0,0.0,,1.0,0.0,0.0,0.0,0.01
25%,17420720.0,20220910000000.0,25005390.0,1.0,1.0,,53.32916,-6.278058,2.0,,...,4.81,4.82,4.63,4.5,,1.0,0.0,0.0,0.0,0.13
50%,27980280.0,20220910000000.0,67735350.0,1.0,2.0,,53.34421,-6.25963,2.0,,...,4.95,4.97,4.82,4.71,,1.0,1.0,1.0,0.0,0.61
75%,50342040.0,20220910000000.0,162498800.0,3.0,4.0,,53.357332,-6.233332,4.0,,...,5.0,5.0,5.0,4.89,,3.0,1.0,1.0,0.0,1.76
max,7.130342e+17,20220910000000.0,478789400.0,2157.0,12017.0,,53.635,-6.05291,16.0,,...,5.0,5.0,5.0,5.0,,66.0,55.0,66.0,20.0,44.72


In [None]:
# Count, cardinality and most frequent value for the object (categorical / free-text) columns.
listings.describe(include = ["O"])

Unnamed: 0,listing_url,last_scraped,source,name,description,neighborhood_overview,picture_url,host_url,host_name,host_since,...,property_type,room_type,bathrooms_text,amenities,price,has_availability,calendar_last_scraped,first_review,last_review,instant_bookable
count,7566,7566,7566,7566,7411,4194,7566,7566,7566,7566,...,7566,7566,7562,7566,7566,7566,7566,6209,6209,7566
unique,7566,2,2,7368,7023,3577,7386,5436,2337,2506,...,63,4,31,7062,547,2,2,2086,1458,2
top,https://www.airbnb.com/rooms/44077,2022-09-12,previous scrape,Student only - Pleasant room Close to The Poin...,Well situated and comfortable en-suite room ju...,"In this location, the chances are you won’t se...",https://a0.muscache.com/pictures/cd685d4d-4ff2...,https://www.airbnb.com/users/show/129230780,Paul,2017-05-07,...,Entire rental unit,Entire home/apt,1 bath,"[""Essentials"", ""Hot water kettle"", ""Refrigerat...",$100.00,t,2022-09-12,2017-03-19,2022-08-28,f
freq,1,7033,4174,10,54,54,9,66,136,70,...,1518,3786,2581,34,235,7537,7033,38,196,4778


In [None]:
# List every column name available in the raw table.
listings.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [None]:
def pp_fix_dtypes(df):
    """Return a new frame with corrected column types.

    ``id`` is cast to ``str`` and ``first_review``, ``last_review`` and
    ``host_since`` are parsed with ``pd.to_datetime``. The input frame is
    not modified; all other columns are carried over unchanged.
    """
    date_columns = ('first_review', 'last_review', 'host_since')
    converted = {name: pd.to_datetime(df[name]) for name in date_columns}
    converted['id'] = df['id'].astype(str)
    # assign() overwrites the existing columns in place, so column order is preserved.
    return df.assign(**converted)
    

In [None]:
# Rebind `listings` to the type-corrected frame (string ids, parsed date columns).
listings = pp_fix_dtypes(listings)

### Support Functions

In [None]:
def get_max_date(df):
    """Return the latest timestamp across the three date columns of ``df``.

    The columns read are ``host_since``, ``first_review`` and ``last_review``.
    Missing values (NaT) are skipped; NaT is returned only when every value
    in all three columns is missing.

    The builtin ``max`` is deliberately not used: every comparison against
    NaT is False, so its result depends on where the missing values sit and
    can be NaT or an earlier date than the true maximum.
    """
    date_columns = ['host_since', 'first_review', 'last_review']
    # Series.max() ignores NaT, both per column and across the three per-column maxima.
    latest_per_column = pd.Series([df[name].max() for name in date_columns])
    most_recent_date = latest_per_column.max()
    return most_recent_date

In [None]:
# Module-level reference date; preprocess() declares it global and overwrites it when test_data is False.
most_recent_date = get_max_date(listings)

### Functions for handling Missing Values

In [None]:
# Summarise the three date columns (counts reveal how many review dates are missing).
listings[['host_since','first_review','last_review']].describe()

  listings[['host_since','first_review','last_review']].describe()


Unnamed: 0,host_since,first_review,last_review
count,7566,6209,6209
unique,2506,2086,1458
top,2017-05-07 00:00:00,2017-03-19 00:00:00,2022-08-28 00:00:00
freq,70,38,196
first,2009-10-07 00:00:00,2011-03-20 00:00:00,2014-05-26 00:00:00
last,2022-09-09 00:00:00,2022-09-11 00:00:00,2022-09-11 00:00:00


In [None]:
def handle_missing(df):
    """Return a copy of ``df`` with missing values imputed or dropped.

    * ``host_since`` gaps are filled from ``first_review``.
    * ``bedrooms``, ``bathrooms`` and ``beds`` gaps are filled with 0.
    * Rows without a ``review_scores_rating`` are removed.

    The input frame is left untouched. Surviving rows keep their original
    index labels, so the returned index may contain gaps.
    """
    temp_df = df.copy()
    temp_df['host_since'] = temp_df['host_since'].fillna(temp_df['first_review'])

    # Assign the filled columns back explicitly: `temp_df.col.fillna(..., inplace=True)`
    # operates on a temporary Series and is not guaranteed to update temp_df
    # under pandas Copy-on-Write.
    zero_fill_columns = ['bedrooms', 'bathrooms', 'beds']
    temp_df[zero_fill_columns] = temp_df[zero_fill_columns].fillna(0)

    temp_df = temp_df.dropna(subset = ['review_scores_rating'])
    return temp_df

In [None]:
# Rebind `listings` to the imputed frame; rows without a review_scores_rating are gone after this cell.
listings = handle_missing(listings)

In [None]:
# One-shot switch: preprocess() recodes categorical values only while this is True
# and sets it to False afterwards.
recode_features = True
def preprocess(df, test_data = False):
    """Recode categorical values and derive date / ratio features on a copy of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings frame. The code reads ``host_since`` through the ``.dt``
        accessor, ``host_response_rate`` through the ``.str`` accessor, and
        ``bedrooms``, ``bathrooms``, ``accommodates`` and ``property_type``.
    test_data : bool, default False
        When False, the module-level ``most_recent_date`` is recomputed from
        ``df`` with ``get_max_date``. When True, the existing module-level
        value is reused.

    Returns
    -------
    pandas.DataFrame
        A new frame with the added columns ``year_of_listing``,
        ``month_of_listing``, ``days_since_hosted``, ``bedroom_share`` and
        ``bathroom_share``, and ``host_response_rate`` converted to a
        fraction. ``df`` itself is not modified.

    Side effects
    ------------
    Updates the globals ``recode_features`` and (unless ``test_data``)
    ``most_recent_date``, and prints the reference date and a completion
    message.
    """

    global recode_features, most_recent_date

    temp_df = df.copy()
    # NOTE(review): because the flag is cleared after the first call, any later
    # call (e.g. on a held-out frame) skips the recoding below — confirm this
    # is intended.
    if recode_features == True:
        temp_df = temp_df.replace(
            {
            'host_has_profile_pic': {'t': True, 'f': False},
            'host_identity_verified': {'t': True, 'f': False},
            'instant_bookable': {'t': True, 'f': False},
            'cancellation_policy': {'super_strict_30':'strict','super_strict_60':'strict'}
            }
        )

        def recode_prop(value):
            # Collapse every property type outside this short list into one bucket.
            if value not in ['House', 'Apartment','Condominium','Townhouse','Loft']:
                return 'other_prop_type'
            return value

        temp_df['property_type'] = temp_df['property_type'].apply(recode_prop)

        recode_features = False
    if test_data == False:
        most_recent_date = get_max_date(temp_df)
    print("Reference Date Used: ", most_recent_date)

    ## Derive the new columns
    # The previous `price = listings['price']` assignment was removed: it read the
    # module-level `listings` frame rather than the `df` argument, so a call on any
    # other frame would have had its prices overwritten (index-aligned) with those
    # of `listings`. The price column of `df` is now passed through unchanged.
    temp_df = temp_df.assign(
        year_of_listing = temp_df.host_since.dt.year,
        month_of_listing = temp_df.host_since.dt.month,
        # Age of the host account in (fractional) days relative to the reference date.
        days_since_hosted = (most_recent_date - temp_df.host_since)/np.timedelta64(1, 'D'),
        # Strip '%' characters from both ends and rescale to a fraction.
        host_response_rate = temp_df.host_response_rate.str.strip('%').astype(float)/100

    )
    temp_df = temp_df.assign(
        bedroom_share = temp_df.bedrooms/temp_df.accommodates,
        bathroom_share = temp_df.bathrooms/temp_df.accommodates
    )

    df = temp_df
    print("Preprocessing Completed")

    return df

In [None]:
# Run preprocessing on the training listings (test_data defaults to False, so the reference date is recomputed).
listings = preprocess(listings)

Reference Date Used:  2022-09-11 00:00:00
Preprocessing Completed


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model
from sklearn import preprocessing
import matplotlib.pyplot as plt
from progressbar import ProgressBar
import yaml

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import datasets, feature_extraction, decomposition
from nltk.stem.porter import PorterStemmer

import gensim
from gensim import corpora
import re
import stop_words
from stop_words import get_stop_words
import operator

import pyLDAvis



In [None]:
# Load the guest reviews table.
# NOTE(review): the recorded output of this cell is a ParserError, so `reviews` was
# never created in that run — the cause is not visible here; verify the file.
reviews = pd.read_csv('reviews.csv')

ParserError: ignored

In [None]:
# Progress-bar helper; no later visible cell uses it.
pbar = ProgressBar()

In [None]:
# Fetch the VADER lexicon needed by SentimentIntensityAnalyzer (network access required).
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
# Score each review comment with VADER.
# NOTE(review): this loop looks unfinished — `sentiment` is overwritten on every
# iteration and never stored, and `comments_length` is created as NaN but never
# filled. The recorded output is a NameError (the `reviews` load above failed).
reviews["comments_length"] = np.nan
sent_int = SentimentIntensityAnalyzer()

# Label-based lookup: assumes `reviews` has a default 0..n-1 index — TODO confirm.
for i in range(len(reviews["comments"])):
    s = reviews['comments'][i]
    sentiment = sent_int.polarity_scores(s)


NameError: ignored

In [None]:
# NOTE(review): the recorded log shows this install fails with "No matching distribution
# found" — presumably `pyLDAvis.gensim` is a module name rather than a PyPI package;
# the separate `pyLDAvis` install further down is the one that succeeds.
pip install pyLDAvis.gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement pyLDAvis.gensim (from versions: none)[0m
[31mERROR: No matching distribution found for pyLDAvis.gensim[0m


In [None]:
pip install stop_words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stop_words
  Downloading stop-words-2018.7.23.tar.gz (31 kB)
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-py3-none-any.whl size=32910 sha256=cd236587816addeb4ae8eae8a55be057fb0b8ee65e0142c8f49d83ff621614f7
  Stored in directory: /root/.cache/pip/wheels/eb/03/0d/3bd31c983789aeb0b4d5e2ca48590288d9db1586cf5f225062
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [None]:
pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 7.3 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: pyLDAvis, sklearn
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=4d8a29fafe79d022add0661ba7e685cd8d39b4ed9248f025e2603ac9882ceb8c
  Stored in directory: /root/.cache/pip/wheels/90/61/ec/9dbe9efc3acf9c4e37ba70fbbcc3f3a0ebd1210

In [None]:
def _amenity_tokens(raw):
    """Split one raw amenities cell into a list of amenity names.

    Handles three shapes of input:
    * a JSON list such as '["Essentials", "Hot water kettle"]' (parsed with json);
    * the brace format '{TV,"Wireless Internet"}' (split on commas, braces and
      double quotes removed — the original tokenisation of this cell);
    * a string already pipe-joined by a previous run of this cell.
    A missing value or an empty list yields [''] so every row has one token.
    """
    if not isinstance(raw, str):
        return [""]
    text = raw.strip()
    if text.startswith("["):
        try:
            parsed = json.loads(text)
        except ValueError:
            parsed = None
        if isinstance(parsed, list):
            return [str(item) for item in parsed] or [""]
    elif not text.startswith("{") and "|" in text:
        # Output of an earlier run of this cell: keep the existing tokens as they are.
        return text.split("|")
    return [tok.replace("}", "").replace("{", "").replace('"', "") for tok in raw.split(",")]


listings.loc[listings['amenities'] == '{}','amenities'] = ""

#Remove the symbols and one hot encode the amenities feature
listings['amenities'] = listings['amenities'].map(lambda raw: "|".join(_amenity_tokens(raw)))

# Per-row set of amenity names. Exact membership is used below: a substring test
# against the joined string would flag 'TV' for a row that only lists 'Cable TV'.
_amenity_sets = listings['amenities'].map(lambda joined: set(joined.split("|")))

amenities = np.unique(np.concatenate(_amenity_sets.map(list).values))

amenities_matrix = np.array([_amenity_sets.map(lambda present: amn in present).values for amn in amenities])
# Reuse the listings index so `amen` stays row-aligned with `listings`
# (rows were dropped earlier, so the listings index is not 0..n-1).
amen=pd.DataFrame(data=amenities_matrix.T, columns=amenities, index=listings.index)

In [None]:
#Find important features from the amenities for target price prediction
# concat(axis=1) joins on index labels. `amen` holds one row per listing in the same
# order as `listings`, so give it the listings index first; otherwise rows are paired
# by label and misalign whenever the listings index has gaps.
listings_amenities = pd.concat(
    [amen.set_index(listings.index), listings[['id','price']]],
    axis=1
)

In [None]:
# Preview the one-hot amenity matrix together with the id and price columns.
listings_amenities.head()

Unnamed: 0,Unnamed: 1,smooth pathway to front door,24-hour check-in,Accessible-height bed,Accessible-height toilet,Air conditioning,Air purifier,BBQ grill,Baby bath,Baby monitor,...,Wide clearance to shower and toilet,Wide doorway,Wide entryway,Wide hallway clearance,Window guards,Wireless Internet,translation missing: en.hosting_amenity_49,translation missing: en.hosting_amenity_50,id,log_price
0,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,6901257,5.010635
1,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,6304928,5.129899
2,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,7919400,4.976734
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,13418779,6.620073
4,True,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,3808709,4.744932


In [None]:
# Remove placeholder amenity columns that carry no information. errors='ignore'
# keeps the cell from raising when a column is absent from the data (or when the
# cell is re-run after the columns were already removed).
placeholder_amenities = [
    '',
    'translation missing: en.hosting_amenity_49',
    'translation missing: en.hosting_amenity_50',
]
listings_amenities=listings_amenities.drop(columns=placeholder_amenities, errors='ignore')
# Long format: one row per (listing, amenity) with the boolean flag in `value`.
listings_amenities_long = listings_amenities.melt(id_vars=['id', 'price'], var_name='amenity')

In [None]:
# Number of listings offering each amenity (column-wise sum of the boolean flags).
amenity_counts = listings_amenities.drop(['id','price'], axis = 1).sum(axis = 0)

We observe that the two columns above are most probably the same.

In [None]:
# Pairwise correlation between three amenity flags.
# NOTE(review): the recorded output below shows different columns (parking / EV charger),
# so it was not produced by this version of the cell.
listings_amenities[['Oven','Hot water','Breakfast']].corr()

Unnamed: 0,EV charger,Free parking on premises,Free parking on street,Paid parking off premises
EV charger,1.0,0.007153,-0.000832,-0.000251
Free parking on premises,0.007153,1.0,-0.006835,-0.006668
Free parking on street,-0.000832,-0.006835,1.0,-0.000314
Paid parking off premises,-0.000251,-0.006668,-0.000314,1.0


In [None]:
# The Dictionary for Recoding Amenities

## The original Amenities grouped by their nature.
## Keys are raw amenity names, values are the coarser group each one is folded into.
## Amenities that are commented out are intentionally left as their own category.
amenitiy_recodings = {
    'Flat' : 'Accessible Room',
    # Leading space is intentional: it is what is left of "Flat, smooth pathway ..."
    # when the raw amenities string is split on commas.
    ' smooth pathway to front door' : 'Accessible Room',
    'Accessible-height bed' : 'Accessible Room',
    'Accessible-height toilet'  : 'Accessible Room',
    'Flat smooth pathway to front door'  : 'Accessible Room',
    'Fixed grab bars for shower & toilet'  : 'Accessible Room',
    'Grab-rails for shower and toilet'  : 'Accessible Room',
    'Step-free access'  : 'Accessible Room',
    'Wheelchair accessible'  : 'Accessible Room',
    'Wide clearance to bed'  : 'Accessible Room',
    'Wide clearance to shower & toilet'  : 'Accessible Room',
    'Wide clearance to shower and toilet'  : 'Accessible Room',
    'Wide doorway'  : 'Accessible Room',
    'Wide entryway'  : 'Accessible Room',
    'Wide hallway clearance'  : 'Accessible Room',
    'Roll-in shower with chair'  : 'Accessible Room',
    'Bathtub with shower chair'  : 'Accessible Room',
    'Disabled parking spot'  : 'Accessible Room',
    'Ground floor access'  : 'Accessible Room',
    'Handheld shower head'  : 'Accessible Room',

    'Cat(s)' : 'Pet Friendly',
    'Other pet(s)' : 'Pet Friendly',
    'Pets allowed' : 'Pet Friendly',
    'Dog(s)' : 'Pet Friendly',
    'Pets live on this property' : 'Pet Friendly',

    'Table corner guards' : 'Family/kid friendly',
    'Fireplace guards' : 'Family/kid friendly',
    # The next two keys contain a mis-decoded apostrophe and only match data that was
    # decoded the same way; the \u2019 aliases match correctly decoded text.
    'Childrenâ€™s books and toys' : 'Family/kid friendly',
    'Childrenâ€™s dinnerware' : 'Family/kid friendly',
    'Children\u2019s books and toys' : 'Family/kid friendly',
    'Children\u2019s dinnerware' : 'Family/kid friendly',
    'Outlet covers' : 'Family/kid friendly',
    'Babysitter recommendations' : 'Family/kid friendly',
    'Window guards' : 'Family/kid friendly',

    'Baby bath': 'Baby Friendly',
    'Baby monitor': 'Baby Friendly',
    # Same mis-decoded apostrophe as above, plus its correctly decoded alias.
    'Pack â€™n Play/travel crib': 'Baby Friendly',
    'Pack \u2019n Play/travel crib': 'Baby Friendly',
    'Crib': 'Baby Friendly',
    'Changing table': 'Baby Friendly',
    'Stair gates': 'Baby Friendly',
    'High chair': 'Baby Friendly',


    'Doorman': 'Security',
    'Doorman Entry': 'Security',
    'Buzzer/wireless intercom': 'Security',
    'Smart lock': 'Security',
    'Smartlock': 'Security',
    'Keypad': 'Security',

    'Internet' : 'Internet',
    'Pocket wifi' : 'Internet',
    'Wireless Internet' : 'Internet',
    'Ethernet connection' : 'Internet',

    'Self Check-In' : '24-hour check-in',
    '24-hour check-in': '24-hour check-in',
    'Luggage dropoff allowed': '24-hour check-in',

    'Free parking on premises' : 'Free parking',
    'Free parking on street' : 'Free parking',

    'Paid parking off premises' : 'Paid parking',

    #'EV charger',

    #'Elevator',
    'Elevator in building' : 'Elevator',

    'Lake access' : 'Near Water',
    'Beachfront': 'Near Water',
    'Beach essentials': 'Near Water',
    'Waterfront': 'Near Water',

    'Cooking basics' : 'Cooking Allowed',
    'Dishes and silverware' : 'Cooking Allowed',
    'Dishwasher' : 'Cooking Allowed',
    'Microwave' : 'Cooking Allowed',
    'Oven' : 'Cooking Allowed',
    'BBQ grill' : 'Cooking Allowed',
    'Stove' : 'Cooking Allowed',
    'Kitchen' : 'Cooking Allowed',
    #'Refrigerator',

    #'Coffee maker',

    'Cable TV' : 'TV',
    #'TV',

    #'Game console',

    'Private bathroom' : "Privacy Centric",
    'Private entrance' : "Privacy Centric",
    'Private living room' : "Privacy Centric",
    'Lock on bedroom door' : "Privacy Centric",

    'Dryer' : "Laundry Facility",
    'Washer' : "Laundry Facility",
    'Washer / Dryer' : "Laundry Facility",

    #'Iron',
    #'Hangers',

    #'Breakfast',

    'First aid kit' : 'Fire and Safety Features',
    'Carbon monoxide detector' : 'Fire and Safety Features',
    'Smoke detector' : 'Fire and Safety Features',
    'Fire extinguisher' : 'Fire and Safety Features',
    'Safety card' : 'Fire and Safety Features',

    'Path to entrance lit at night' : 'Outdoor Lighting',
    'Well-lit path to entrance' : 'Outdoor Lighting',

    'Air conditioning' : 'Climate Control',
    'Heating' : 'Climate Control',
    'Indoor fireplace' : 'Climate Control',

    #'Gym',

    'Body soap' : 'Essentials',
    'Bath towel' : 'Essentials',
    'Bed linens' : 'Essentials',
    'Hand or paper towel' : 'Essentials',
    'Hand soap' : 'Essentials',
    'Shampoo' : 'Essentials',
    'Toilet paper' : 'Essentials',
    'Essentials' : 'Essentials',

    #'Cleaning before checkout',

    #'Patio or balcony',
    #'Garden or backyard',

    'Bathtub' :'Bathtub',
    'Hot tub' : 'Bathtub',


    'Hot water' : 'Hot water',
    'Hot water kettle' : 'Hot water',

    #'Host greets you',

    'Extra pillows and blankets' : 'Comfortable Sleep',
    # Both spellings are listed so the misspelled variant is recoded as well.
    'Firm matress' : 'Comfortable Sleep',
    'Firm mattress' : 'Comfortable Sleep',
    'Room-darkening shades' : 'Comfortable Sleep'
    }


In [None]:
# Map every raw amenity name to its group; names without an entry in the
# recoding dictionary are left as they are.
recoded_amenity = listings_amenities_long['amenity'].replace(amenitiy_recodings)
listings_amenities_long = listings_amenities_long.assign(amenity_modified = recoded_amenity)

listings_amenities_long.head()

Unnamed: 0,id,log_price,amenity,value,amenity_modified
0,6901257,5.010635,smooth pathway to front door,False,Accessible Room
1,6304928,5.129899,smooth pathway to front door,False,Accessible Room
2,7919400,4.976734,smooth pathway to front door,False,Accessible Room
3,13418779,6.620073,smooth pathway to front door,False,Accessible Room
4,3808709,4.744932,smooth pathway to front door,False,Accessible Room


In [None]:
# Back to wide format: one row per listing id, one column per amenity group.
# 'max' over the flags means a group is set when any of its member amenities is set.
pivot_options = {
    'index': 'id',
    'columns': 'amenity_modified',
    'values': 'value',
    'aggfunc': 'max',
}
listings_amenities_simplified = listings_amenities_long.pivot_table(**pivot_options)

In [None]:
# Attach each listing's price to its grouped amenity flags (inner join on id).
listing_prices = listings[['id', 'price']]
listings_amenities_simplified = pd.merge(
    listings_amenities_simplified,
    listing_prices,
    how = "inner",
    on = "id"
)

In [None]:
# Inspect the grouped amenity columns produced by the pivot and merge.
listings_amenities_simplified.columns

Index(['id', '24-hour check-in', 'Accessible Room', 'Air purifier',
       'Baby Friendly', 'Bathtub', 'Breakfast', 'Cleaning before checkout',
       'Climate Control', 'Coffee maker', 'Comfortable Sleep',
       'Cooking Allowed', 'EV charger', 'Elevator', 'Essentials',
       'Family/kid friendly', 'Fire and Safety Features', 'Free parking',
       'Game console', 'Garden or backyard', 'Gym', 'Hair dryer', 'Hangers',
       'Host greets you', 'Hot water', 'Internet', 'Iron',
       'Laptop friendly workspace', 'Laundry Facility', 'Lockbox',
       'Long term stays allowed', 'Near Water', 'Other', 'Outdoor Lighting',
       'Paid parking', 'Patio or balcony', 'Pet Friendly',
       'Pets live on this property', 'Pool', 'Privacy Centric', 'Refrigerator',
       'Security', 'Single level home', 'Ski in/Ski out', 'Smoking allowed',
       'Suitable for events', 'TV', 'log_price'],
      dtype='object')

#### Checking Feature Importance for Amenity columns

In [None]:
# Rebind the name: from here on `listings_amenities` refers to the grouped table, not the raw one-hot matrix.
listings_amenities = listings_amenities_simplified

In [None]:
# Seed NumPy's global RNG so the random row assignment below is repeatable.
np.random.seed(2018)
# Each row is independently sent to train with probability 0.8, so the split is
# roughly (not exactly) 80/20.
train = np.random.choice([True, False], listings_amenities.shape[0], replace=True, p=[0.8, 0.2])

# Materialise both partitions as plain NumPy arrays.
listings_amenities_train = listings_amenities.iloc[train,:].values
listings_amenities_test = listings_amenities.iloc[~train,:].values

# Column layout: first column is skipped, last column is the response,
# everything in between is a feature.
feature_columns = slice(1, -1)
response_column = -1
listings_amenities_train_x, listings_amenities_train_y = (
    listings_amenities_train[:, feature_columns],
    listings_amenities_train[:, response_column],
)
listings_amenities_test_x, listings_amenities_test_y = (
    listings_amenities_test[:, feature_columns],
    listings_amenities_test[:, response_column],
)
# All column labels of the table, including the first and last columns excluded above.
feat_labels=listings_amenities.columns.tolist()

In [None]:
# Join the selected amenity features back onto the listings by id.
# NOTE(review): `listings_selected_amenities` is not defined in any visible cell —
# confirm where it is created before relying on Run All.
listings = listings.merge(listings_selected_amenities.drop('price', axis = 1), how = "inner", on = "id")

In [None]:
# Drop the raw date columns in place; re-running this cell raises KeyError because they are already gone.
listings.drop(['first_review','host_since','last_review'], axis = 1, inplace = True)

In [None]:
# Snapshot of column name -> dtype for the preprocessed frame.
listings_dtypes = dict(listings.dtypes)

In [None]:
# Persist the preprocessed frame (without the index) to the working directory.
listings.to_csv("preprocessed_listings.csv", index=False)

In [None]:
# Two side-by-side views of the review rating distribution.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (10,4))

# Left panel: histogram with 5 bins.
listings['review_scores_rating'].hist(ax = ax1, bins = 5)
ax1.set_title('Distribution of Rating Score')
ax1.set_xlabel('Rating Score')
ax1.set_ylabel('count')

# Right panel: box plot of the same rating column.
plot = listings['review_scores_rating'].plot(kind = 'box', ax = ax2)
ax2.set_title('Distribution of Rating Score')

plt.show()

In [None]:
# Classification frame: keep only rows that have a rating to predict.
listings_classification = listings.dropna(axis=0, subset = ['review_scores_rating'])
# Show (rows, columns) of the resulting frame.
listings_classification.shape

In [None]:
# Bin the rating into (up to) three equal-frequency classes.
rating_bins = pd.qcut(
    listings_classification['review_scores_rating'],
    q=3,
    duplicates='drop',
)
# Relabel the interval bins as 0..k-1 after binning. Passing labels=[0,1,2] directly
# raises when duplicates='drop' merges tied quantile edges and fewer than three
# bins remain; with three bins the result is the same ordered categories 0, 1, 2.
rating_bins = rating_bins.cat.rename_categories(
    list(range(len(rating_bins.cat.categories)))
)
listings_classification = listings_classification.assign(rating_bin_ep = rating_bins)

# Visualize the Split
# Pass the column by keyword: recent seaborn releases no longer treat the first
# positional argument as `x`.
sns.countplot(x=listings_classification['rating_bin_ep'])

In [None]:
# Class sizes of the binned rating target.
listings_classification['rating_bin_ep'].value_counts()

In [None]:
# One density panel per rating bin so the host_response_rate distributions can be
# compared side by side. The three copies of the plotting code are folded into a
# loop, and the legend is only requested after a labelled artist exists (the
# earlier leading plt.legend() ran on an empty axes).
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (rating_bin, color) in zip(axes, [(0, "r"), (1, "b"), (2, "g")]):
    in_bin = listings_classification['rating_bin_ep'] == rating_bin
    sns.kdeplot(
        listings_classification.loc[in_bin, 'host_response_rate'],
        fill=True,  # `fill` is the current name of the deprecated `shade` option
        color=color,
        label='host_response_rate, bin={}'.format(rating_bin),
        ax=ax,
    )
    ax.legend()
plt.show()

In [None]:
# Correlate only numeric and boolean columns: DataFrame.corr() on a frame that still
# holds text columns raises in pandas >= 2.0, where non-numeric columns are no
# longer dropped silently.
numeric_listings = listings.select_dtypes(include=['number', 'bool'])
corrmat = numeric_listings.corr()
sns.heatmap(corrmat);

In [None]:
# Repeatable random split: each row goes to train with probability 0.8.
np.random.seed(20)
n_listings = listings_classification.shape[0]
train = np.random.choice([True, False], n_listings, replace=True, p=[0.8, 0.2])

# Rows not drawn for training form the test partition.
held_out = ~train
listings_classification_train = listings_classification.iloc[train,:]
listings_classification_test = listings_classification.iloc[held_out,:]

In [None]:
# Feature columns fed to the classifiers.
# NOTE(review): no visible cell creates the dummy columns 'room_type_*' /
# 'property_type_other_prop_type', and 'property_type' itself is text — confirm
# an encoding step runs before this cell.
train_cols = [

    'accommodates', 'bathrooms',
     'host_has_profile_pic',
    'host_identity_verified',  'instant_bookable',
    'bedrooms', 'beds',
    'days_since_hosted', 'price',
    'bedroom_share', 'bathroom_share', 'room_type_Entire home/apt',
    'room_type_Private room', 'property_type','property_type_other_prop_type',
]


target_col = 'rating_bin_ep'

# Take explicit copies of the feature frames: a later cell assigns scaled values
# into them with .loc, which on a selection of another frame triggers
# SettingWithCopyWarning.
x_train_logr = listings_classification_train[train_cols].copy()
x_test_logr = listings_classification_test[train_cols].copy()
y_train_logr = listings_classification_train[target_col]
y_test_logr = listings_classification_test[target_col]

print(x_train_logr.shape, x_test_logr.shape, y_train_logr.shape, y_test_logr.shape)

In [None]:
# Convert price text such as "$1,250.00" to float.
# * regex=False makes '$' a literal character rather than a regex end-of-string
#   anchor, independent of the pandas version's default.
# * astype(str) first makes the cell safe to re-run after the column is already numeric.
# NOTE(review): this runs after the train/test frames were sliced from
# listings_classification, so those slices still hold the unconverted price.
price_as_text = listings_classification['price'].astype(str)
listings_classification['price'] = (
    price_as_text
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [None]:
# Rescale the continuous features to [0, 1]. The scaler is fitted on the training
# rows only and then applied unchanged to the test rows.
# MinMaxScaler comes from sklearn.preprocessing (imported in the first cell).
scale_cols = ['accommodates', 'days_since_hosted', 'bedroom_share', 'bathroom_share','price']
scaler = MinMaxScaler()
x_train_logr.loc[:, scale_cols] = scaler.fit_transform(x_train_logr.loc[:, scale_cols])
x_test_logr.loc[:, scale_cols] = scaler.transform(x_test_logr.loc[:, scale_cols])

In [None]:
# Recode every cell that is exactly 't' / 'f' to 1 / 0, across all columns.
listings_classification = listings_classification.replace('t', 1).replace('f', 0)

In [None]:
#Logistic regression
# Sweep the inverse regularisation strength and report train / test accuracy for each value.
C = [0.01, 0.1, 1, 10]
for reg_strength in C:
    logit = LogisticRegression(
        C=reg_strength,
        random_state=0,
        solver='newton-cg',
        multi_class='multinomial',
    )
    logit.fit(x_train_logr, y_train_logr)
    print("\n")
    print("C = ",reg_strength)
    for caption, features, target in (
        ('Train accuracy score:', x_train_logr, y_train_logr),
        ('Test accuracy score:', x_test_logr, y_test_logr),
    ):
        print(caption,round(logit.score(features, target),4))

In [None]:
#c=1
# Fit the final multinomial logistic regression with C=1 on the training features.
logreg = LogisticRegression(C=1, random_state=0, solver='newton-cg',multi_class='multinomial')
logreg.fit(x_train_logr, y_train_logr)

In [None]:
# Predict the rating bin for the held-out rows.
logit_y_predict = logreg.predict(x_test_logr)
# Flatten the target to a 1-D NumPy array. np.asarray(...).ravel() is used instead of
# Series.ravel(), which is deprecated in recent pandas, and it also works if this
# cell is re-run after y_test_logr has already become an array.
y_test_logr = np.asarray(y_test_logr).ravel()
print("Logistic Regression - ROC Curve")
# NOTE(review): plot_roc is not defined in any visible cell — confirm where it comes from.
plot_roc(y_test_logr, logit_y_predict,'LR')

In [None]:
#Random forest 
# 1000 trees, depth capped at 30, at least 10 samples to split a node, and class
# weights balanced. No random_state is set, so results vary between runs.
rf = RandomForestClassifier(
    n_estimators=1000, 
    max_depth=30, 
    min_samples_split=10, 
    class_weight="balanced"
    )

# NOTE(review): x_train / y_train are not defined in any visible cell (the logistic
# regression cells use x_train_logr / y_train_logr) — confirm the intended inputs.
rf.fit(x_train, y_train)

In [None]:
# Predict with the fitted random forest.
# NOTE(review): x_test / y_test and plot_roc are not defined in any visible cell —
# confirm where they come from.
rf_y_predict = rf.predict(x_test)
# Flatten to a 1-D NumPy array; np.asarray(...).ravel() works whether y_test is a
# pandas Series (whose own .ravel() is deprecated in recent pandas) or already an array.
y_test = np.asarray(y_test).ravel()
print("Random Forest - ROC Curve")
plot_roc(y_test, rf_y_predict,'Random Forest')