In [1]:
import pandas as pd
import numpy as np
import math
import spacy
import ast
#!pip install -qq -U gensim
from gensim.matutils import cossim

In [2]:
# load datasets
vacancyHk = pd.read_csv('./data/vacancyHk.csv')
listingHk = pd.read_csv('./data/listingHk.csv')
topics = pd.read_csv('./data/listingTopics.csv')

# convert to datetime
vacancyHk.date = pd.to_datetime(vacancyHk.date)
print('date range:',vacancyHk.date.min(),', ', vacancyHk.date.max())

# reset the index for searching
topics.set_index('id', inplace=True)

date range: 2022-06-14 00:00:00 ,  2023-06-13 00:00:00


### Primary Listings Search 
#### Check-in/out dates, number of Guests

In [3]:
# function that returns avail listings for dates selected

def searchListing(checkIn = '2022-06-14', checkOut = '2023-06-13', numGuests = 2):
    # convert dates to datetime and get length of stay
    checkIn, checkOut = pd.to_datetime(checkIn), pd.to_datetime(checkOut)
    stayLen = int((checkOut - checkIn) / np.timedelta64(1, 'D'))
    
    # return listings that match chosen period of stay
    dfStay = vacancyHk.query('minimum_nights >= @stayLen <= maximum_nights').query('date >= @checkIn and date < @checkOut') 
    dfList = dfStay.groupby('listing_id').date.count()
    availListing = [dfList.index[i] for i in range(len(dfList)) if dfList.values[i] == stayLen]
    
    # return listings that match number of guests, period of stay
    return [listing for listing in availListing 
            if listingHk[listingHk.id == listing].accommodates.values[0] >= numGuests]

In [4]:
checkIn, checkOut, numGuests = '2022-06-14', '2022-06-17', 7

In [5]:
%%timeit -r 1 -n 1   # r: repeat, n: number
# check avail listings
print(searchListing(checkIn, checkOut, numGuests))

[263081, 634726, 767022, 1060831, 1944230, 3351826, 3596518, 4179484, 5535344, 6098189, 8207213, 13319328, 14597370, 14902788, 16397688, 16569102, 17684789, 17976109, 21445033, 21454604, 21961333, 22230848, 22297626, 22326498, 22557402, 24289471, 26091331, 26339244, 28526041, 28697889, 28703025, 29837246, 29846996, 30433974, 30484205, 31363590, 32367951, 32852017, 33666696, 35013083, 35696417, 35754300, 35831647, 36865774, 37239063, 37758911, 37945457, 38886826, 44204917, 45195060, 45942492, 46382786, 48760036, 50462322, 52772049, 571999270476425993, 579321019303570754, 583210465090101127, 585684633125306175, 596542152043732856, 608155320545616454, 636103359466288775, 640315573036293518]
1.59 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Secondary Filtering

#### Rank search results by listings with similar price range

In [6]:
def filterPrice(listingId):
    # create df of primary search listings, organized by price bins
    df = pd.qcut(pd.to_numeric(
        listingHk[listingHk.id.isin(searchListing(checkIn, checkOut, numGuests))].price.str.replace(',','', regex=False).str.replace('$','', regex=False)),
                 [0, 0.25, 0.5, 0.75, 1]).to_frame()
    df['id'] = listingHk[listingHk.id.isin(searchListing(checkIn, checkOut, numGuests))].id.to_list()

    # return list of ten listings matching price bin of selected listing
    return df[df.price == df[df.id == listingId].price.values[0]].id.to_list()[1:]

In [7]:
%%timeit -r 1 -n 1 
# running the function
listingId = 263081
print(filterPrice(listingId))

[767022, 1060831, 21454604, 22297626, 21961333, 26091331, 28526041, 32852017, 35754300, 35831647, 36865774, 37945457, 38886826, 571999270476425993, 579321019303570754]
3.01 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


#### Rank search results by listings with similar descriptors

In [8]:
# function to filter for listings with similar descriptors

def filterDesc(listingId):
    dictScore = dict()
    
    # calculate similarity scores of selected listing and all listings
    for listing in searchListing(checkIn, checkOut, numGuests):#dictListing.keys():
        dictScore[listing] = cossim(dict([tuple(el.value for el in i.elts) for i in ast.parse(topics.topics[listingId]).body[0].value.elts]), 
                                    dict([tuple(el.value for el in i.elts) for i in ast.parse(topics.topics[listing]).body[0].value.elts]))

    # return up to top ten listing matches with similar amenities
    return [key for key in dict(sorted(dictScore.items(), key=lambda item: item[1], reverse = True)).keys()][1:11]

In [9]:
%%timeit -r 1 -n 1 
# running the function
listingId = 263081
print(filterDesc(listingId))

[579321019303570754, 3596518, 8207213, 1944230, 22297626, 571999270476425993, 583210465090101127, 3351826, 13319328, 26091331]
1.54 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


#### Rank search results by distance

In [10]:
# function to rank listings by distance

def filterDistance(listingId):
    df = pd.DataFrame(listingHk[listingHk.id.isin(searchListing(checkIn, checkOut, numGuests))].id.to_list(), columns=['id'])
    df['distance'] = [
    math.dist(
        (listingHk[listingHk.id == listingId].longitude.to_list()[0], 
         listingHk[listingHk.id == listingId].latitude.to_list()[0]), 
        (listingHk[listingHk.id.isin(searchListing(checkIn, checkOut, numGuests))].longitude.to_list()[i], 
         listingHk[listingHk.id.isin(searchListing(checkIn, checkOut, numGuests))].latitude.to_list()[i])
        ) for i in range(len(df))]
    df.sort_values(by='distance', inplace=True)
    return df.id.to_list()[1:]  # return nearest listings other than self

In [18]:
listingId = 30484205
print(filterDistance(listingId))

[14902788, 4179484, 16397688, 634726, 28703025, 30433974, 28526041, 636103359466288775, 22326498, 45195060, 21445033, 35754300, 585684633125306175, 22230848, 28697889, 48760036, 29846996, 21454604, 29837246, 37945457, 263081, 33666696, 26339244, 32367951, 44204917, 46382786, 35696417, 26091331, 6098189, 17684789, 22557402, 31363590, 14597370, 596542152043732856, 21961333, 32852017, 45942492, 35831647, 36865774, 37758911, 37239063, 640315573036293518, 608155320545616454, 24289471, 50462322, 16569102, 35013083, 17976109, 767022, 5535344, 3351826, 3596518, 579321019303570754, 8207213, 1944230, 22297626, 1060831, 13319328, 571999270476425993, 38886826, 52772049, 583210465090101127]


#### Rank search results by listings with similar amenities

In [13]:
nlp = spacy.load('en_core_web_lg')  # large pipeline pkg contain word vectors

In [14]:
from spacy.tokens import DocBin

# Deserialize spacy file containing all amenities word embeddings
doc_bin = DocBin().from_disk('./data/amenities.spacy')
docs = list(doc_bin.get_docs(nlp.vocab))

# Loading spacy text into dict values with listing_ids as key
dictListing = dict()
for idx in range(len(listingHk)):
    dictListing[listingHk.id[idx]] = docs[idx]

In [15]:
# function to filter for listings with similar amenties

def filterAmenities(listingId):
    dictScore = dict()
    
    # calculate similarity scores of selected listing and all listings
    for listing in searchListing(checkIn, checkOut, numGuests):#dictListing.keys():
        dictScore[listing] = dictListing[listingId].similarity(dictListing[listing])

    # return up to top ten listing matches with similar amenities
    return [key for key in dict(sorted(dictScore.items(), key=lambda item: item[1], reverse = True)).keys()][1:11]

In [16]:
%%timeit -r 1 -n 1

# running the function
listingId = 263081
print(filterAmenities(listingId))

[767022, 636103359466288775, 37239063, 4179484, 585684633125306175, 634726, 38886826, 35831647, 16569102, 17976109]
1.53 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [17]:
# Code to process all amentities text then serialize and save to file
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False)

# processing all amenities text through spacy for purpose of word embeddings
for listing in range(len(listingHk)):
    doc_bin.add(nlp(listingHk.amenities.values[listing].replace('["','').replace('"]','').replace(',','').replace('"','').lower()))

# saving processed text to disk
#doc_bin.to_disk('./data/amenities.spacy')