In [18]:
import warnings

import numpy as np
from sklearn.neighbors import LSHForest, NearestNeighbors
from spark_sklearn import GridSearchCV

from pyspark.ml.feature import (IDF, HashingTF, RegexTokenizer,
                                StopWordsRemover, Tokenizer)
from pyspark.sql.functions import col, when
from pyspark.sql.types import *

In [19]:
warnings.filterwarnings('ignore')

In [20]:
seed = 42

In [21]:
listing = spark.read.csv('listings.csv', header=True).na.fill({'name': ''})
listing.select(col('name')).show(10, False)

+--------------------------------------------------+
|name                                              |
+--------------------------------------------------+
|Clean room Amsterdam. Metro 3min walk.Free parking|
|Sunny and cozy room close to metro                |
|Pop B&B-private room,free parking,sauna,terraces  |
|Tastefully furnished studio with breakfast&parking|
|Cozy room in the SE of Amsterdam                  |
|Great room south of Amsterdam!                    |
|Lovely room in South East Amsterdam               |
|spacious light appartment amsterdam zuid-oost     |
|Amsterdam South Studio                            |
|B&B# green oasis with free parking                |
+--------------------------------------------------+
only showing top 10 rows



In [22]:
listing = Tokenizer(inputCol="name", outputCol='name_tokenized').transform(listing)
listing.select("name", "name_tokenized").show(10, False)

+--------------------------------------------------+----------------------------------------------------------+
|name                                              |name_tokenized                                            |
+--------------------------------------------------+----------------------------------------------------------+
|Clean room Amsterdam. Metro 3min walk.Free parking|[clean, room, amsterdam., metro, 3min, walk.free, parking]|
|Sunny and cozy room close to metro                |[sunny, and, cozy, room, close, to, metro]                |
|Pop B&B-private room,free parking,sauna,terraces  |[pop, b&b-private, room,free, parking,sauna,terraces]     |
|Tastefully furnished studio with breakfast&parking|[tastefully, furnished, studio, with, breakfast&parking]  |
|Cozy room in the SE of Amsterdam                  |[cozy, room, in, the, se, of, amsterdam]                  |
|Great room south of Amsterdam!                    |[great, room, south, of, amsterdam!]                

In [23]:
listing = RegexTokenizer(inputCol="name", outputCol="name_regextokenized", toLowercase=False, pattern="\\W").transform(listing)
listing.select("name", "name_regextokenized").show(10, False)

+--------------------------------------------------+----------------------------------------------------------+
|name                                              |name_regextokenized                                       |
+--------------------------------------------------+----------------------------------------------------------+
|Clean room Amsterdam. Metro 3min walk.Free parking|[Clean, room, Amsterdam, Metro, 3min, walk, Free, parking]|
|Sunny and cozy room close to metro                |[Sunny, and, cozy, room, close, to, metro]                |
|Pop B&B-private room,free parking,sauna,terraces  |[Pop, B, B, private, room, free, parking, sauna, terraces]|
|Tastefully furnished studio with breakfast&parking|[Tastefully, furnished, studio, with, breakfast, parking] |
|Cozy room in the SE of Amsterdam                  |[Cozy, room, in, the, SE, of, Amsterdam]                  |
|Great room south of Amsterdam!                    |[Great, room, south, of, Amsterdam]                 

In [24]:
listing = StopWordsRemover(inputCol='name_regextokenized', outputCol='name_regextokenized_filtered').transform(listing)
listing.select("name", "name_regextokenized_filtered").show(10, False)

+--------------------------------------------------+----------------------------------------------------------+
|name                                              |name_regextokenized_filtered                              |
+--------------------------------------------------+----------------------------------------------------------+
|Clean room Amsterdam. Metro 3min walk.Free parking|[Clean, room, Amsterdam, Metro, 3min, walk, Free, parking]|
|Sunny and cozy room close to metro                |[Sunny, cozy, room, close, metro]                         |
|Pop B&B-private room,free parking,sauna,terraces  |[Pop, B, B, private, room, free, parking, sauna, terraces]|
|Tastefully furnished studio with breakfast&parking|[Tastefully, furnished, studio, breakfast, parking]       |
|Cozy room in the SE of Amsterdam                  |[Cozy, room, SE, Amsterdam]                               |
|Great room south of Amsterdam!                    |[Great, room, south, Amsterdam]                     

In [25]:
stopwords = ['Amsterdam', 'parking', 'room']
listing = StopWordsRemover(inputCol='name_regextokenized_filtered', outputCol='name_regextokenized_filtered_twice', stopWords=stopwords).transform(listing)
listing.select("name", "name_regextokenized_filtered_twice").show(10, False)

+--------------------------------------------------+-------------------------------------------+
|name                                              |name_regextokenized_filtered_twice         |
+--------------------------------------------------+-------------------------------------------+
|Clean room Amsterdam. Metro 3min walk.Free parking|[Clean, Metro, 3min, walk, Free]           |
|Sunny and cozy room close to metro                |[Sunny, cozy, close, metro]                |
|Pop B&B-private room,free parking,sauna,terraces  |[Pop, B, B, private, free, sauna, terraces]|
|Tastefully furnished studio with breakfast&parking|[Tastefully, furnished, studio, breakfast] |
|Cozy room in the SE of Amsterdam                  |[Cozy, SE]                                 |
|Great room south of Amsterdam!                    |[Great, south]                             |
|Lovely room in South East Amsterdam               |[Lovely, South, East]                      |
|spacious light appartment ams

In [26]:
listing = HashingTF(inputCol='name_regextokenized_filtered_twice', outputCol='tf', numFeatures=20).transform(listing)
listing.select("tf").show(10, False)

+-----------------------------------------------+
|tf                                             |
+-----------------------------------------------+
|(20,[2,6,8,11,18],[1.0,1.0,1.0,1.0,1.0])       |
|(20,[5,10,11,19],[1.0,1.0,1.0,1.0])            |
|(20,[4,6,9,10,12,13],[1.0,1.0,2.0,1.0,1.0,1.0])|
|(20,[4,6,8,19],[1.0,1.0,1.0,1.0])              |
|(20,[4,7],[1.0,1.0])                           |
|(20,[15,19],[1.0,1.0])                         |
|(20,[1,9,10],[1.0,1.0,1.0])                    |
|(20,[0,3,9,13,15],[1.0,1.0,1.0,1.0,1.0])       |
|(20,[0,10],[1.0,1.0])                          |
|(20,[5,9,13,15],[1.0,2.0,1.0,1.0])             |
+-----------------------------------------------+
only showing top 10 rows



In [27]:
listing = IDF(inputCol="tf", outputCol="idf").fit(listing).transform(listing)
for row in listing.head(10):
    print(row)

Row(id='20621335', name='Clean room Amsterdam. Metro 3min walk.Free parking', host_id='25403329', host_name='Victor', neighbourhood_group=None, neighbourhood='Bijlmer-Oost', latitude='52.319172968245226', longitude='4.981150531499213', room_type='Private room', price='52', minimum_nights='3', number_of_reviews='23', last_review='2017-11-28', reviews_per_month='6.83', calculated_host_listings_count='1', availability_365='12', name_tokenized=['clean', 'room', 'amsterdam.', 'metro', '3min', 'walk.free', 'parking'], name_regextokenized=['Clean', 'room', 'Amsterdam', 'Metro', '3min', 'walk', 'Free', 'parking'], name_regextokenized_filtered=['Clean', 'room', 'Amsterdam', 'Metro', '3min', 'walk', 'Free', 'parking'], name_regextokenized_filtered_twice=['Clean', 'Metro', '3min', 'walk', 'Free'], tf=SparseVector(20, {2: 1.0, 6: 1.0, 8: 1.0, 11: 1.0, 18: 1.0}), idf=SparseVector(20, {2: 1.9785, 6: 2.6478, 8: 1.6918, 11: 2.1647, 18: 1.6747}))
Row(id='16682109', name='Sunny and cozy room close to me

In [11]:
sample = listing.limit(500)
# convert sparse vector to list
sample = sample.select("idf").rdd.flatMap(lambda x: x).collect()
for row in sample[:10]:
    print(row)

(20,[2,6,8,11,18],[1.978498810168757,2.647771156299783,1.6917914916688026,2.164725096640868,1.6746772233364775])
(20,[5,10,11,19],[2.077127037322882,1.8965992029829544,2.164725096640868,2.062185457324683])
(20,[4,6,9,10,12,13],[1.4578231823836376,2.647771156299783,2.317032835649918,1.8965992029829544,0.8159814823422075,1.41056673546132])
(20,[4,6,8,19],[1.4578231823836376,2.647771156299783,1.6917914916688026,2.062185457324683])
(20,[4,7],[1.4578231823836376,1.711289893742365])
(20,[15,19],[1.4232349791103582,2.062185457324683])
(20,[1,9,10],[1.3631235996147801,1.158516417824959,1.8965992029829544])
(20,[0,3,9,13,15],[1.470170456313043,1.7598861626295568,1.158516417824959,1.41056673546132,1.4232349791103582])
(20,[0,10],[1.470170456313043,1.8965992029829544])
(20,[5,9,13,15],[2.077127037322882,2.317032835649918,1.41056673546132,1.4232349791103582])


In [12]:
distances_lshf, indices_lshf = LSHForest(random_state=seed,
                                         n_estimators=100,
                                         n_candidates=100).fit(sample).kneighbors(sample, n_neighbors=2)

In [13]:
distances_ground_truth, indices_ground_truth = NearestNeighbors(n_neighbors=2).fit(sample).kneighbors(sample)

In [14]:
def score(model, data):
    n_neighbors = model.get_params()['n_neighbors']
    
    _, indicies_model = model.fit(data).kneighbors(data)
    _, indices_ground_truth = NearestNeighbors(n_neighbors=n_neighbors).fit(data).kneighbors(data)
    return np.sum(np.equal(indicies_model, indices_ground_truth)) / (len(sample) * n_neighbors)

In [15]:
param_grid = {
    'random_state': [seed],
    'n_estimators': [10, 25, 50, 100, 150],
    'n_candidates': [10, 25, 50, 100, 150],
    'min_hash_match': [3, 4, 5],
    'n_neighbors': [2, 3, 4]
}

In [16]:
%%time
grid_search_result = GridSearchCV(sc, 
                                  LSHForest(), 
                                  cv=5, 
                                  param_grid=param_grid, 
                                  scoring=score, 
                                  verbose=1).fit(sample)

Fitting 5 folds for each of 225 candidates, totalling 1125 fits
CPU times: user 616 ms, sys: 240 ms, total: 856 ms
Wall time: 38min 53s


My PC characteristics:
Intel(R) Core(TM) i7-6820HQ CPU @ 2.70GHz 
4 logical cores + 4 "virtual" cores / 2 (running this on the virtual machine) = 4 cores overall

In [17]:
print('Best params: {}'.format(grid_search_result.best_estimator_))

Best params: LSHForest(min_hash_match=4, n_candidates=100, n_estimators=50, n_neighbors=2,
     radius=1.0, radius_cutoff_ratio=0.9, random_state=42)


In [16]:
listing_list = listing.select("idf").rdd.flatMap(lambda x: x).collect()
distances_lshf, indices_lshf = LSHForest(min_hash_match=4, 
                 n_candidates=100, 
                 n_estimators=50, 
                 n_neighbors=2,
                 radius=1.0, 
                 radius_cutoff_ratio=0.9, 
                 random_state=42).fit(listing_list).kneighbors(sample, n_neighbors=2)

distances_ground_truth, indices_ground_truth = NearestNeighbors(n_neighbors=2).fit(listing_list).kneighbors(sample)
print("Accuracy: {}".format(np.sum(np.equal(indices_lshf, indices_ground_truth)) / (len(listing_list) * 2)))

Accuracy: 0.014308040525975426
