# Homework 1 (include all parts from challenge 1+optional, 2 and homework)
Authors:
- Nazarii Drushchak
- Igor Babin
- Uliana Zbezhkhovska

In [12]:
!pip install findspark



In [15]:
import findspark
findspark.init()

In [16]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, avg, when
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

import pandas as pd

In [17]:
sc = pyspark.SparkContext('local[*]')
spark = SparkSession(sc)
spark

## Challenge I

In [None]:
!wget http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2023-09-03/visualisations/listings.csv

--2023-10-15 10:52:19--  http://data.insideairbnb.com/the-netherlands/north-holland/amsterdam/2023-09-03/visualisations/listings.csv
Resolving data.insideairbnb.com (data.insideairbnb.com)... 16.182.104.245, 54.231.163.237, 54.231.196.213, ...
Connecting to data.insideairbnb.com (data.insideairbnb.com)|16.182.104.245|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1698431 (1.6M) [application/csv]
Saving to: ‘listings.csv’


2023-10-15 10:52:20 (1.36 MB/s) - ‘listings.csv’ saved [1698431/1698431]



In [20]:
df = spark.read.csv("listings.csv", header=True, multiLine=True)
df.show(10)

+-------+--------------------+-------+---------------+-------------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+
|     id|                name|host_id|      host_name|neighbourhood_group|       neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|             license|
+-------+--------------------+-------+---------------+-------------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+
| 761411|Condo in Amsterda...|4013546|         Xsjong|               NULL|          Noord-Oost|52.40164|  4.95106|   Private ro

In [21]:
# Tokenize (remove punctuation and split by word), you can do it in pure python or using ml-lib tokenizer
tokenizer = Tokenizer(inputCol="name", outputCol="words")
wordData = tokenizer.transform(df)
wordData.show(10)

+-------+--------------------+-------+---------------+-------------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+--------------------+
|     id|                name|host_id|      host_name|neighbourhood_group|       neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|             license|               words|
+-------+--------------------+-------+---------------+-------------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+--------------------+
| 761411|Condo in Amsterda...|4013546|         Xsjong|          

In [22]:
# Remove stopwords using ML-LIB stopwordsremover, and store in a new column called “CleanTokens”
remover = StopWordsRemover(inputCol="words", outputCol="CleanTokens")
cleanData = remover.transform(wordData)
cleanData.show(10)

+-------+--------------------+-------+---------------+-------------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+--------------------+--------------------+
|     id|                name|host_id|      host_name|neighbourhood_group|       neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|             license|               words|         CleanTokens|
+-------+--------------------+-------+---------------+-------------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+--------------------+--------------------+
|

In [23]:
# But we don’t have a stopwordsremover for all language and contexts.
# Create your own list of stopwords from this text (think: what is a stopword?)
# Remove stopwords again, and store in column “MyCleanTokens”
remover = StopWordsRemover(stopWords=['the', 'a', 'an', 'another', "for", "an", "nor", "but", "or", "yet", "so", 
                                      "in", "under", "towards", "before"], inputCol='words', outputCol='MyCleanTokens')
cleanData = remover.transform(cleanData)
cleanData.show(10)

+-------+--------------------+-------+---------------+-------------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+--------------------+--------------------+--------------------+
|     id|                name|host_id|      host_name|neighbourhood_group|       neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|             license|               words|         CleanTokens|       MyCleanTokens|
+-------+--------------------+-------+---------------+-------------------+--------------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+--

In [24]:
# Perform TFIDF in a new column called “VectorSpace”
hashingTF = HashingTF(inputCol="MyCleanTokens", outputCol="VectorSpace", numFeatures=20)
featurizedData = hashingTF.transform(cleanData)

idf = IDF(inputCol="VectorSpace", outputCol="features")
idfModel = idf.fit(featurizedData)
results = idfModel.transform(featurizedData)

results.select("MyCleanTokens", "features").show(10)

+--------------------+--------------------+
|       MyCleanTokens|            features|
+--------------------+--------------------+
|[condo, amsterdam...|(20,[1,2,7,10,11,...|
|[rental, unit, am...|(20,[1,2,5,11,12,...|
|[boat, amsterdam,...|(20,[0,1,2,5,11,1...|
|[houseboat, amste...|(20,[0,1,5,6,11,1...|
|[rental, unit, am...|(20,[1,2,9,11,12,...|
|[bed, and, breakf...|(20,[0,1,2,7,11,1...|
|[rental, unit, am...|(20,[1,2,11,14,15...|
|[bed, and, breakf...|(20,[0,1,2,11,12,...|
|[condo, amsterdam...|(20,[0,1,2,5,7,10...|
|[townhouse, amste...|(20,[1,2,8,11,12,...|
+--------------------+--------------------+
only showing top 10 rows



## Homework (Optional)

In [25]:
# In a new column(‘word2vec’), repeat the procedure using word2vec instead of TF-IDF.
# https://spark.apache.org/docs/2.2.0/mllib-feature-extraction.html#word2vec

from pyspark.ml.feature import Word2Vec

word2Vec = Word2Vec(vectorSize=20, minCount=0, inputCol="MyCleanTokens", outputCol="word2vec")
model = word2Vec.fit(results)
result = model.transform(results)

result.select("MyCleanTokens", "word2vec").show(10)

+--------------------+--------------------+
|       MyCleanTokens|            word2vec|
+--------------------+--------------------+
|[condo, amsterdam...|[0.00846870935388...|
|[rental, unit, am...|[-0.0800861247948...|
|[boat, amsterdam,...|[-0.0027433656729...|
|[houseboat, amste...|[-0.2136970145556...|
|[rental, unit, am...|[-0.1120799005031...|
|[bed, and, breakf...|[0.09851416200399...|
|[rental, unit, am...|[-0.0722905439989...|
|[bed, and, breakf...|[0.01148741978865...|
|[condo, amsterdam...|[-0.0839317339871...|
|[townhouse, amste...|[-0.0639541620122...|
+--------------------+--------------------+
only showing top 10 rows



In [26]:
# show first row word2vec vector
result.select("word2vec").first()

Row(word2vec=DenseVector([0.0085, 0.0896, 0.0188, -0.011, -0.7019, 0.0164, -0.08, 0.181, -0.1832, -0.0382, -0.1302, 0.249, -0.2939, -0.4009, -0.4478, -0.16, 0.1739, -0.1798, -0.3863, 0.0576]))

## Challenge II

In [29]:
# Take the first 500 flats in the list
# Mysample = df.limit(500)

mysample = result.limit(500)
mysample.count()

500

In [13]:
# Find the 3 nearest neighbors for each element in that subset (candidates and query points are within the sample of 500) 
# USING KNN
from sklearn.neighbors import NearestNeighbors

mysample_pd = mysample.toPandas()
tfidf = mysample_pd['features'].tolist()
text = mysample_pd['name'].tolist()
id_ = mysample_pd['id'].tolist()

# fit nearest neighbors
nbrs = NearestNeighbors(n_neighbors=4).fit(tfidf)
distances, indices = nbrs.kneighbors(tfidf[:5])

# show 3 nearest neighbors for first row except itself
print('id', [id_[i] for i in indices[0]])

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Find the 3 nearest neighbors for each element in that subset (candidates and query points are within the sample of 500) 
# USING LSH with sklearn

# IT IS DEPRECATED
from sklearn.neighbors import LSHForest

mysample_pd = mysample.toPandas()
tfidf = mysample_pd['features'].tolist()
text = mysample_pd['name'].tolist()
id_ = mysample_pd['id'].tolist()

lshf = LSHForest(random_state=42)
lshf.fit(tfidf)

# get the feature vectore of the first row
query = tfidf[0]
id_ = id_[0]

# show 3 nearest neighbors for first row except itself
distances, indices = lshf.kneighbors([query], n_neighbors=4)
for i in range(1, len(distances[0])):
    print("distance: ", distances[0][i], "id: ", id_[indices[0][i]])
    

ImportError: cannot import name 'LSHForest' from 'sklearn.neighbors' (/opt/conda/lib/python3.11/site-packages/sklearn/neighbors/__init__.py)

In [30]:
# Find the 3 nearest neighbors for each element in that subset (candidates and query points are within the sample of 500) 
# USING LSH with pyspark
from pyspark.ml.feature import MinHashLSH

mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=3)
model = mh.fit(mysample)

# get the feature vectore of the first row
key =  mysample.select("features").take(1)[0].features
id_ = mysample.select("id").take(1)[0].id


# show 3 nearest neighbors for first row except itself
model.approxNearestNeighbors(mysample, key, 4).filter(col("id") != id_).show()



+-------+--------------------+-------+----------------+-------------------+-------------+--------+---------+---------------+-----+--------------+-----------------+-----------+-----------------+------------------------------+----------------+---------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+
|     id|                name|host_id|       host_name|neighbourhood_group|neighbourhood|latitude|longitude|      room_type|price|minimum_nights|number_of_reviews|last_review|reviews_per_month|calculated_host_listings_count|availability_365|number_of_reviews_ltm|             license|               words|         CleanTokens|       MyCleanTokens|         VectorSpace|            features|            word2vec|              hashes|           distCol|
+-------+--------------------+-------+----------------+-------------------+-------------+--------+

## Challenge III: Homework

In [32]:
# Repeat the LSH experiment for the full Barcelona dataset, and develop and spark boosted
# methodology to have an efficient parameter tuning process.
