In [3]:
!wget -O listings.csv.gz https://data.insideairbnb.com/spain/catalonia/barcelona/2024-09-06/data/listings.csv.gz
!wget -O calendar.csv.gz https://data.insideairbnb.com/spain/catalonia/barcelona/2024-09-06/data/calendar.csv.gz
!wget -O reviews.csv.gz https://data.insideairbnb.com/spain/catalonia/barcelona/2024-09-06/data/reviews.csv.gz

!gunzip -f listings.csv.gz
!gunzip -f calendar.csv.gz
!gunzip -f reviews.csv.gz

--2024-10-16 10:31:30--  https://data.insideairbnb.com/spain/catalonia/barcelona/2024-09-06/data/listings.csv.gz
Resolving data.insideairbnb.com (data.insideairbnb.com)... 108.138.51.28, 108.138.51.112, 108.138.51.6, ...
Connecting to data.insideairbnb.com (data.insideairbnb.com)|108.138.51.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9977937 (9.5M) [application/x-gzip]
Saving to: ‘listings.csv.gz’


2024-10-16 10:31:31 (25.8 MB/s) - ‘listings.csv.gz’ saved [9977937/9977937]

--2024-10-16 10:31:31--  https://data.insideairbnb.com/spain/catalonia/barcelona/2024-09-06/data/calendar.csv.gz
Resolving data.insideairbnb.com (data.insideairbnb.com)... 108.138.51.28, 108.138.51.112, 108.138.51.6, ...
Connecting to data.insideairbnb.com (data.insideairbnb.com)|108.138.51.28|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 18042988 (17M) [application/x-gzip]
Saving to: ‘calendar.csv.gz’


2024-10-16 10:31:32 (27.2 MB/s) - ‘calendar.csv.gz’

In [27]:
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

def create_info_df(df):
    info = pd.DataFrame(df.dtypes, columns=['type'])
    info['missing values'] = df.isnull().sum()
    info['example value'] = df.loc[0]
    return info

listings_df = pd.read_csv('listings.csv')
print(listings_df.shape)
create_info_df(listings_df)

(19482, 75)


Unnamed: 0,type,missing values,example value
id,int64,0,18674
listing_url,object,0,https://www.airbnb.com/rooms/18674
scrape_id,int64,0,20240906140800
last_scraped,object,0,2024-09-06
source,object,0,city scrape
name,object,0,Huge flat for 8 people close to Sagrada Familia
description,object,817,"110m2 apartment to rent in Barcelona. Located in the Eixample district, near the Sagrada Familia. It has a small balcony where you can see the temple of Gaudi. Capacity for 8 people. <br /><br />Licence number: HUTB-002062"
neighborhood_overview,object,9444,"Apartment in Barcelona located in the heart of Eixample district, within only 150 m form the great Sagrada Familia and really near of Gaudí Avenue and the famous Sant Pau Hospital . <br />All kind of services in surroundings (shops, supermarkets, restaurants, bars)."
picture_url,object,0,https://a0.muscache.com/pictures/13031453/413cdbfc_original.jpg
host_id,int64,0,71615


In [6]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, MinHashLSH
from pyspark.ml import Pipeline
from pyspark.sql.functions import col
import time

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Barcelona_LSH") \
    .getOrCreate()

# Load Barcelona dataset into Spark DataFrame
listings_df = spark.read.csv("listings.csv", header=True, inferSchema=True)

# Show the schema to identify correct column names
listings_df.printSchema()

# Select relevant columns (replace 'Title' with the correct column, e.g., 'name')
text_df = listings_df.select("name").na.drop()

# Tokenize the text (split into words)
tokenizer = Tokenizer(inputCol="name", outputCol="words")
wordsData = tokenizer.transform(text_df)

# Use HashingTF to create term frequency vectors
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=10000)
featurizedData = hashingTF.transform(wordsData)

# Calculate the TF-IDF
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Define a function to run LSH with MinHash
def run_lsh_experiment(data, params):
    # Extract parameters
    num_hash_tables = params['numHashTables']
    bucket_length = params['bucketLength']
    
    # Initialize LSH
    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=num_hash_tables)
    
    # Fit the model
    model = mh.fit(data)
    
    # Perform Approximate Similarity Join
    similar_items = model.approxSimilarityJoin(data, data, threshold=bucket_length, distCol="JaccardDistance")
    
    # Filter out exact matches (self-pairs)
    similar_items = similar_items.filter(col("datasetA.name") != col("datasetB.name"))
    
    # Count number of similar pairs found
    count_similar_pairs = similar_items.count()
    
    return count_similar_pairs

# Function to tune parameters using Grid Search
def grid_search_tuning(data, param_grid):
    results = []
    for params in param_grid:
        start_time = time.time()
        similar_pairs_count = run_lsh_experiment(data, params)
        computation_time = time.time() - start_time
        results.append({
            'params': params,
            'similar_pairs_count': similar_pairs_count,
            'computation_time': computation_time
        })
    return results

# Define parameter grid for LSH
param_grid = [
    {'numHashTables': 3, 'bucketLength': 0.1},
    {'numHashTables': 5, 'bucketLength': 0.05},
    {'numHashTables': 7, 'bucketLength': 0.03},
    {'numHashTables': 10, 'bucketLength': 0.01}
]

# Run grid search for parameter tuning
results = grid_search_tuning(rescaledData, param_grid)

# Print results
for result in results:
    print(f"Params: {result['params']}, Similar Pairs: {result['similar_pairs_count']}, Time: {result['computation_time']}s")

# Stop Spark session after completion
spark.stop()


root
 |-- id: string (nullable = true)
 |-- listing_url: string (nullable = true)
 |-- scrape_id: string (nullable = true)
 |-- last_scraped: string (nullable = true)
 |-- source: string (nullable = true)
 |-- name: string (nullable = true)
 |-- description: string (nullable = true)
 |-- neighborhood_overview: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_url: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- host_since: string (nullable = true)
 |-- host_location: string (nullable = true)
 |-- host_about: string (nullable = true)
 |-- host_response_time: string (nullable = true)
 |-- host_response_rate: string (nullable = true)
 |-- host_acceptance_rate: string (nullable = true)
 |-- host_is_superhost: string (nullable = true)
 |-- host_thumbnail_url: string (nullable = true)
 |-- host_picture_url: string (nullable = true)
 |-- host_neighbourhood: string (nullable = true)
 |-- host_listings_cou

24/10/17 18:09:12 WARN MemoryStore: Not enough space to cache broadcast_30 in memory! (computed 376.0 MiB so far)
24/10/17 18:09:12 WARN BlockManager: Persisting block broadcast_30 to disk instead.
24/10/17 18:09:12 WARN MemoryStore: Not enough space to cache broadcast_30 in memory! (computed 369.0 MiB so far)
                                                                                

Params: {'numHashTables': 3, 'bucketLength': 0.1}, Similar Pairs: 628, Time: 5.76982307434082s
Params: {'numHashTables': 5, 'bucketLength': 0.05}, Similar Pairs: 604, Time: 8.066999912261963s
Params: {'numHashTables': 7, 'bucketLength': 0.03}, Similar Pairs: 604, Time: 10.209972858428955s
Params: {'numHashTables': 10, 'bucketLength': 0.01}, Similar Pairs: 604, Time: 15.870138168334961s
