In [1]:
import kagglehub
import pandas as pd
from scipy.spatial import KDTree
import numpy as np
import requests
from io import BytesIO
from zipfile import ZipFile
import os

In [2]:
# download dataset and load into DataFrame
path_hotel_reviews = kagglehub.dataset_download("jiashenliu/515k-hotel-reviews-data-in-europe")
df = pd.read_csv(path_hotel_reviews + '/Hotel_Reviews.csv')
df['id'] = range(1, len(df) + 1)
# drop rows with missing values (only longitude and latitude columns have missing)
df = df.dropna()

In [3]:
# download and load GeoNames data into  DataFrame
def download_and_load_geonames_data():
    url = "https://download.geonames.org/export/dump/cities500.zip"

    response = requests.get(url, stream=True)
    with ZipFile(BytesIO(response.content)) as zip_ref:
        with zip_ref.open("cities500.txt") as file:
            print("Loading GeoNames data...")
            columns = [
                "geonameid", "name", "asciiname", "alternatenames", "latitude",
                "longitude", "feature_class", "feature_code", "country_code",
                "cc2", "admin1_code", "admin2_code", "admin3_code", "admin4_code",
                "population", "elevation", "dem", "timezone", "modification_date"
            ]

            geonames = pd.read_csv(
                file,
                sep="\t",
                header=None,
                names=columns,
                #usecols=["name", "latitude", "longitude"]
            )
            return geonames

df_geonames = download_and_load_geonames_data()

Loading GeoNames data...


  geonames = pd.read_csv(


In [4]:
row_london = df_geonames.loc[(df_geonames["name"] == "London") & (df_geonames["country_code"] == "GB")]
latitude_london, longitude_london = row_london.iloc[0][["latitude", "longitude"]]

# calculate distance
def haversine(lat1, lon1, lat2, lon2):
    R = 6371 
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c

# calculate distance to london for every city
df_geonames['Distance_to_London_km'] = df_geonames.apply(
    lambda row: haversine(latitude_london, longitude_london, row['latitude'], row['longitude']),
    axis=1
)


In [5]:
# find the closest city to a given location
def find_closest_city(lat, lon, geonames, kdtree):
    distance, index = kdtree.query((lat, lon))
    city = geonames.iloc[index]
    return city["geonameid"]

kd_tree = KDTree(df_geonames[["latitude", "longitude"]])
df["geonameid"] = df.apply(
    lambda row: find_closest_city(row["lat"], row["lng"], df_geonames, kd_tree), axis=1
)


In [6]:
# join by geonameid and filter for distance to London <= 15 km
df_merged = df.merge(df_geonames, on="geonameid")
df_merged = df_merged[df_merged["Distance_to_London_km"] <= 15]

# transform Review_Date to datetime and days_since_review to timedelta
df_merged['Review_Date'] = pd.to_datetime(df_merged['Review_Date'], format='%m/%d/%Y').dt.strftime('%Y-%m-%d')
df_merged['days_since_review'] = df_merged['days_since_review'].str.extract(r'(\d+)').astype('int64')

# create result DataFrame
columns_to_keep = ['id','Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date', 'Average_Score', 'Hotel_Name', 'Reviewer_Nationality', 'Negative_Review'
                   ,'Review_Total_Negative_Word_Counts', 'Total_Number_of_Reviews', 'Positive_Review', 'Review_Total_Positive_Word_Counts', 'Total_Number_of_Reviews_Reviewer_Has_Given'
                   ,'Reviewer_Score', 'Tags', 'days_since_review', 'lat', 'lng', 'Distance_to_London_km', 'name']


# select columns to keep and rename columns
df_result = df_merged[columns_to_keep]
df_result = df_result.assign(district=df_result['name'], city='London')
df_result.drop(columns=['name'], inplace=True)

# save result to csv (create directory if not exists)
file_path_booking_london = 'data/booking_london'
os.makedirs(os.path.dirname(file_path_booking_london), exist_ok=True)

df_result.to_csv(file_path_booking_london+'/hotel_reviews_london.csv', index=False, header=True)
df_result.to_parquet(file_path_booking_london+'/hotel_reviews_london.parquet', index=False)
# to read: df = pd.read_parquet(file_path_booking_london+'/hotel_reviews_london.parquet')