# Project title: Price model for airbnb listings in Geneva

## Nearest Neighbor Analysis

### Libraries and settings

In [31]:
# Libraries
import folium
import platform
import pandas as pd
import seaborn as sns
import geopandas as gdp
import matplotlib.pyplot as plt
import sqlite3
import fnmatch

# Function to check whether a sqlite db-connection is still open
def check_conn(conn):
    """Report whether ``conn`` can still hand out a cursor.

    Parameters
    ----------
    conn : sqlite3.Connection
        The connection to probe.

    Returns
    -------
    bool
        True if a cursor could be obtained from ``conn``, False if that
        attempt raised (for example because the connection was already
        closed, or ``conn`` is not a connection at all).
    """
    try:
        # Close the probe cursor right away so it does not linger on the
        # connection after the check.
        conn.cursor().close()
        return True
    except Exception:
        # Deliberately broad: callers use this as a yes/no status check and
        # expect False instead of an exception.
        return False

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Import functions to calculate nearest-neighbors
import nn_functions as nn

### Import data from SQLite dbs

In [33]:
# Import listings data from listings.db
conn = sqlite3.connect('listings.db')
try:
    df_list = pd.read_sql_query("SELECT * FROM listings", conn)
finally:
    # Close this handle here: `conn` is re-bound to other databases later,
    # after which this connection could no longer be closed explicitly.
    conn.close()
df_list.head()

Unnamed: 0,id,neighbourhood_cleansed,lat,lon,property_type,room_type,price,accommodates,bathrooms,amenities,review_scores_rating,review_scores_location,instant_bookable,calculated_host_listings_count,lake_access,num_amenities
0,42515,Commune de Genève,46.2014,6.15507,Private room in apartment,Private room,72.0,1,1.5,"[""Cable TV"", ""Dishes and silverware"", ""Washer""...",4.69,4.79,0,1,0,22
1,44959,Commune de Genève,46.19704,6.16076,Private room in apartment,Private room,68.0,1,1.5,"[""Extra pillows and blankets"", ""Cable TV"", ""Po...",4.87,4.72,0,1,0,51
2,194110,Commune de Genève,46.2042,6.16067,Entire apartment,Entire home/apt,135.0,4,1.0,"[""Extra pillows and blankets"", ""Cable TV"", ""Di...",4.5,4.85,0,1,0,26
3,276025,Versoix,46.27944,6.16724,Entire apartment,Entire home/apt,65.0,5,1.0,"[""Extra pillows and blankets"", ""Portable fans""...",4.62,4.75,1,1,1,48
4,325986,Commune de Genève,46.21428,6.1461,Entire apartment,Entire home/apt,150.0,2,1.5,"[""Dishes and silverware"", ""Washer"", ""TV"", ""Smo...",4.84,4.76,0,1,1,26


In [34]:
# Import tourist attractions data from tourist.db
conn = sqlite3.connect('tourist.db')
try:
    df_ta = pd.read_sql_query("SELECT * FROM tourist", conn)
finally:
    # Release the handle before `conn` is re-bound to the next database
    conn.close()
print(df_ta.head())

# Import public transport data from oev.db
conn = sqlite3.connect('oev.db')
try:
    df_pb = pd.read_sql_query("SELECT * FROM oev", conn)
finally:
    conn.close()
print(df_pb.head())

           id        lat       lon                                    name  \
0   701851300  46.233972  6.055721  Globe de la Science et de l'Innovation   
1   981690477  46.184291  6.139151                        Musée de Carouge   
2   983366077  46.198282  6.108982                           Musée des EAN   
3  1768216261  46.220829  6.152126           Musée d'histoire des sciences   
4  2021513198  46.198828  6.137523                                   MAMCO   

  category  
0   museum  
1   museum  
2   museum  
3   museum  
4   museum  
         lat       lon         tags.name tags.railway
0  46.231334  6.110271   Genève-Aéroport        train
1  46.220426  6.094808           Vernier        train
2  46.263684  6.161146  Creux-de-Genthod        train
3  46.279740  6.165805           Versoix        train
4  46.222273  6.076288            Meyrin        train


In [35]:
# Close db connection (if open).
# check_conn() already turns any failure into False, so no extra
# try/except wrapper is needed around this check.
if check_conn(conn):
    conn.close()

# Status (True = open, False = closed)
print(check_conn(conn))

False


### Prepare geodataframes

In [36]:
# Convert data frame of listings data to geodataframe
# (point geometry built from the lon/lat columns)
df_list_geo = gdp.GeoDataFrame(
    df_list,
    geometry=gdp.points_from_xy(df_list['lon'], df_list['lat']),
)

# Set Coordinate Reference System (CRS).
# set_crs() returns a new GeoDataFrame unless inplace=True, so the result
# must be assigned back; otherwise the frame keeps having no CRS.
df_list_geo = df_list_geo.set_crs(4326, allow_override=True)
print(df_list_geo.shape)
df_list_geo.head()

(2024, 17)


Unnamed: 0,id,neighbourhood_cleansed,lat,lon,property_type,room_type,price,accommodates,bathrooms,amenities,review_scores_rating,review_scores_location,instant_bookable,calculated_host_listings_count,lake_access,num_amenities,geometry
0,42515,Commune de Genève,46.2014,6.15507,Private room in apartment,Private room,72.0,1,1.5,"[""Cable TV"", ""Dishes and silverware"", ""Washer""...",4.69,4.79,0,1,0,22,POINT (6.15507 46.2014)
1,44959,Commune de Genève,46.19704,6.16076,Private room in apartment,Private room,68.0,1,1.5,"[""Extra pillows and blankets"", ""Cable TV"", ""Po...",4.87,4.72,0,1,0,51,POINT (6.16076 46.19704)
2,194110,Commune de Genève,46.2042,6.16067,Entire apartment,Entire home/apt,135.0,4,1.0,"[""Extra pillows and blankets"", ""Cable TV"", ""Di...",4.5,4.85,0,1,0,26,POINT (6.16067 46.2042)
3,276025,Versoix,46.27944,6.16724,Entire apartment,Entire home/apt,65.0,5,1.0,"[""Extra pillows and blankets"", ""Portable fans""...",4.62,4.75,1,1,1,48,POINT (6.16724 46.27944)
4,325986,Commune de Genève,46.21428,6.1461,Entire apartment,Entire home/apt,150.0,2,1.5,"[""Dishes and silverware"", ""Washer"", ""TV"", ""Smo...",4.84,4.76,0,1,1,26,POINT (6.1461 46.21428)


In [37]:
# Convert data frame of tourist attraction data to geodataframe
# (point geometry built from the lon/lat columns)
df_ta_geo = gdp.GeoDataFrame(
    df_ta,
    geometry=gdp.points_from_xy(df_ta['lon'], df_ta['lat']),
)

# Set Coordinate Reference System (CRS).
# set_crs() returns a new GeoDataFrame unless inplace=True, so the result
# must be assigned back; otherwise the frame keeps having no CRS.
df_ta_geo = df_ta_geo.set_crs(4326, allow_override=True)
print(df_ta_geo.shape)
df_ta_geo.head()

(51, 6)


Unnamed: 0,id,lat,lon,name,category,geometry
0,701851300,46.233972,6.055721,Globe de la Science et de l'Innovation,museum,POINT (6.05572 46.23397)
1,981690477,46.184291,6.139151,Musée de Carouge,museum,POINT (6.13915 46.18429)
2,983366077,46.198282,6.108982,Musée des EAN,museum,POINT (6.10898 46.19828)
3,1768216261,46.220829,6.152126,Musée d'histoire des sciences,museum,POINT (6.15213 46.22083)
4,2021513198,46.198828,6.137523,MAMCO,museum,POINT (6.13752 46.19883)


In [38]:
# Convert data frame of public transport data to geodataframe
# (point geometry built from the lon/lat columns)
df_pb_geo = gdp.GeoDataFrame(
    df_pb,
    geometry=gdp.points_from_xy(df_pb['lon'], df_pb['lat']),
)

# Set Coordinate Reference System (CRS).
# set_crs() returns a new GeoDataFrame unless inplace=True, so the result
# must be assigned back; otherwise the frame keeps having no CRS.
df_pb_geo = df_pb_geo.set_crs(4326, allow_override=True)
print(df_pb_geo.shape)
df_pb_geo.head()


(193, 5)


Unnamed: 0,lat,lon,tags.name,tags.railway,geometry
0,46.231334,6.110271,Genève-Aéroport,train,POINT (6.11027 46.23133)
1,46.220426,6.094808,Vernier,train,POINT (6.09481 46.22043)
2,46.263684,6.161146,Creux-de-Genthod,train,POINT (6.16115 46.26368)
3,46.27974,6.165805,Versoix,train,POINT (6.16581 46.27974)
4,46.222273,6.076288,Meyrin,train,POINT (6.07629 46.22227)


### Identify the closest tourist attraction and public transport stop for each listing and calculate the distance to it

In [39]:
# Closest tourist attraction of each listing (with its distance)
closest_ta = nn.nearest_neighbor(df_list_geo, df_ta_geo, return_dist=True)

# Both lengths should match: one nearest attraction per listing
print(len(closest_ta), '==', len(df_list_geo))

# Give the geometry and distance columns attraction-specific names so they
# stay identifiable next to the listing columns
closest_ta = closest_ta.rename(
    columns={'geometry': 'closest_ta_geom', 'distance': 'dist_ta'}
)
closest_ta.head()

2024 == 2024


Unnamed: 0,id,lat,lon,name,category,closest_ta_geom,dist_ta
0,12466920546,46.205103,6.154957,Pierre Dyolin,attraction,POINT (6.15496 46.2051),409.552941
1,12467213889,46.199579,6.158517,"Poudingue de Vallorcine, Erratic Boulder",attraction,POINT (6.15852 46.19958),375.547668
2,12467213889,46.199579,6.158517,"Poudingue de Vallorcine, Erratic Boulder",attraction,POINT (6.15852 46.19958),564.152315
3,1768216261,46.220829,6.152126,Musée d'histoire des sciences,museum,POINT (6.15213 46.22083),6694.037169
4,5538694621,46.22123,6.147951,Lost,attraction,POINT (6.14795 46.22123),795.419536


In [40]:
# Closest public transport stop of each listing (with its distance)
closest_pb = nn.nearest_neighbor(df_list_geo, df_pb_geo, return_dist=True)

# Both lengths should match: one nearest stop per listing
print(len(closest_pb), '==', len(df_list_geo))

# Give the geometry and distance columns stop-specific names so they
# stay identifiable next to the listing columns
closest_pb = closest_pb.rename(
    columns={'geometry': 'closest_pb_geom', 'distance': 'dist_pb'}
)
closest_pb.head()

2024 == 2024


Unnamed: 0,lat,lon,tags.name,tags.railway,closest_pb_geom,dist_pb
0,46.201072,6.156107,Terrassière,tram,POINT (6.15611 46.20107),120.893645
1,46.200606,6.158541,Villereuse,tram,POINT (6.15854 46.20061),465.095061
2,46.200626,6.158549,Villereuse,tram,POINT (6.15855 46.20063),460.10988
3,46.27974,6.165805,Versoix,train,POINT (6.16581 46.27974),162.973109
4,46.213291,6.145502,Môle,tram,POINT (6.1455 46.21329),127.98462


### Merge closest tourist attraction and public transport to listings

### Radius search: count nearby tourist attractions and public transport stops

In [41]:
from scipy.spatial import KDTree

# Search radius around each listing, in metres
TA_RADIUS_M = 500

# Re-project the lon/lat coordinates (EPSG:4326) to the Swiss metric grid
# CH1903+ / LV95 (EPSG:2056) so that the KDTree's Euclidean distance is in
# metres. A radius given in degrees is not a fixed ground distance: around
# Geneva 0.005 deg is roughly 556 m north-south but only about 385 m east-west.
list_pts = gdp.GeoSeries(
    gdp.points_from_xy(df_list_geo['lon'], df_list_geo['lat']), crs=4326
).to_crs(2056)
ta_pts = gdp.GeoSeries(
    gdp.points_from_xy(df_ta_geo['lon'], df_ta_geo['lat']), crs=4326
).to_crs(2056)

# Create a KDTree from the projected tourist attraction coordinates
tree = KDTree(pd.DataFrame({'x': ta_pts.x, 'y': ta_pts.y}).to_numpy())

# For each listing: indices of the tourist attractions within TA_RADIUS_M
num_ta = tree.query_ball_point(
    pd.DataFrame({'x': list_pts.x, 'y': list_pts.y}).to_numpy(),
    r=TA_RADIUS_M,
)

# Add the number of tourist attractions within the radius to the listings data
df_list_geo['num_ta'] = [len(x) for x in num_ta]
df_list_geo.head()


Unnamed: 0,id,neighbourhood_cleansed,lat,lon,property_type,room_type,price,accommodates,bathrooms,amenities,review_scores_rating,review_scores_location,instant_bookable,calculated_host_listings_count,lake_access,num_amenities,geometry,num_ta
0,42515,Commune de Genève,46.2014,6.15507,Private room in apartment,Private room,72.0,1,1.5,"[""Cable TV"", ""Dishes and silverware"", ""Washer""...",4.69,4.79,0,1,0,22,POINT (6.15507 46.2014),4
1,44959,Commune de Genève,46.19704,6.16076,Private room in apartment,Private room,68.0,1,1.5,"[""Extra pillows and blankets"", ""Cable TV"", ""Po...",4.87,4.72,0,1,0,51,POINT (6.16076 46.19704),1
2,194110,Commune de Genève,46.2042,6.16067,Entire apartment,Entire home/apt,135.0,4,1.0,"[""Extra pillows and blankets"", ""Cable TV"", ""Di...",4.5,4.85,0,1,0,26,POINT (6.16067 46.2042),0
3,276025,Versoix,46.27944,6.16724,Entire apartment,Entire home/apt,65.0,5,1.0,"[""Extra pillows and blankets"", ""Portable fans""...",4.62,4.75,1,1,1,48,POINT (6.16724 46.27944),0
4,325986,Commune de Genève,46.21428,6.1461,Entire apartment,Entire home/apt,150.0,2,1.5,"[""Dishes and silverware"", ""Washer"", ""TV"", ""Smo...",4.84,4.76,0,1,1,26,POINT (6.1461 46.21428),0


In [42]:
# Search radius around each listing, in metres
STOP_RADIUS_M = 250

# Re-project the lon/lat coordinates (EPSG:4326) to the Swiss metric grid
# CH1903+ / LV95 (EPSG:2056) so that the KDTree's Euclidean distance is in
# metres. A radius given in degrees is not a fixed ground distance: around
# Geneva 0.0025 deg is roughly 278 m north-south but only about 193 m east-west.
list_pts = gdp.GeoSeries(
    gdp.points_from_xy(df_list_geo['lon'], df_list_geo['lat']), crs=4326
).to_crs(2056)
pb_pts = gdp.GeoSeries(
    gdp.points_from_xy(df_pb_geo['lon'], df_pb_geo['lat']), crs=4326
).to_crs(2056)

# Create a KDTree from the projected public transport coordinates
tree = KDTree(pd.DataFrame({'x': pb_pts.x, 'y': pb_pts.y}).to_numpy())

# For each listing: indices of the public transport stops within STOP_RADIUS_M
num_stops = tree.query_ball_point(
    pd.DataFrame({'x': list_pts.x, 'y': list_pts.y}).to_numpy(),
    r=STOP_RADIUS_M,
)
# Number of public transport stops within the radius of each listing
num_stops_list = [len(x) for x in num_stops]
# Add the counts as a new column of the listings data
df_list_geo['num_stops'] = num_stops_list
df_list_geo.head()

Unnamed: 0,id,neighbourhood_cleansed,lat,lon,property_type,room_type,price,accommodates,bathrooms,amenities,review_scores_rating,review_scores_location,instant_bookable,calculated_host_listings_count,lake_access,num_amenities,geometry,num_ta,num_stops
0,42515,Commune de Genève,46.2014,6.15507,Private room in apartment,Private room,72.0,1,1.5,"[""Cable TV"", ""Dishes and silverware"", ""Washer""...",4.69,4.79,0,1,0,22,POINT (6.15507 46.2014),4,3
1,44959,Commune de Genève,46.19704,6.16076,Private room in apartment,Private room,68.0,1,1.5,"[""Extra pillows and blankets"", ""Cable TV"", ""Po...",4.87,4.72,0,1,0,51,POINT (6.16076 46.19704),1,0
2,194110,Commune de Genève,46.2042,6.16067,Entire apartment,Entire home/apt,135.0,4,1.0,"[""Extra pillows and blankets"", ""Cable TV"", ""Di...",4.5,4.85,0,1,0,26,POINT (6.16067 46.2042),0,0
3,276025,Versoix,46.27944,6.16724,Entire apartment,Entire home/apt,65.0,5,1.0,"[""Extra pillows and blankets"", ""Portable fans""...",4.62,4.75,1,1,1,48,POINT (6.16724 46.27944),0,1
4,325986,Commune de Genève,46.21428,6.1461,Entire apartment,Entire home/apt,150.0,2,1.5,"[""Dishes and silverware"", ""Washer"", ""TV"", ""Smo...",4.84,4.76,0,1,1,26,POINT (6.1461 46.21428),0,2


### Add dist_ta, dist_pb, num_ta and num_stops to the listings and save them to a CSV file

In [43]:
# Copy the nearest-neighbor distances into the listings geodataframe.
# The assignment aligns rows on the index of both frames.
for dist_col, nearest in (('dist_ta', closest_ta), ('dist_pb', closest_pb)):
    df_list_geo[dist_col] = nearest[dist_col]
df_list_geo.head()

Unnamed: 0,id,neighbourhood_cleansed,lat,lon,property_type,room_type,price,accommodates,bathrooms,amenities,...,review_scores_location,instant_bookable,calculated_host_listings_count,lake_access,num_amenities,geometry,num_ta,num_stops,dist_ta,dist_pb
0,42515,Commune de Genève,46.2014,6.15507,Private room in apartment,Private room,72.0,1,1.5,"[""Cable TV"", ""Dishes and silverware"", ""Washer""...",...,4.79,0,1,0,22,POINT (6.15507 46.2014),4,3,409.552941,120.893645
1,44959,Commune de Genève,46.19704,6.16076,Private room in apartment,Private room,68.0,1,1.5,"[""Extra pillows and blankets"", ""Cable TV"", ""Po...",...,4.72,0,1,0,51,POINT (6.16076 46.19704),1,0,375.547668,465.095061
2,194110,Commune de Genève,46.2042,6.16067,Entire apartment,Entire home/apt,135.0,4,1.0,"[""Extra pillows and blankets"", ""Cable TV"", ""Di...",...,4.85,0,1,0,26,POINT (6.16067 46.2042),0,0,564.152315,460.10988
3,276025,Versoix,46.27944,6.16724,Entire apartment,Entire home/apt,65.0,5,1.0,"[""Extra pillows and blankets"", ""Portable fans""...",...,4.75,1,1,1,48,POINT (6.16724 46.27944),0,1,6694.037169,162.973109
4,325986,Commune de Genève,46.21428,6.1461,Entire apartment,Entire home/apt,150.0,2,1.5,"[""Dishes and silverware"", ""Washer"", ""TV"", ""Smo...",...,4.76,0,1,1,26,POINT (6.1461 46.21428),0,2,795.419536,127.98462


In [44]:
# Attach dist_ta and dist_pb to every listing in the plain data frame,
# matching rows on the listing id (left join keeps all rows of df_list)
dist_cols = df_list_geo[['id', 'dist_ta', 'dist_pb']]
df_list = df_list.merge(dist_cols, how='left', on='id')
df_list.head()

Unnamed: 0,id,neighbourhood_cleansed,lat,lon,property_type,room_type,price,accommodates,bathrooms,amenities,review_scores_rating,review_scores_location,instant_bookable,calculated_host_listings_count,lake_access,num_amenities,dist_ta,dist_pb
0,42515,Commune de Genève,46.2014,6.15507,Private room in apartment,Private room,72.0,1,1.5,"[""Cable TV"", ""Dishes and silverware"", ""Washer""...",4.69,4.79,0,1,0,22,409.552941,120.893645
1,44959,Commune de Genève,46.19704,6.16076,Private room in apartment,Private room,68.0,1,1.5,"[""Extra pillows and blankets"", ""Cable TV"", ""Po...",4.87,4.72,0,1,0,51,375.547668,465.095061
2,194110,Commune de Genève,46.2042,6.16067,Entire apartment,Entire home/apt,135.0,4,1.0,"[""Extra pillows and blankets"", ""Cable TV"", ""Di...",4.5,4.85,0,1,0,26,564.152315,460.10988
3,276025,Versoix,46.27944,6.16724,Entire apartment,Entire home/apt,65.0,5,1.0,"[""Extra pillows and blankets"", ""Portable fans""...",4.62,4.75,1,1,1,48,6694.037169,162.973109
4,325986,Commune de Genève,46.21428,6.1461,Entire apartment,Entire home/apt,150.0,2,1.5,"[""Dishes and silverware"", ""Washer"", ""TV"", ""Smo...",4.84,4.76,0,1,1,26,795.419536,127.98462


In [45]:
# Attach num_ta and num_stops to every listing in the plain data frame,
# matching rows on the listing id (left join keeps all rows of df_list)
count_cols = df_list_geo[['id', 'num_ta', 'num_stops']]
df_list = df_list.merge(count_cols, how='left', on='id')
df_list.head()


Unnamed: 0,id,neighbourhood_cleansed,lat,lon,property_type,room_type,price,accommodates,bathrooms,amenities,review_scores_rating,review_scores_location,instant_bookable,calculated_host_listings_count,lake_access,num_amenities,dist_ta,dist_pb,num_ta,num_stops
0,42515,Commune de Genève,46.2014,6.15507,Private room in apartment,Private room,72.0,1,1.5,"[""Cable TV"", ""Dishes and silverware"", ""Washer""...",4.69,4.79,0,1,0,22,409.552941,120.893645,4,3
1,44959,Commune de Genève,46.19704,6.16076,Private room in apartment,Private room,68.0,1,1.5,"[""Extra pillows and blankets"", ""Cable TV"", ""Po...",4.87,4.72,0,1,0,51,375.547668,465.095061,1,0
2,194110,Commune de Genève,46.2042,6.16067,Entire apartment,Entire home/apt,135.0,4,1.0,"[""Extra pillows and blankets"", ""Cable TV"", ""Di...",4.5,4.85,0,1,0,26,564.152315,460.10988,0,0
3,276025,Versoix,46.27944,6.16724,Entire apartment,Entire home/apt,65.0,5,1.0,"[""Extra pillows and blankets"", ""Portable fans""...",4.62,4.75,1,1,1,48,6694.037169,162.973109,0,1
4,325986,Commune de Genève,46.21428,6.1461,Entire apartment,Entire home/apt,150.0,2,1.5,"[""Dishes and silverware"", ""Washer"", ""TV"", ""Smo...",4.84,4.76,0,1,1,26,795.419536,127.98462,0,2


In [46]:
# Write the enriched listings to a csv file (row index is not written)
output_csv = 'listings_with_distances.csv'
df_list.to_csv(output_csv, index=False)
