In [17]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from rapidfuzz import fuzz

hotels = gpd.read_file("hotels-austria-named.geojson")
print(hotels.head())
print(hotels.crs)
print(hotels.columns)
print(hotels.count())
print(hotels["name"].isna().sum())
print(hotels.geometry.type.unique())

          id   UID  WLAN access access:fee accommodation addr2:city  \
0  n20824641  None  None   None       None          None       None   
1  n21261459  None  None   None       None          None       None   
2  n27293696  None  None   None       None          None       None   
3  n27293699  None  None   None       None          None       None   
4  n27293701  None  None   None       None          None       None   

  addr2:country addr2:hamlet addr2:housenumber  ... wheelchair:rooms  \
0          None         None              None  ...             None   
1          None         None              None  ...             None   
2          None         None              None  ...             None   
3          None         None              None  ...             None   
4          None         None              None  ...             None   

  wheelchair:source wheelchair:step_height wikidata wikimedia_commons  \
0              None                   None     None              No

Convert coordinates and split shapes

In [18]:
hotels = hotels.to_crs(epsg=32632)

gdf_points = hotels[hotels.geometry.type == "Point"]
gdf_polygons = hotels[hotels.geometry.type.isin(["LineString", "MultiPolygon"])]

print(f"Number of points: {len(gdf_points)}")
print(f"Number of polygons: {len(gdf_polygons)}")

Number of points: 3343
Number of polygons: 7019


Dedupe overlapping polygons

In [32]:
pairs = gpd.sjoin(gdf_polygons, gdf_polygons, how='inner')
print(len(pairs))
pairs = pairs[pairs.id_left != pairs.id_right] # remove self joins
pairs = pairs[pairs.id_left < pairs.id_right] # remove swapped orders e.g. (A,B) and (B,A)
print(pairs.filter(regex="^id").columns)
print(len(pairs))
pairs["similarity_score"] = pairs.apply(
    lambda row: fuzz.ratio(row["name_left"], row["name_right"]), axis=1
)
THRESHOLD = 80
duplicates = pairs[pairs["similarity_score"] > THRESHOLD].copy()

# Merge non-NaN counts for left and right polygons in the duplicates DataFrame
gdf_polygons["non_nan_count"] = gdf_polygons.notna().sum(axis=1)
print(duplicates.filter(regex="^id").columns)
duplicates = duplicates.merge(
    gdf_polygons[["id", "non_nan_count"]], left_on="id_left", right_on="id", how="left"
)
print(duplicates.filter(regex="^id").columns)
duplicates = duplicates.merge(
    gdf_polygons[["id", "non_nan_count"]], left_on="id_right", right_on="id", how="left"
)
print(duplicates.filter(regex="non_nan_count").columns)
duplicates["keep_id"] = duplicates.apply(
    lambda row: row["id_left"] if row["non_nan_count_x"] >= row["non_nan_count_y"] else row["id_right"],
    axis=1
)

to_remove = duplicates.apply(
    lambda row: row["id_right"] if row["keep_id"] == row["id_left"] else row["id_left"],
    axis=1
)
print(len(to_remove))
gdf_polygons_cleaned = gdf_polygons.loc[~gdf_polygons.id.isin(to_remove.unique())]
print(len(gdf_polygons))
print(gdf_polygons_cleaned["name"].isna().sum())

14553
Index(['id_left', 'id_right'], dtype='object')
3767
Index(['id_left', 'id_right'], dtype='object')
Index(['id_left', 'id_right', 'id'], dtype='object')
Index(['non_nan_count_left', 'non_nan_count_right', 'non_nan_count_x',
       'non_nan_count_y'],
      dtype='object')
3496
7019
95


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [None]:
Remove points within polygons

In [33]:
points_in_polygons = gdf_points.sjoin(gdf_polygons_cleaned, how="inner", predicate="within")
points_to_remove = points_in_polygons.index
gdf_points_cleaned = gdf_points.loc[~gdf_points.index.isin(points_to_remove)]
print(f"Number of points after removal: {len(gdf_points_cleaned)}")

Number of points after removal: 3291


Find point that will be used as the hotel location within polygons

In [47]:
gdf_polygons_cleaned_copy = gdf_polygons_cleaned.copy()
gdf_polygons_cleaned_copy['geometry'] = gdf_polygons_cleaned_copy.geometry.centroid
gdf_combined = gpd.GeoDataFrame(pd.concat([gdf_points_cleaned, gdf_polygons_cleaned_copy], ignore_index=True),
                                crs=gdf_polygons_cleaned_copy.crs)
gdf_combined = gdf_combined.to_crs(4326)# long/lat

print(gdf_combined.isna().sum())

id                    0
UID                6867
WLAN               6871
access             6813
access:fee         6871
                   ... 
wlan               6871
wpt_description    6866
wpt_symbol         6861
geometry              0
non_nan_count      3291
Length: 453, dtype: int64


In [48]:
# Calculate the percentage of non-NaN values for each column
non_nan_percentage = gdf_combined.notna().mean() * 100

# Sort the columns by percentage of non-NaN values in descending order
sorted_columns = non_nan_percentage.sort_values(ascending=False)

# Display the top columns with high amounts of data
print(sorted_columns.head(30))

geometry               100.000000
id                     100.000000
name                    89.930151
tourism                 89.289872
addr:housenumber        70.867288
addr:postcode           69.674040
addr:city               69.426659
addr:country            61.728754
addr:street             58.119907
non_nan_count           52.110012
building                50.116414
website                 49.781723
phone                   33.309080
stars                   23.588475
email                   22.045984
source                  19.077416
internet_access         18.218859
operator                17.680442
wheelchair              14.813737
addr:place              14.188009
at_bev:addr_date        13.809662
amenity                 11.830617
internet_access:fee      9.633295
contact:phone            9.458673
fax                      8.993015
check_date               7.697905
contact:email            7.348661
wikidata                 7.013970
entrance                 6.722934
contact:websit

In [55]:
print(gdf_combined[['id', 'geometry', 'stars']].dtypes)
print(gdf_combined.head())
gdf_combined['stars_numeric'] = pd.to_numeric(gdf_combined['stars'], errors='coerce')
gdf_combined['lon'] = gdf_combined['geometry'].x
gdf_combined['lat'] = gdf_combined['geometry'].y
gdf_combined.head()

id            object
geometry    geometry
stars         object
dtype: object
          id   UID  WLAN access access:fee accommodation addr2:city  \
0  n20824641  None  None   None       None          None       None   
1  n21261459  None  None   None       None          None       None   
2  n27293696  None  None   None       None          None       None   
3  n27293699  None  None   None       None          None       None   
4  n27293701  None  None   None       None          None       None   

  addr2:country addr2:hamlet addr2:housenumber  ... wikimedia_commons  \
0          None         None              None  ...              None   
1          None         None              None  ...              None   
2          None         None              None  ...              None   
3          None         None              None  ...              None   
4          None         None              None  ...              None   

  wikipedia  wlan wpt_description wpt_symbol             

Unnamed: 0,id,UID,WLAN,access,access:fee,accommodation,addr2:city,addr2:country,addr2:hamlet,addr2:housenumber,...,wikimedia_commons,wikipedia,wlan,wpt_description,wpt_symbol,geometry,non_nan_count,stars_numeric,lon,lat
0,n20824641,,,,,,,,,,...,,,,,,POINT (14.09505 46.62949),,,14.09505,46.629494
1,n21261459,,,,,,,,,,...,,,,,,POINT (15.44921 47.07194),,4.0,15.449214,47.07194
2,n27293696,,,,,,,,,,...,,,,,,POINT (12.08569 47.50752),,,12.085686,47.507517
3,n27293699,,,,,,,,,,...,,,,,,POINT (12.08912 47.5141),,,12.089119,47.514099
4,n27293701,,,,,,,,,,...,,,,,,POINT (12.08817 47.51285),,,12.088166,47.51285


In [57]:
subset = gdf_combined[['id', 'stars_numeric','lat','lon']].rename(columns={'stars_numeric': 'stars'})
subset['stars'] = subset['stars'].astype('Int64')
subset.to_csv('accomodations.csv', index=False)
print("Subset saved as 'subset.csv'.")

Subset saved as 'subset.csv'.
