In [2]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

In [3]:
# Read the two consolidated GeoJSON layers (accommodations first, then POIs).
accommodations, pois = (
    gpd.read_file(geojson_path)
    for geojson_path in (
        "./dataprog/consolidated_accomodations.geojson",
        "./dataprog/consolidated_pois.geojson",
    )
)

In [4]:
# Projected CRS (EPSG code) used for metric buffering, keyed by the lowercase
# country name used in the datasets. Codes of the form 326xx are
# WGS 84 / UTM northern-hemisphere zones. Entries are listed alphabetically.
country_crs_mapping = {
    'albania': 32634,
    'andorra': 32631,
    'austria': 32633,
    'azores': 32626,
    'belarus': 32635,
    'belgium': 32631,
    'bosnia-herzegovina': 32633,
    'bulgaria': 32635,
    'croatia': 32633,
    'cyprus': 32636,
    'czech-republic': 32633,
    'denmark': 32632,
    'estonia': 32635,
    'faroe-islands': 32629,
    'finland': 32635,
    'france': 32631,  # single default zone chosen for the whole country
    'georgia': 32638,
    'germany': 32632,  # single default zone chosen for the whole country
    'greece': 32634,  # single default zone chosen for the whole country
    'guernsey-jersey': 32630,
    'hungary': 32634,
    'iceland': 32627,
    'ireland-and-northern-ireland': 32629,
    'isle-of-man': 32630,
    'italy': 32632,  # single default zone chosen for the whole country
    'kosovo': 32634,
    'latvia': 32635,
    'liechtenstein': 32632,
    'lithuania': 32635,
    'luxembourg': 32631,
    'macedonia': 32634,
    'malta': 32633,
    'moldova': 32635,
    'monaco': 32632,
    'montenegro': 32634,
    'netherlands': 32631,
    'norway': 32632,
    'poland': 32634,
    'portugal': 32629,
    'romania': 32635,
    'russia': 32636,  # single default zone chosen for the whole country
    'serbia': 32634,
    'slovakia': 32633,
    'slovenia': 32633,
    'spain': 32630,  # single default zone chosen for the whole country
    'sweden': 32633,  # single default zone chosen for the whole country
    'switzerland': 32632,
    'turkey': 32636,
    'ukraine': 32636,
    'united-kingdom': 32630,
}

# Geographic CRS that results are converted back to after buffering.
global_crs = "EPSG:4326"

In [5]:
# Preview the 20 accommodations with the highest tag_count.
print(
    accommodations
    .sort_values(by=['tag_count'], ascending=False)
    .head(n=20)
)

                  id                           name  tag_count  \
79119     a669656790             Hotel de Normandie         81   
138234    a151363884                 Pfannkuchenhof         76   
141668    a743908278                     Oberreuter         69   
326740    a304507544                 Faro de Lariño         65   
43405    a2599860992      Radisson Blu Iveria Hotel         62   
162642   n1362728026     Hagen`s Hotel & Restaurant         59   
180719    a482039852       Hotel Brasserie Chaussee         58   
79099    n8787692303                    Premier Inn         56   
175425    a149467118                    Ardey Hotel         56   
174218     a61492652   ARAMIS Tagungs- & Sporthotel         55   
290389    a585753090             Rhapsody Hotel Kas         55   
103655  n11459165924               Chez le tourneur         54   
102741   n8440984758                   La tournerie         54   
79158    a1820323476                    Premier Inn         54   
79082    n

In [6]:
country = 'austria'

# Projected CRS for this country; names missing from the mapping use EPSG:3035
target_epsg = country_crs_mapping.get(country.lower(), 3035)

# Restrict both layers to the country and reproject them to that CRS
subset_accommodations = (
    accommodations[accommodations['country'] == country]
    .copy()
    .to_crs(epsg=target_epsg)
)
country_pois = (
    pois[pois['country'] == country]
    .copy()
    .to_crs(epsg=target_epsg)
)

# Buffer radii to evaluate
radii = [500, 1000, 5000]

# One buffer polygon per accommodation for every radius, computed up front
buffers_by_radius = {
    radius: subset_accommodations["geometry"].buffer(radius)
    for radius in radii
}

subset_result = subset_accommodations.copy()
for radius, buffers in buffers_by_radius.items():
    if buffers.isna().any():
        print(f"Some buffers are None for radius {radius}m in {country}")

    # For each buffer: bounding-box candidates from the spatial index first,
    # then the exact point-in-polygon test, then a count per tag_value.
    enriched_data = []
    for buffer_geom in buffers:
        candidate_positions = list(country_pois.sindex.query(buffer_geom))
        candidates = country_pois.iloc[candidate_positions]
        inside_mask = candidates.geometry.within(buffer_geom)
        category_counts = candidates[inside_mask]['tag_value'].value_counts()
        enriched_data.append(category_counts.to_dict())

    # One integer column per category for this radius; missing categories are 0
    summary_df = pd.DataFrame(enriched_data).fillna(0).astype(int)
    summary_df = summary_df.rename(columns=lambda col: f'{col}_within_{radius}m')
    subset_result = pd.concat(
        [subset_result.reset_index(drop=True), summary_df.reset_index(drop=True)],
        axis=1,
    )

# Back to the shared geographic CRS
subset_result = subset_result.to_crs(global_crs)

# Print the first few rows of the result
print(subset_result.head())

           id         name  tag_count  has_wikimedia_commons  has_wikipedia  \
0  n293407883  Cilli Hütte          6                      0              0   
1  n379504504         None          0                      0              0   
2  n379504607         None          0                      0              0   
3  n379504614         None          0                      0              0   
4  n379504623         None          0                      0              0   

   country  tag_key tag_value                   geometry  \
0  austria  tourism    chalet  POINT (10.65338 47.47511)   
1  austria  tourism    chalet  POINT (10.64884 47.07146)   
2  austria  tourism    chalet  POINT (10.64816 47.07113)   
3  austria  tourism    chalet  POINT (10.64834 47.07117)   
4  austria  tourism    chalet  POINT (10.64852 47.07126)   

   restaurant_within_500m  ...  wreck_within_5000m  machine_within_5000m  \
0                       6  ...                   0                     0   
1           

In [9]:
# Accommodations with the most restaurants within 500 m
print(
    subset_result
    .sort_values(by=['restaurant_within_500m'], ascending=False)
    .head(n=20)
)

# Columns in which every value equals 0
all_zero_mask = (subset_result == 0).all()
zero_columns = [col for col, is_all_zero in all_zero_mask.items() if is_all_zero]
print(f"Columns with only zeros: {zero_columns}")

# Columns holding a single distinct value across all rows
distinct_counts = subset_result.nunique()
constant_columns = [col for col, n_distinct in distinct_counts.items() if n_distinct == 1]
print(f"Columns with constant values: {constant_columns}")

# Summary statistics for every column, numeric or not
print("\nGeneral Statistics:")
print(subset_result.describe(include='all'))

# Every column that is not all-zero (this includes the non-numeric columns)
zero_column_set = set(zero_columns)
non_zero_columns = [col for col in subset_result.columns if col not in zero_column_set]
print(f"\nColumns with non-zero values: {non_zero_columns}")

# Quick look at those columns
print("\nSubset of non-zero columns (first few rows):")
print(subset_result[non_zero_columns].head())


               id                                  name  tag_count  \
2195   n956430138                           Hotel Royal          9   
4323  n9045438017                                 Royal          1   
3460  n4234525389                           DO&CO Hotel          2   
5554     a3054755                Hotel am Stephansplatz         13   
3437  n4141815806                          City Pension          7   
2186   n948703143              Hotel Kaiserin Elisabeth          8   
6688   a338372884                  Zum König von Ungarn         16   
2823  n1958068812                                Topazz          4   
2322  n1203033490                           Hotel Lamée          9   
2861  n2046313315                           Hotel Wandl         15   
3438  n4141815842                  Arthotel ANA Amadeus          5   
9698  n4101228995                   Pension Neuer Markt         17   
2198   n957695885                               Domizil          7   
1859   n567059348   

In [6]:
# Define the radii for testing
radii = [500, 1000, 5000]


def count_pois_by_category(buffers, candidate_pois):
    """Count POIs per ``tag_value`` inside each buffer polygon.

    Parameters
    ----------
    buffers : iterable of geometries
        One polygon per accommodation, in the same CRS as ``candidate_pois``.
    candidate_pois : GeoDataFrame
        POIs to search; must have ``geometry`` and ``tag_value`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per buffer (positional order) and one integer column per
        ``tag_value`` found in at least one buffer; absent categories are 0.
    """
    spatial_index = candidate_pois.sindex
    counts_per_buffer = []
    for buffer_geom in buffers:
        # Cheap bounding-box candidates first, exact containment test second
        candidate_positions = list(spatial_index.query(buffer_geom))
        candidates = candidate_pois.iloc[candidate_positions]
        inside = candidates[candidates["geometry"].within(buffer_geom)]
        counts_per_buffer.append(inside['tag_value'].value_counts().to_dict())
    return pd.DataFrame(counts_per_buffer).fillna(0).astype(int)


# Per-country results are collected here and concatenated once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration.
country_results = []

# Group accommodations by country
for country, subset_accommodations in accommodations.groupby('country'):
    # Get the corresponding EPSG for the country
    target_epsg = country_crs_mapping.get(country.lower(), 3035)  # Default to EPSG:3035
    print(target_epsg)

    # Transform accommodations and POIs to the appropriate CRS
    subset_accommodations = subset_accommodations.to_crs(epsg=target_epsg)
    country_pois = pois[pois['country'] == country].to_crs(epsg=target_epsg)

    # Positional index so the per-radius summaries line up row by row
    subset_result = subset_accommodations.reset_index(drop=True)

    for radius in radii:
        # Buffers are only needed while counting, so they are never stored
        # as columns on the result frame.
        buffers = subset_accommodations["geometry"].buffer(radius)
        if buffers.isna().any():
            print(f"Some buffers are None for radius {radius}m in {country}")

        # Add enrichment data as new columns (one column per category, per radius)
        summary_df = count_pois_by_category(buffers, country_pois)
        summary_df.columns = [f'{col}_within_{radius}m' for col in summary_df.columns]
        subset_result = pd.concat([subset_result, summary_df], axis=1)

    # Transform back to global CRS for consistency
    country_results.append(subset_result.to_crs(global_crs))

# Single concatenation; categories missing for a country become NaN here.
# pd.concat raises on an empty list, so fall back to an empty frame.
result = pd.concat(country_results, ignore_index=True) if country_results else pd.DataFrame()

# Final result
print(result)


32634
32631
32633
32626
32635
32631
32633
32635
32633
32636
32633
32632
32635
32629
32635
32631
32638
32632
32634
32630
32634
32627
32629
32630
32632
32634
32635
32632
32635
32631
32634
32633
32635
32632
32634
32631
32632
32634
32629
32635
32636
32634
32633
32633
32630
32633
32632
32636
32636
32630
                 id                         name  tag_count  \
0         n33208727               Panorama Hotel          2   
1        n274322328              Hotel Mangalemi          7   
2        n611766908                    Nord Park          4   
3       n1195395943  Dajti Tower Belvedere Hotel          8   
4       n1248191561     Bar Restaurant Hotel Ilo          6   
...             ...                          ...        ...   
387854  a2693529860           Falcon Manor Hotel          5   
387855  a2695914560                  Premier Inn         15   
387856  a2696836310                  Court Royal          3   
387857  a2697928856           The Sandgate Hotel          6   
387858 

In [13]:
# Per-category POI count columns, named '<tag_value>_within_<radius>m'
has_columns = [col for col in result.columns if '_within_' in col and col.endswith('m')]

# Replace NaN with 0 only in the identified columns (a category that never
# occurred for a country has no value after the per-country frames are combined)
result[has_columns] = result[has_columns].fillna(0).astype(int)

# Expose the point coordinates as plain numeric columns.
# Fixed: the original line read `pd.concatresult['geometry'].y`, which raises
# AttributeError because pandas has no attribute `concatresult`.
result['lat'] = result['geometry'].y
result['lon'] = result['geometry'].x

# Print the first few rows to verify the changes
print(result.head())
print(result.describe(include='all'))

  super().__setitem__(key, value)
  super().__setitem__(key, value)


            id                         name  tag_count  has_wikimedia_commons  \
0    n33208727               Panorama Hotel          2                      0   
1   n274322328              Hotel Mangalemi          7                      0   
2   n611766908                    Nord Park          4                      0   
3  n1195395943  Dajti Tower Belvedere Hotel          8                      0   
4  n1248191561     Bar Restaurant Hotel Ilo          6                      0   

   has_wikipedia  country  tag_key tag_value                   geometry  \
0              0  albania  tourism     hotel  POINT (19.76731 40.09976)   
1              0  albania  tourism     hotel    POINT (19.95094 40.706)   
2              0  albania  tourism     hotel   POINT (19.69842 41.4704)   
3              0  albania  tourism     hotel  POINT (19.90584 41.36834)   
4              0  albania  tourism     hotel  POINT (20.91066 40.76457)   

   restaurant_within_500m  ...  creamery_within_1000m  \
0    

In [15]:
output_path = 'train.csv'  # Replace with your desired file path

# Geometry and the identifying columns are left out of the exported table.
columns_to_drop = ['geometry', 'id', 'name']
(
    result
    .drop(columns=columns_to_drop)
    .to_csv(output_path, index=False)
)

print(f"GeoDataFrame exported to {output_path} without the geometry column")

GeoDataFrame exported to train.csv without the geometry column
