# Load Data
In this notebook we load and preprocess the OSM (OpenStreetMap) data and save it in the data directory.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%cd ..

d:\hex2vec2


## Download from OSM
Using the `osmnx` library, we can download all OSM objects directly into `DATA_RAW_DIR`.

In [3]:
from src.data.download import download_whole_city
from src.settings import DATA_RAW_DIR
from tqdm import tqdm
import pandas as pd

In [8]:
# Resolution passed to the H3 indexing and grouping helpers in the cells below;
# it also appears in the name of the final feather file.
RESOLUTION = 9

In [6]:
# Bare city names; the country suffix is appended below so every entry has the
# form "<city>, Germany". The order of this tuple is the order of `cities`.
_city_names = (
    "Cologne",
    "Dortmund",
    "Duesseldorf",
    "Essen",
    "Duisburg",
    "Bochum",
    "Wuppertal",
    "Bielefeld",
    "Bonn",
    "Muenster",
)

cities = [f"{name}, Germany" for name in _city_names]

In [7]:
# Years to process, as strings. Only 2023 is active; add "2015" ... "2022"
# to this list to include the earlier years as well.
years = ["2023"]

In [None]:
# Download every city/year combination, handing DATA_RAW_DIR to the downloader.
# The outer bar is labelled, and the inner bar carries the current city name and
# is cleared once that city is done, so the cell output does not accumulate one
# finished bar per city.
for city in tqdm(cities, desc="cities"):
    for year in tqdm(years, desc=city, leave=False):
        download_whole_city(city, DATA_RAW_DIR, year)

## Select Tags to use
We now specify which tags and columns we want to use for our training. These are saved in `TAG_FILTER`.

In [13]:
from src.data.load_data import load_filter

In [14]:
# Top-level OSM tags (keys) that are kept in the tag filter.
selected_tags = [
    "aeroway",
    "amenity",
    "building",
    "healthcare",
    "highway",
    "historic",
    "landuse",
    "leisure",
    "military",
    "natural",
    "office",
    "public_transport",
    "shop",
    "sport",
    "tourism",
    "water",
    "waterway",
]

# Individual "<tag>_<value>" columns that are removed from the filter again.
remove_columns = [
    "amenity_waste_basket",
    "landuse_grass",
    "historic_tomb",
    "natural_tree",
    "natural_tree_row",
    "natural_valley",
]

In [15]:
TAG_FILTER = load_filter("from_wiki.json")

# Keep only the top-level tags listed in selected_tags.
TAG_FILTER = {
    key: values for key, values in TAG_FILTER.items() if key in selected_tags
}

# Remove the problematic columns. Each entry has the form "<tag>_<value>", and
# both parts may contain underscores (e.g. the tag "public_transport"), so the
# tag is found by matching against the known keys instead of splitting on the
# first underscore. The longest matching key wins.
for column in remove_columns:
    super_tag = max(
        (key for key in TAG_FILTER if column.startswith(f"{key}_")),
        key=len,
        default=None,
    )
    if super_tag is None:
        # The tag is not part of the filter (e.g. it was deselected above),
        # so there is nothing to remove for this column.
        continue
    sub_tag = column[len(super_tag) + 1:]
    TAG_FILTER[super_tag] = [
        value for value in TAG_FILTER[super_tag] if value != sub_tag
    ]

## Map OSM objects to their hexagons
Now we add the hexagon indices to the amenities and save the files in `DATA_INTERIM_DIR`.

In [12]:
from src.data.make_dataset import add_h3_indices_to_city
from src.settings import DATA_INTERIM_DIR

In [None]:
# Run add_h3_indices_to_city for every city/year combination, restricted to the
# values in TAG_FILTER.
# The inner bar carries the current city name and is cleared once that city is
# done, so the cell output does not accumulate one finished bar per city.
for city in tqdm(cities, desc="cities"):
    for year in tqdm(years, desc=city, leave=False):
        add_h3_indices_to_city(city, year, RESOLUTION, filter_values=TAG_FILTER)

In [14]:
# this is what the data looks like at this stage (for highways in Cologne)
# Year and resolution come from `years` / `RESOLUTION` instead of being hard-coded,
# so the preview keeps pointing at an existing file when the configuration changes.
pd.read_feather(
    DATA_INTERIM_DIR.joinpath("Cologne, Germany", years[-1], f"highway_{RESOLUTION}.feather")
).head()

Unnamed: 0,osmid,highway,h3
0,2703430,motorway_junction,891fa113347ffff
1,4370663,motorway,891fa113347ffff
2,4394292,motorway_link,891fa113347ffff
3,23126813,path,891fa113347ffff
4,46629297,motorway_link,891fa113347ffff


## Group selected tags in cities
We now group all tags into one file per city and save it in `DATA_PROCESSED_DIR`.

In [23]:
from src.data.make_dataset import group_city_tags
from src.settings import DATA_PROCESSED_DIR

In [None]:
# Run group_city_tags for every city/year combination, using all tags that are
# left in TAG_FILTER.
# The inner bar carries the current city name and is cleared once that city is
# done, so the cell output does not accumulate one finished bar per city.
for city in tqdm(cities, desc="cities"):
    for year in tqdm(years, desc=city, leave=False):
        group_city_tags(
            city,
            year,
            RESOLUTION,
            tags=list(TAG_FILTER.keys()),
            filter_values=TAG_FILTER,
            fill_missing=True,
        )

In [18]:
# this is what the data looks like at this stage (for Bonn)
# Year and resolution come from `years` / `RESOLUTION` instead of being hard-coded,
# so the preview keeps pointing at an existing file when the configuration changes.
pd.read_feather(
    DATA_PROCESSED_DIR.joinpath("Bonn, Germany", years[-1], f"{RESOLUTION}.feather")
).head()

Unnamed: 0,h3,aeroway_aerodrome,aeroway_apron,aeroway_gate,aeroway_hangar,aeroway_helipad,aeroway_heliport,aeroway_navigationaid,aeroway_runway,aeroway_spaceport,...,waterway_pressurised,waterway_river,waterway_riverbank,waterway_soakhole,waterway_stream,waterway_tidal_channel,waterway_turning_point,waterway_water_point,waterway_waterfall,waterway_weir
0,891fa10320bffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,891fa10321bffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,891fa103243ffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,891fa103247ffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,891fa103253ffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Group all cities and years
We group all cities and years and merge certain tag-value pairs. To be precise, we merge almost all tag-value pairs that share the same value. For example, the columns `amenity_ice_cream` and `shop_ice_cream` become just `ice_cream`, because both tag-value pairs stand for ice cream shops. Finally, we save everything as `{RESOLUTION}.feather` (here `9.feather`) in `DATA_PROCESSED_DIR`.

In [4]:
from src.data.make_dataset import group_cities, merge_tags

In [18]:
# Collect all configured cities and years through group_cities, with the
# add_city_column option switched on.
df_grouped = group_cities(
    cities=cities,
    years=years,
    resolution=RESOLUTION,
    add_city_column=True,
)

In [11]:
# Tags we don't want to be merged (e.g. landuse_residential != building_residential).
tags_not_to_merge = {"landuse"}

# Values we don't want to merge (e.g. building_yes != tourism_yes).
values_not_to_merge = {
    "yes",
    "office",
    "water",
    "parking",
    "fishing",
    "charity",
    "religion",
    "golf",
    "farm",
    "ruins",
    "swimming_pool",
}

In [19]:
# Merge tag-value columns, skipping the tags and values listed as not to merge.
# NOTE: the result is stored back into df_grouped, so re-running this cell
# passes the already-merged frame to merge_tags again; re-run the group_cities
# cell first to start from the unmerged frame.
df_grouped = merge_tags(df_grouped, TAG_FILTER, tags_not_to_merge, values_not_to_merge)

In [21]:
# Some more optional changes to the data: fold selected building columns into a
# related column (source column -> target column). For example,
# building_semidetached_house is added to building_house because, per the
# original author, building_semidetached_house is only used in Cologne.
building_column_merges = {
    "building_terrace": "building_yes",
    "building_hut": "building_shed",
    "building_semidetached_house": "building_house",
}

for source_col, target_col in building_column_merges.items():
    if source_col not in df_grouped.columns:
        # Already folded in by an earlier run of this cell; skipping keeps the
        # cell re-runnable instead of failing with a KeyError.
        continue
    df_grouped[target_col] = df_grouped[target_col] + df_grouped[source_col]
    df_grouped.drop(columns=source_col, inplace=True)


In [24]:
# Write the combined frame to "<RESOLUTION>.feather" in DATA_PROCESSED_DIR.
# reset_index() moves the index into a regular column before writing.
feather_path = DATA_PROCESSED_DIR.joinpath(f"{RESOLUTION}.feather")
df_grouped.reset_index().to_feather(feather_path)

# Preview the first rows of the frame that was just saved.
df_grouped.head()

Unnamed: 0_level_0,aeroway_aerodrome,aeroway_apron,aeroway_gate,aeroway_helipad,aeroway_heliport,aeroway_navigationaid,aeroway_runway,aeroway_spaceport,aeroway_taxiway,aeroway_terminal,...,water_point,government,toilets,retail,monastery,college,ice_cream,river,scuba_diving,military
h3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
891fa181003ffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
891fa181007ffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
891fa18100bffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
891fa18100fffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
891fa181013ffff,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
