In [1]:
import os                               # for creating output directories

import folium                           # for map plotting
import numpy as np                      # for numerical computations
from geopy.geocoders import Nominatim   # for geocoding

# Cities covered by our Airbnb data (used as geocoding queries below)
city_names = [
    "Amsterdam",
    "Athens",
    "Barcelona",
    "Berlin",
    "Budapest",
    "Lisbon",
    "London",
    "Paris",
    "Rome",
    "Vienna",
]

In [2]:
# This function generates a square grid of coordinates around a city in order to cover its area
def generar_coordenades(ciutat, num_punts=400, radi=0.08):
    """
    Generate a square grid of coordinates centered on a city.

    The city is geocoded by name with Nominatim and the grid spans
    ``[-radi, +radi]`` degrees around the returned point on both axes.

    input:
    ciutat: string with the name of the city to geocode
    num_punts: approximate number of points to generate; the grid has
        int(sqrt(num_punts)) points per axis, so the exact count is that
        value squared (400 -> 20 x 20)
    radi: half-width of the grid in degrees, applied to both latitude and
        longitude (default 0.08, the value previously hard-coded)

    output:
    coordenades: list of (lat, lon) tuples, ordered by latitude offset first
        and longitude offset second

    raises:
    ValueError: if the geocoder finds no result for ``ciutat``
    """
    geolocator = Nominatim(user_agent="gia-bda")
    location = geolocator.geocode(ciutat)
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on `None.latitude`.
    if location is None:
        raise ValueError(f"Could not geocode city {ciutat!r}")
    lat, lon = location.latitude, location.longitude

    # The same offsets are used on both axes, so compute them only once.
    punts_per_eix = int(np.sqrt(num_punts))
    desplacaments = np.linspace(-radi, radi, punts_per_eix)

    return [
        (lat + dlat, lon + dlon)
        for dlat in desplacaments
        for dlon in desplacaments
    ]

In [3]:
# Build the grid of sample points for every city, keyed by city name.
# Each iteration calls generar_coordenades, which geocodes the city by name.
city_coords = {}
for city in city_names:
    # Progress message, printed before the grid of this city is generated
    print(f"Generant les coordenades per a {city}.")
    city_coords[city] = generar_coordenades(city)

Generant les coordenades per a Amsterdam.
Generant les coordenades per a Athens.
Generant les coordenades per a Barcelona.
Generant les coordenades per a Berlin.
Generant les coordenades per a Budapest.
Generant les coordenades per a Lisbon.
Generant les coordenades per a London.
Generant les coordenades per a Paris.
Generant les coordenades per a Rome.
Generant les coordenades per a Vienna.


Now we can plot the generated grid of points on a map of each city to check that it actually covers the whole city.

In [4]:
# Visualization of the generated coordinates (set visualitzar to True to export one HTML map per city)
visualitzar = False
if visualitzar:
    # The maps are written into this folder, so make sure it exists first
    os.makedirs("maps", exist_ok=True)
    for city, ll_coords in city_coords.items():
        # Center the map on the middle of the grid: the first point of the
        # list is a corner of the grid, not the city itself
        centre = np.mean(ll_coords, axis=0).tolist()
        city_map = folium.Map(location=centre, zoom_start=13)
        # Add one marker per grid point
        for coord in ll_coords:
            folium.Marker(coord).add_to(city_map)
        # Save the map
        city_map.save(f"maps/{city}_map.html")

### Weather

In [5]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

from tqdm import tqdm
import time

In [6]:
# Setup the Open-Meteo API client with cache and retry on error
# Open-Meteo API client: responses are cached for 3600 seconds and
# failed requests are retried up to 5 times with a backoff factor of 0.2
cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

We define a function that fetches the hourly weather forecast for every grid point of a city.

In [7]:
def check_weather(city, ll_coords):
    """
    Fetch the hourly Open-Meteo forecast for every coordinate of a city.

    One request is sent per (lat, lon) pair through the module-level
    ``openmeteo`` client, asking for a 16-day hourly forecast.

    input:
    city: name of the city; used for the progress bar label and stored in
        the ``city`` column of the result
    ll_coords: iterable of (lat, lon) tuples to query

    output:
    dataframe: one row per coordinate and forecast hour, with the columns
        ``date``, one column per requested hourly variable, ``latitude``,
        ``longitude`` and ``city``. The per-point row index is not reset.
        An empty DataFrame is returned when ``ll_coords`` is empty.
    """
    url = "https://api.open-meteo.com/v1/forecast"
    # Single source of truth for the requested variables: the API returns
    # them in the order requested, so the position in this tuple is also the
    # index passed to hourly.Variables() below.
    hourly_variables = (
        "temperature_2m",
        "relative_humidity_2m",
        "precipitation_probability",
        "precipitation",
        "weather_code",
        "cloud_cover",
        "cloud_cover_low",
        "cloud_cover_mid",
        "cloud_cover_high",
    )

    # Collect the per-point frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows on
    # every iteration.
    frames = []

    for lat, lon in tqdm(ll_coords, desc=f"Processing {city}"):
        params = {
            "latitude": lat,
            "longitude": lon,
            "hourly": list(hourly_variables),
            "forecast_days": 16
        }
        responses = openmeteo.weather_api(url, params=params)

        # A single location is requested, so only the first response is used
        response = responses[0]
        hourly = response.Hourly()

        # Rebuild the timestamps from the start, end and step of the series
        hourly_data = {"date": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left"
        )}
        for position, variable in enumerate(hourly_variables):
            hourly_data[variable] = hourly.Variables(position).ValuesAsNumpy()

        hourly_dataframe = pd.DataFrame(data=hourly_data)
        hourly_dataframe["latitude"] = lat
        hourly_dataframe["longitude"] = lon
        hourly_dataframe["city"] = city
        frames.append(hourly_dataframe)

    # pd.concat raises on an empty list; keep returning an empty frame then
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

In [8]:
# For each city and its coordinates, download the weather, pausing about 1 minute
# between cities to avoid reaching the API rate limit
city_frames = []
for position, (city, ll_coords) in enumerate(city_coords.items()):
    city_weather = check_weather(city, ll_coords)
    city_frames.append(city_weather)
    # No need to wait after the last city: nothing else is requested afterwards
    if position < len(city_coords) - 1:
        time.sleep(61)

# Concatenate once instead of growing the DataFrame inside the loop
df_weather = pd.concat(city_frames) if city_frames else pd.DataFrame()

Processing Amsterdam: 100%|██████████| 400/400 [00:01<00:00, 344.89it/s]
Processing Athens: 100%|██████████| 400/400 [00:00<00:00, 405.29it/s]
Processing Barcelona: 100%|██████████| 400/400 [00:01<00:00, 396.75it/s]
Processing Berlin: 100%|██████████| 400/400 [00:22<00:00, 17.89it/s]
Processing Budapest: 100%|██████████| 400/400 [00:30<00:00, 13.23it/s]
Processing Lisbon: 100%|██████████| 400/400 [00:30<00:00, 13.13it/s]
Processing London: 100%|██████████| 400/400 [00:29<00:00, 13.76it/s]
Processing Paris: 100%|██████████| 400/400 [00:26<00:00, 14.94it/s]
Processing Rome: 100%|██████████| 400/400 [00:28<00:00, 14.27it/s]
Processing Vienna: 100%|██████████| 400/400 [00:27<00:00, 14.35it/s]


In [10]:
# Persist the collected forecasts; create the output folder first so the
# write does not fail on a fresh checkout
os.makedirs("datalake", exist_ok=True)
df_weather.to_parquet("datalake/weather_data.parquet", index=False)