# Data Collection
- Get the list of the 20 most visited cities.
- Build a list of dictionaries holding the coordinates of each of those cities.

In [84]:
import tqdm

### Get a list of the 20 most visited cities:

In [85]:
import requests
from scrapy import Selector
import json

In [86]:
cities_url = "https://travelness.com/most-visited-cities-in-the-world" # URL of the page with the list of cities

# A timeout keeps the cell from hanging indefinitely, and raise_for_status() stops
# on a 4xx/5xx reply instead of silently scraping an error page into an empty list.
response = requests.get(cities_url, timeout=30)
response.raise_for_status()

# Build the selector from the HTML text, which is the documented way to use
# scrapy's Selector outside of a scrapy Response object.
sel = Selector(text=response.text)

# Text nodes of the second cell in every table row (presumably the city-name
# column - verify against the page). Whitespace is stripped and blank cells are
# dropped so that only usable names reach the geocoder.
raw_cells = sel.xpath("//table//tr/td[2]/text()").getall()
cities = [cell.strip() for cell in raw_cells if cell.strip()]

### Use the Nominatim (OpenStreetMap) API via geopy to convert the city names into coordinates

In [100]:
from geopy.geocoders import Nominatim

In [101]:
def geocode_city(city, geolocator=None):
    """
    Geocode a single city using OpenStreetMap API.

    Parameters
    ----------
    city : str
        Name of the city to look up.
    geolocator : object, optional
        Any object exposing ``geocode(query)``. When omitted, a geopy
        ``Nominatim`` client with the user agent "my_geocoder" is created,
        exactly as before. Passing one in lets callers reuse a single client
        across many lookups.

    Returns
    -------
    dict or None
        ``{"city": ..., "latitude": ..., "longitude": ...}`` when the lookup
        succeeds, or ``None`` when the geocoder finds no match.
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent="my_geocoder")
    location = geolocator.geocode(city)
    # geocode() yields None for an unknown place; without this guard the
    # attribute access below would raise AttributeError.
    if location is None:
        return None
    return {"city": city, "latitude": location.latitude, "longitude": location.longitude}

def geocode_cities(city_list, geocode=None):
    """
    Geocode a list of cities using OpenStreetMap API.

    Parameters
    ----------
    city_list : iterable of str
        City names to look up.
    geocode : callable, optional
        Function taking a city name and returning a result dict (or a falsy
        value when the lookup fails). Defaults to ``geocode_city``.

    Returns
    -------
    list of dict
        One entry per city whose lookup returned a truthy result, in input
        order. Cities with a falsy result are skipped.
    """
    if geocode is None:
        geocode = geocode_city

    geocoded_cities = []
    for city in city_list:
        # Look each city up exactly once: the result is both the filter
        # condition and the stored value, so no request is repeated.
        result = geocode(city)
        if result:
            geocoded_cities.append(result)
    return geocoded_cities

# Resolve every scraped city name to its coordinates
geocoded_cities = geocode_cities(city_list=cities)

In [102]:
# Persist the geocoding results as JSON so later steps can reuse them
with open("../data/city_coordinates.json", "w") as coordinates_file:
    coordinates_file.write(json.dumps(geocoded_cities))

### Use the open-meteo API to get weather data

In [None]:
# import relevant modules
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

In [99]:
# Daily weather variables to request, in the order they will be requested
daily_variables_of_interest = [
    "temperature_2m_max", "temperature_2m_min", "temperature_2m_mean",
    "daylight_duration", "sunshine_duration",
    "precipitation_sum", "rain_sum", "precipitation_hours",
]

# Request parameters: one latitude/longitude pair per geocoded city
params = dict(
    latitude=[place["latitude"] for place in geocoded_cities],
    longitude=[place["longitude"] for place in geocoded_cities],
    start_date="1940-01-01",
    end_date="2023-12-31",
    daily=daily_variables_of_interest,
)

In [None]:
# Endpoint of the Open-Meteo archive API
url = "https://archive-api.open-meteo.com/v1/archive"

# Open-Meteo client on top of a cached HTTP session that retries on error
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# All required weather variables must be listed in `params`; their order in
# "hourly" / "daily" matters when the values are assigned afterwards.
responses = openmeteo.weather_api(url, params=params)

In [None]:
# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# The request above only asks for "daily" variables, so the daily block is the
# one to read; there is no hourly data in the response to process.
daily = response.Daily()

daily_data = {"date": pd.date_range(
    start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
    end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = daily.Interval()),
    inclusive = "left"
)}

# Variables come back in the same order as requested, so the position in
# daily_variables_of_interest is the index to pass to Variables().
for variable_index, variable_name in enumerate(daily_variables_of_interest):
    daily_data[variable_name] = daily.Variables(variable_index).ValuesAsNumpy()

daily_dataframe = pd.DataFrame(data = daily_data)
# Show only a preview rather than dumping the whole frame
display(daily_dataframe.head())

In [None]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry
# Open-Meteo client on top of a cached HTTP session that retries on error
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Request definition. The order of the "daily" variables fixes the index that
# Variables(i) must use further down.
url = "https://archive-api.open-meteo.com/v1/archive"
params = dict(
    latitude=[39.55, 19.0728, 22.5455, 36.9081, 41.0138, 3.1412, 25.0772, 1.2897, 51.5085, 22.2783, 48.8534, 28.6519, 35.6895, 25.0478, 12.9333, 41.8919, 40.7143, 13.754],
    longitude=[2.7333, 72.8826, 114.0683, 30.6956, 28.9497, 101.6865, 55.3093, 103.8501, -0.1257, 114.1747, 2.3488, 77.2315, 139.6917, 121.5319, 100.8833, 12.5113, -74.006, 100.5014],
    start_date="1940-01-01",
    end_date="2024-03-01",
    daily=["precipitation_sum", "rain_sum", "precipitation_hours"],
)
responses = openmeteo.weather_api(url, params=params)

# Only the first location is inspected; loop over `responses` for the others
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Pull the three daily series out in the same order they were requested
daily = response.Daily()
daily_precipitation_sum, daily_rain_sum, daily_precipitation_hours = (
    daily.Variables(position).ValuesAsNumpy() for position in range(3)
)

# One timestamp per interval between Time() and TimeEnd(), end excluded
daily_data = {
    "date": pd.date_range(
        start=pd.to_datetime(daily.Time(), unit="s", utc=True),
        end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left",
    ),
    "precipitation_sum": daily_precipitation_sum,
    "rain_sum": daily_rain_sum,
    "precipitation_hours": daily_precipitation_hours,
}

daily_dataframe = pd.DataFrame(data=daily_data)
display(daily_dataframe)


Coordinates 39.54305648803711°N 2.742382287979126°E
Elevation 2.0 m asl
Timezone None None
Timezone difference to GMT+0 0 s


Unnamed: 0,date,precipitation_sum,rain_sum,precipitation_hours
0,1940-01-01 00:00:00+00:00,,,0.0
1,1940-01-02 00:00:00+00:00,1.300000,1.300000,6.0
2,1940-01-03 00:00:00+00:00,6.400000,6.400000,13.0
3,1940-01-04 00:00:00+00:00,2.000000,2.000000,8.0
4,1940-01-05 00:00:00+00:00,0.000000,0.000000,0.0
...,...,...,...,...
30737,2024-02-26 00:00:00+00:00,8.400000,8.400000,8.0
30738,2024-02-27 00:00:00+00:00,0.400000,0.400000,4.0
30739,2024-02-28 00:00:00+00:00,6.099999,6.099999,14.0
30740,2024-02-29 00:00:00+00:00,0.000000,0.000000,0.0


Unnamed: 0,precipitation_sum,rain_sum,precipitation_hours
count,30741.0,30741.0,30742.0
mean,1.088254,1.085811,2.256359
std,3.344872,3.334524,4.256175
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.5,0.5,3.0
max,90.099991,90.099991,24.0


In [None]:
# Repeat the same request for a single location (London) only


import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

# Open-Meteo client on top of a cached HTTP session that retries on error
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Request definition for one coordinate pair. The order of the "daily"
# variables fixes the index that Variables(i) must use further down.
url = "https://archive-api.open-meteo.com/v1/archive"
params = dict(
    latitude=51.5085,
    longitude=-0.1257,
    start_date="1940-01-01",
    end_date="2024-03-01",
    daily=["precipitation_sum", "precipitation_hours"],
)
responses = openmeteo.weather_api(url, params=params)

# A single location was requested, so the first response is the one to read
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Pull the two daily series out in the same order they were requested
daily = response.Daily()
daily_precipitation_sum, daily_precipitation_hours = (
    daily.Variables(position).ValuesAsNumpy() for position in range(2)
)

# One timestamp per interval between Time() and TimeEnd(), end excluded
daily_data = {
    "date": pd.date_range(
        start=pd.to_datetime(daily.Time(), unit="s", utc=True),
        end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left",
    ),
    "precipitation_sum": daily_precipitation_sum,
    "precipitation_hours": daily_precipitation_hours,
}

daily_dataframe = pd.DataFrame(data=daily_data)
display(daily_dataframe)


OpenMeteoRequestsError: {'reason': 'Hourly API request limit exceeded. Please try again in the next hour.', 'error': True}