# Data Collection
- Get a list of the 20 most visited cities.
- Create a list of records holding the coordinates of each of those cities.

In [1]:
import tqdm

### Get a list of the 20 most visited cities:

In [2]:
import requests
from scrapy import Selector
import json
import custom_functions as cf



In [3]:
cities_url = "https://travelness.com/most-visited-cities-in-the-world" # URL of the page with the list of cities

# Use a timeout so a stalled connection cannot hang the notebook, and raise on
# HTTP errors so an error page is never silently parsed into a city list.
response = requests.get(cities_url, timeout=30)
response.raise_for_status()

# Selector documents `text=` for raw markup; `response=` is meant for Scrapy
# response objects rather than a requests.Response.
sel = Selector(text=response.text)

# Collect the text of the second cell of every table row on the page.
# NOTE(review): presumably that column holds the city names - confirm if the
# page layout changes.
cities = sel.xpath("//table//tr/td[2]/text()").getall()

### Use the OpenStreetMap Nominatim API to convert the city names into coordinates

In [4]:
from geopy.geocoders import Nominatim

In [6]:
def geocode_city(city, geolocator=None):
    """Look up the coordinates of a single city.

    Parameters
    ----------
    city : str
        Place name passed to the geocoder as the query.
    geolocator : object, optional
        Any object exposing ``geocode(query)``. When omitted, a new
        ``Nominatim`` client with the ``"my_geocoder"`` user agent is created,
        which is what every call did before this parameter existed.

    Returns
    -------
    dict or None
        ``{"city": ..., "latitude": ..., "longitude": ...}`` for the match,
        or ``None`` when the geocoder finds nothing for ``city``.
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent="my_geocoder")

    location = geolocator.geocode(city)
    if location is None:
        # geocode() yields None for unknown places; return None so callers can
        # filter the miss instead of crashing on `None.latitude`.
        return None

    return {"city": city, "latitude": location.latitude, "longitude": location.longitude}

def geocode_cities(city_list):
    """Geocode every city in ``city_list``, skipping cities with no result.

    Parameters
    ----------
    city_list : iterable of str
        City names to look up, in order.

    Returns
    -------
    list
        The truthy results of ``geocode_city`` in input order; falsy results
        (such as ``None``) are left out.
    """
    # Call geocode_city exactly once per city: each call is a network request,
    # so filtering and collecting must share the same result.
    lookups = map(geocode_city, city_list)
    return [result for result in lookups if result]

# Geocode the scraped list of city names. geocode_cities looks each name up
# through geocode_city (which queries the Nominatim geocoder), so this cell
# needs network access and may take a while to run.
geocoded_cities = geocode_cities(cities)

In [7]:
# Persist the geocoded city records as JSON in the data folder
with open("../data/city_coordinates.json", "w") as coordinates_file:
    coordinates_file.write(json.dumps(geocoded_cities))

### Use the Open-Meteo API to get weather data

In [8]:
# import relevant modules
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

In [9]:
# Daily weather metrics to request, in the order they are sent to the API
daily_variables_of_interest = [
    "temperature_2m_max",
    "temperature_2m_min",
    "temperature_2m_mean",
    "daylight_duration",
    "sunshine_duration",
    "precipitation_sum",
    "rain_sum",
    "precipitation_hours",
]

# Query parameters for the request: one latitude/longitude entry per geocoded
# city, a fixed date range, and the daily metrics defined above
params = dict(
    latitude=[place["latitude"] for place in geocoded_cities],
    longitude=[place["longitude"] for place in geocoded_cities],
    start_date="1940-01-01",
    end_date="2023-12-31",
    daily=daily_variables_of_interest,
)

In [10]:
# Endpoint that the weather request is sent to
url = "https://archive-api.open-meteo.com/v1/archive"

# Build the Open-Meteo client on top of a cached HTTP session that is wrapped
# in a retrying session (5 retries, backoff factor 0.2)
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Every weather variable needed later must be present in `params`; the order of
# the daily variables matters when the responses are unpacked downstream
responses = openmeteo.weather_api(url, params=params)

In [13]:
# Convert each API response into a dataframe, handing process_response the
# response, the geocoded city records, and the response's position in the list
dataframes_list = [
    cf.process_response(api_response, geocoded_cities, position)
    for position, api_response in enumerate(responses)
]

In [16]:
# Stack the dataframes in dataframes_list into a single table with a fresh
# index, then save it as CSV without writing the index column
merged_df = pd.concat(objs=dataframes_list, ignore_index=True)
merged_df.to_csv(path_or_buf="../data/weather_data.csv", index=False)