# Data Collection
- Get the list of the 20 most visited cities.
- Create a list of records (one dictionary per city) holding the coordinates of those cities.

In [1]:
import tqdm
import urllib
import pandas as pd



### Get a list of the 20 most visited cities:

In [2]:
import requests
from scrapy import Selector
import json
import custom_functions as cf

In [3]:
cities_url = "https://travelness.com/most-visited-cities-in-the-world" # URL of the page with the list of cities

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook on an HTTP error instead of quietly scraping an error page
# (which would leave `cities` empty or filled with unrelated text).
response = requests.get(cities_url, timeout=30)
response.raise_for_status()
sel = Selector(response)

# The XPath takes the text of the second cell of every table row on the page.
# Surrounding whitespace is stripped and blank cells are dropped so that only
# usable names are passed on to the geocoder.
raw_city_cells = sel.xpath("//table//tr/td[2]/text()").getall()
cities = [cell.strip() for cell in raw_city_cells if cell.strip()]

### Use the OpenStreetMap Nominatim geocoder (via geopy) to convert the city names into coordinates

In [4]:
from geopy.geocoders import Nominatim

In [5]:
def geocode_city(city):
    """Look up the coordinates of a single city with the Nominatim geocoder.

    Args:
        city: Name of the city to geocode.

    Returns:
        A dict with the keys "city", "latitude" and "longitude", or None
        when the geocoder finds no match for the given name.
    """
    geolocator = Nominatim(user_agent="my_geocoder")
    location = geolocator.geocode(city)
    # geocode() gives back None when nothing matches; report that to the
    # caller as None rather than failing on `location.latitude`.
    if location is None:
        return None
    return {"city": city, "latitude": location.latitude, "longitude": location.longitude}

def geocode_cities(city_list):
    """Geocode every city name in `city_list`.

    Args:
        city_list: Iterable of city names.

    Returns:
        A list with the result of `geocode_city` for each city, in input
        order, leaving out cities whose result is falsy.
    """
    results = []
    for city in city_list:
        # Call the geocoder exactly once per city: every call is a network
        # request, so the result is kept and reused for the filter check.
        result = geocode_city(city)
        if result:
            results.append(result)
    return results

# Resolve each scraped city name to a record with its coordinates
geocoded_cities = geocode_cities(city_list=cities)

In [6]:
# Persist the geocoded city records as JSON so they can be reused later
with open("../data/city_coordinates.json", "w") as coords_file:
    coords_file.write(json.dumps(geocoded_cities))

### Use the open-meteo API to get weather data

In [7]:
# import relevant modules
import openmeteo_requests
import requests_cache
from retry_requests import retry

In [8]:
# Daily weather variables requested from the API
daily_variables_of_interest = [
    "temperature_2m_max", "temperature_2m_min", "temperature_2m_mean",
    "daylight_duration", "sunshine_duration",
    "precipitation_sum", "rain_sum", "precipitation_hours",
]

# Coordinates of all geocoded cities, kept in the same order as geocoded_cities
latitudes = [entry["latitude"] for entry in geocoded_cities]
longitudes = [entry["longitude"] for entry in geocoded_cities]

# Query parameters sent with the weather request
params = dict(
    latitude=latitudes,
    longitude=longitudes,
    start_date="1940-01-01",
    end_date="2023-12-31",
    daily=daily_variables_of_interest,
)

In [9]:
# Endpoint that the weather request below is sent to
url = "https://archive-api.open-meteo.com/v1/archive"

# Build the Open-Meteo client on a cached HTTP session wrapped with retries
# (up to 5 retries, backoff factor 0.2)
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# The variables to fetch are the ones listed in params["daily"]; their order
# matters when the values are assigned during later processing.
responses = openmeteo.weather_api(url, params=params)

In [10]:
# Turn every API response into a dataframe; each response is processed together
# with its position in `responses` and the list of geocoded cities
dataframes_list = []
for city_index, city_response in enumerate(responses):
    dataframes_list.append(cf.process_response(city_response, geocoded_cities, city_index))

In [11]:
# Stack the frames in dataframes_list into one table with a fresh index, then save it as CSV
merged_df = pd.concat(objs=dataframes_list, ignore_index=True)
merged_df.to_csv(path_or_buf="../data/weather_data.csv", index=False)

### Scrape the NGRAM data into dataframes

In [3]:
# Phrasings (including capitalisation variants) that are queried one by one
queries = [
    "London rain",
    "London Rain",
    "rainy London",
    "rain in London",
    "Rain in London",
    "raining in London",
    "Raining in London",
]

# Fetch one dataframe per phrasing and combine them with a single concat.
# This avoids growing a frame inside the loop (which re-copies all rows on
# every iteration) and avoids concatenating onto an empty seed frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
NGRAM_df = pd.concat([cf.runQuery(query) for query in queries], ignore_index=True)
NGRAM_df

Unnamed: 0,query,Year,Appearence %
0,London rain,1940,8.224380e-10
1,London rain,1941,6.946620e-10
2,London rain,1942,5.788850e-10
3,London rain,1943,5.777963e-10
4,London rain,1944,5.943155e-10
...,...,...,...
555,Raining in London,2015,5.425769e-11
556,Raining in London,2016,5.221881e-11
557,Raining in London,2017,3.226468e-11
558,Raining in London,2018,3.871761e-11


### Sum the appearances into one dataframe

In [4]:
# Add up the appearance percentages of all phrasings for each year.
# The numeric column is selected before summing so that the text `query`
# column is never part of the aggregation; reset_index() turns the result
# back into a frame with the columns 'Year' and 'Appearence %'.
NGRAM_df_grouped = NGRAM_df.groupby('Year')['Appearence %'].sum().reset_index()
NGRAM_df_grouped

Unnamed: 0,Year,Appearence %
0,1940,2.731485e-09
1,1941,2.756745e-09
2,1942,2.743879e-09
3,1943,2.774656e-09
4,1944,2.788329e-09
...,...,...
75,2015,1.111827e-08
76,2016,1.074132e-08
77,2017,1.051684e-08
78,2018,1.025569e-08


In [5]:
# Save the yearly totals to a CSV file, leaving out the index column
NGRAM_df_grouped.to_csv(path_or_buf='../data/NGRAM_df_grouped.csv', index=False)