# Data Collection
- Get list of 20 most visited cities.
- Create a dictionary with coordinates of 20 most visited cities.

In [1]:
import tqdm
import urllib
import pandas as pd
import requests
from scrapy import Selector
import json
import custom_functions as cf
from geopy.geocoders import Nominatim
import openmeteo_requests
import requests_cache
from retry_requests import retry
from sqlalchemy import create_engine

### Get a list of the 20 most visited cities:

In [2]:
cities_url = "https://travelness.com/most-visited-cities-in-the-world" # URL of the page with the list of cities

# Fetch the page. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() turns an HTTP error page into an exception
# instead of letting it be parsed into an empty city list.
response = requests.get(cities_url, timeout=30)
response.raise_for_status()

# Build the selector from the HTML text, which is the documented way to use
# Scrapy selectors with a response that did not come from Scrapy itself.
sel = Selector(text=response.text)

# The city name is read from the second cell of every table row.
cities = sel.xpath("//table//tr/td[2]/text()").getall()

# Fail here, with a clear message, if the page layout changed and nothing matched.
assert cities, "No city names were scraped - the page structure may have changed"

### Get the City Stereotypes about weather for these 20 cities

In [3]:
# Pass the scraped city names to the project helper; the result is the input
# of the weather-word filter in the following cell.
# NOTE(review): extract_words lives in custom_functions and is not shown here;
# presumably it collects descriptive words per city - confirm in that module.
suggestion_list = cf.extract_words(cities)

In [4]:
# Reduce the extracted words with the project helper; the result is what gets
# exported to JSON in the following cell.
# NOTE(review): presumably keeps only weather-related words - the filtering
# rule is defined in custom_functions.filter_weather_words; confirm there.
filtered_weather_words = cf.filter_weather_words(suggestion_list)

In [5]:
# Serialise the filtered weather words and write them to a JSON file
with open('./../data/filtered_weather_words.json','w') as file:
    file.write(json.dumps(filtered_weather_words))

### Use the OpenStreetMap Nominatim API (via geopy) to convert the city names into coordinates

In [3]:
def geocode_city(city, geolocator=None):
    """Look up the coordinates of a single city with Nominatim.

    Parameters
    ----------
    city : str
        Place name passed to the geocoder as a free-text query.
    geolocator : optional
        Object exposing ``geocode(query)``. A new ``Nominatim`` client is
        created when omitted, so existing one-argument calls keep working.

    Returns
    -------
    dict or None
        ``{"city": ..., "latitude": ..., "longitude": ...}`` for a match, or
        ``None`` when the geocoder returns no result for ``city``.
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent="my_geocoder")
    location = geolocator.geocode(city)
    if location is None:
        # No match: signal it with None instead of failing on `location.latitude`.
        return None
    return {"city": city, "latitude": location.latitude, "longitude": location.longitude}

def geocode_cities(city_list):
    """Geocode every city in ``city_list``, skipping names without a match.

    Each city is looked up exactly once and a single client is shared by all
    lookups, so the number of network requests equals ``len(city_list)``.

    NOTE(review): the public Nominatim service restricts request rates; for
    longer lists consider throttling, e.g. with geopy's RateLimiter.

    Parameters
    ----------
    city_list : iterable of str
        City names to geocode.

    Returns
    -------
    list of dict
        One ``geocode_city`` result per city that could be resolved, in input
        order.
    """
    geolocator = Nominatim(user_agent="my_geocoder")
    geocoded_cities = []
    for city in city_list:
        result = geocode_city(city, geolocator)
        if result is not None:
            geocoded_cities.append(result)
    return geocoded_cities

# Geocode the scraped city names. The result is a list of
# {"city", "latitude", "longitude"} dicts that the later cells export to JSON
# and use to build the weather API request.
geocoded_cities = geocode_cities(cities)

In [4]:
# Serialise the geocoded cities and write them to a JSON file
with open("../data/city_coordinates.json", "w") as f:
    f.write(json.dumps(geocoded_cities))

### Use the open-meteo API to get weather data

In [5]:
# Daily weather variables requested from the API, in request order
daily_variables_of_interest = [
    "temperature_2m_max", "temperature_2m_min", "temperature_2m_mean",
    "daylight_duration", "sunshine_duration",
    "precipitation_sum", "rain_sum", "precipitation_hours",
]

# Request parameters: one latitude/longitude pair per geocoded city (kept in
# the same order as geocoded_cities), the date range, and the daily variables
params = dict(
    latitude=[entry["latitude"] for entry in geocoded_cities],
    longitude=[entry["longitude"] for entry in geocoded_cities],
    start_date="1940-01-01",
    end_date="2023-12-31",
    daily=daily_variables_of_interest,
)

In [6]:
# Set up the Open-Meteo API client on top of a cached HTTP session that is
# wrapped with automatic retries (up to 5, with a 0.2 backoff factor).
# NOTE(review): expire_after = -1 presumably means cached responses never
# expire - confirm against the requests_cache documentation.
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Request the archive data for every location in `params` in a single call.
# The requested weather variables are the ones listed under "daily" in `params`.
# NOTE(review): their order presumably has to match what
# cf.process_response expects when it unpacks each response - confirm there.
url = "https://archive-api.open-meteo.com/v1/archive"
responses = openmeteo.weather_api(url, params=params)

In [7]:
# Build one dataframe per API response. Each response is handed to the project
# helper together with the full geocoded_cities list and its position i.
# NOTE(review): presumably i is used to look up the matching city for
# labelling - confirm in custom_functions.process_response.
dataframes_list = [cf.process_response(response, geocoded_cities, i) for i, response in enumerate(responses)]

In [8]:
# Stack the per-response dataframes into one frame with a fresh 0..n-1 index,
# then write it to CSV without the index column.
merged_df = pd.concat(dataframes_list, ignore_index=True)
merged_df.to_csv("../data/weather_data.csv", index=False)

### Scrape the NGRAM data into dataframes

In [11]:
# Phrasings (including capitalisation variants) of "rain in London" to look up
queries = ["London rain", "London Rain", "rainy London", "rain in London", "Rain in London", "raining in London", "Raining in London"]

# Run every query, then concatenate once. Growing a frame with pd.concat inside
# the loop re-copies all earlier rows on each iteration, and starting from an
# empty DataFrame triggers a FutureWarning on recent pandas versions.
# ignore_index=True gives the same fresh 0..n-1 index a reset_index(drop=True) would.
NGRAMS_df = pd.concat([cf.runQuery(query) for query in queries], ignore_index=True)
NGRAMS_df

Unnamed: 0,query,Year,Appearances
0,London rain,1940,8.224380e-10
1,London rain,1941,6.946620e-10
2,London rain,1942,5.788850e-10
3,London rain,1943,5.777963e-10
4,London rain,1944,5.943155e-10
...,...,...,...
555,Raining in London,2015,5.425769e-11
556,Raining in London,2016,5.221881e-11
557,Raining in London,2017,3.226468e-11
558,Raining in London,2018,3.871761e-11


### Sum the appearances into one dataframe

In [12]:
# Sum the appearances of all query variants per year.
# Only 'Appearances' is aggregated, so the text 'query' column is never summed
# (summing it would concatenate the query strings of each year just to throw
# the result away). as_index=False keeps 'Year' as a regular column, giving a
# frame with exactly the columns ['Year', 'Appearances'].
NGRAMS_df_grouped = NGRAMS_df.groupby('Year', as_index=False)['Appearances'].sum()
NGRAMS_df_grouped

Unnamed: 0,Year,Appearances
0,1940,2.731485e-09
1,1941,2.756745e-09
2,1942,2.743879e-09
3,1943,2.774656e-09
4,1944,2.788329e-09
...,...,...
75,2015,1.111827e-08
76,2016,1.074132e-08
77,2017,1.051684e-08
78,2018,1.025569e-08


In [13]:
# Index the series to its first row: divide every yearly total by the first
# row's value (so that row becomes 1.0) and expose the result as 'Perception'.
# The division is vectorised, and a new frame is built instead of mutating the
# one displayed by the previous cell.
# Note: once this cell has run, 'Appearances' no longer exists under that name,
# so re-run the grouping cell before running this one again.
first_row_appearances = NGRAMS_df_grouped['Appearances'].iloc[0]
NGRAMS_df_grouped = (
    NGRAMS_df_grouped
    .assign(Appearances=NGRAMS_df_grouped['Appearances'] / first_row_appearances)
    .rename(columns={'Appearances': 'Perception'})
)

NGRAMS_df_grouped

Unnamed: 0,Year,Perception
0,1940,1.000000
1,1941,1.009248
2,1942,1.004537
3,1943,1.015805
4,1944,1.020811
...,...,...
75,2015,4.070414
76,2016,3.932411
77,2017,3.850227
78,2018,3.754620


In [14]:
# Write NGRAMS_df_grouped (Year / Perception) to a CSV file without the index column
NGRAMS_df_grouped.to_csv('../data/perception_data.csv', index=False)

# Create a SQL database using the following terminal commands:
- ```rm data/rainy.db``` Deletes the database if it is there (this is only run if we have updated our data)
- ```sqlite3 data/rainy.db``` Enters the SQL shell.
- ```.mode csv``` Changes mode to CSV.
- ```.import data/weather_data.csv weather``` Import each CSV as a new table.
- ```.import data/perception_data.csv perception```
- ```SELECT * FROM perception;``` Check the data has loaded properly (the SQL shell only runs a statement once it is terminated with `;`).
- ```SELECT * FROM weather;```
- ```.quit``` Exits the SQL shell.

# Adjust the column datatypes to more efficient ones by running the following SQL code in the SQLTools VS Code extension:
```SQL
-- Rebuild the imported `weather` table with explicit column types:
-- create a typed copy, move the rows across, drop the original, then rename.
-- NOTE(review): SQLite maps declared types to type affinities and does not
-- enforce lengths or precision such as VARCHAR(18) or DECIMAL(7,4) - confirm
-- this gives the storage behaviour you expect.
CREATE TABLE new_weather (
    date DATE,
    city VARCHAR(18),
    temperature_2m_max DECIMAL(7,4),
    temperature_2m_min DECIMAL(7,4),
    temperature_2m_mean DECIMAL(7,4),
    daylight_duration DECIMAL(9,3),
    sunshine_duration DECIMAL(9,3),
    precipitation_sum DECIMAL(8,5),
    rain_sum DECIMAL(8,5),
    precipitation_hours TINYINT UNSIGNED
);

-- Copy every row from the imported table into the typed table.
INSERT INTO new_weather (date, city, temperature_2m_max, temperature_2m_min, temperature_2m_mean,
                         daylight_duration, sunshine_duration, precipitation_sum, rain_sum, precipitation_hours)
SELECT date, city, temperature_2m_max, temperature_2m_min, temperature_2m_mean,
       daylight_duration, sunshine_duration, precipitation_sum, rain_sum, precipitation_hours
FROM weather;

-- Remove the original import so the typed table can take over its name.
DROP TABLE weather;

ALTER TABLE new_weather RENAME TO weather;

-- Same create / copy / drop / rename sequence for the `perception` table.
CREATE TABLE new_perception (
    Year YEAR,
    Perception DECIMAL(18,16)
);

INSERT INTO new_perception (Year, Perception) SELECT Year, Perception FROM perception;

DROP TABLE perception;

ALTER TABLE new_perception RENAME TO perception;
```

In [2]:
# Create a SQLAlchemy engine for the SQLite database file at ../data/rainy.db
# (statement echoing off, AUTOCOMMIT isolation level).
engine = create_engine('sqlite:///../data/rainy.db', echo=False, isolation_level="AUTOCOMMIT")

# Open a connection and close it again straight away; nothing is executed
# inside the block.
# NOTE(review): presumably a connectivity smoke test - confirm, or remove it.
with engine.connect() as conn:
    pass

In [3]:
# Load every row of the `perception` table from the database into a dataframe
df = pd.read_sql('SELECT * FROM perception', engine)