In [14]:
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
import lxml.html
import requests
from lxml import etree

import matplotlib.pyplot as plt
import seaborn as sns

import time


import warnings
warnings.filterwarnings('ignore')



# Environmental Performance Index
The 2022 EPI provides a quantitative basis for comparing, analyzing, and understanding environmental performance for 180 countries. We score and rank these countries on their environmental performance using the most recent year of data available and calculate how these scores have changed over the previous decade.


chatgpt me ha vacilado así que lo vamos a sacar a mano

In [15]:
url = 'https://epi.yale.edu/epi-results/2022/component/epi'

In [16]:
driver = webdriver.Firefox()
driver.get(url)
time.sleep(5)

html = lxml.html.document_fromstring(driver.page_source)
driver.close()

In [26]:
tabla = html.xpath('//*[@id="DataTables_Table_0"]')

In [46]:
rows = tabla[0].xpath('./tbody//tr')
len(rows)

180

In [65]:
rows[0].xpath('./td[1]/a/text()')
rows[0].xpath('./td[2]/text()')[-1].split()
rows[0].xpath('./td[3]/text()')[-1].split()

['77.90']

In [66]:
rows = tabla[0].xpath('./tbody//tr')

df = pd.DataFrame(columns=['country', 'rank', 'epi_score'])

for row in rows:
    aux = pd.DataFrame()
    aux['country'] = row.xpath('.//td/a/text()')
    aux['rank'] = row.xpath('./td[2]/text()')[-1].split()
    aux['epi_score'] = row.xpath('./td[3]/text()')[-1].split()

    df = pd.concat([df, aux])

In [67]:
df

Unnamed: 0,country,rank,epi_score
0,Denmark,1,77.90
0,United Kingdom,2,77.70
0,Finland,3,76.50
0,Malta,4,75.20
0,Sweden,5,72.70
...,...,...,...
0,Pakistan,176,24.60
0,Bangladesh,177,23.10
0,Viet Nam,178,20.10
0,Myanmar,179,19.40


### Resto de variables environmental

#### Ecosystem Vitality
The Ecosystem Vitality policy objective measures how well countries are preserving, protecting, and enhancing ecosystems and the services they provide. It comprises 42% of the total EPI score and is made up of six issue categories: Biodiversity & Habitat, Ecosystem Services, Fisheries, Acid Rain, Agriculture, and Water Resources.

In [68]:
url1 = 'https://epi.yale.edu/epi-results/2022/component/eco'

#### Biodiversity & Habitat
The Biodiversity and Habitat  issue category assesses countries’ actions toward retaining natural ecosystems and protecting the full range of biodiversity within their borders. It consists of seven indicators: terrestrial biome protection (weighted for the national and global rarity of biomes), marine protected areas, Protected Areas Representativeness Index, Species Habitat Index, Species Protection Index, and Biodiversity Habitat Index.

In [69]:
url2 = 'https://epi.yale.edu/epi-results/2022/component/bdh'

#### Environmental Health
The Environmental Health policy objective measures how well countries are protecting their populations from environmental health risks. It comprises 20% of the total EPI score and is made up of four issue categories: Air Quality, Sanitation & Drinking Water, Heavy Metals, and Waste Management.

In [70]:
url3 = 'https://epi.yale.edu/epi-results/2022/component/hlt'

#### Air Quality
The Air Quality issue category measures the direct impacts of air pollution on human health in each country. It consists of seven indicators: PM2.5 exposure, household solid fuels, ozone exposure, nitrogen oxides exposure, sulfur dioxide exposure, carbon monoxide exposure, and volatile organic compound exposure.

In [71]:
url4 = 'https://epi.yale.edu/epi-results/2022/component/air'

#### Unsafe Sanitation
We measure recycling rates as the proportion of recyclable materials (metal, plastic, paper, and glass) recycled in each country. A score of 100 indicates a country recycles all recyclable post-consumer material, while a score of 0 indicates a country recycles no recyclable post-consumer material.

In [73]:
url5 = 'https://epi.yale.edu/epi-results/2022/component/usd'

#### Recycling
We measure recycling rates as the proportion of recyclable materials (metal, plastic, paper, and glass) recycled in each country. A score of 100 indicates a country recycles all recyclable post-consumer material, while a score of 0 indicates a country recycles no recyclable post-consumer material.

In [74]:
url6 = 'https://epi.yale.edu/epi-results/2022/component/rec'

#### Ocean Plastics
We measure ocean plastic pollution as the absolute quantity, in millions of metric tons, of plastics a country releases into the oceans in a given year. A score of 100 indicates a country emits zero tons of plastic each year, while a score of 0 indicates a country is among the highest (≥99th-percentile) ocean plastic polluters.

In [75]:
url7 = 'https://epi.yale.edu/epi-results/2022/component/ocp'

#### Climate Change Policy Objective
The 2022 EPI introduces Climate Change as a new policy objective. It comprises 38% of the total EPI score and is made up of a single issue category: Climate Change Mitigation.

The Climate Change Mitigation issue category measures progress to combat global climate change, which exacerbates other environmental threats and imperils human health and safety. It is composed of nine indicators: adjusted emission growth rates for four greenhouse gases (CO2, CH4, F-gases, and N2O) and one climate pollutant (black carbon); projected greenhouse gas emissions in 2050; growth rate in CO2 emissions from land cover; greenhouse gas intensity growth rate; and greenhouse gas emissions per capita.

 

In [76]:
url8 = 'https://epi.yale.edu/epi-results/2022/component/cch'

### Sacamos todo junto

In [81]:
list_urls = [url1, url2, url3, url4, url5, url6, url7, url8]
col_names = ['eco', 'bdh']
list_urls

['https://epi.yale.edu/epi-results/2022/component/eco',
 'https://epi.yale.edu/epi-results/2022/component/bdh',
 'https://epi.yale.edu/epi-results/2022/component/hlt',
 'https://epi.yale.edu/epi-results/2022/component/air',
 'https://epi.yale.edu/epi-results/2022/component/usd',
 'https://epi.yale.edu/epi-results/2022/component/rec',
 'https://epi.yale.edu/epi-results/2022/component/ocp',
 'https://epi.yale.edu/epi-results/2022/component/cch']

In [89]:
df_final = pd.DataFrame()

for url in list_urls:

    driver = webdriver.Firefox()
    driver.get(url)
    time.sleep(5)
    html = lxml.html.document_fromstring(driver.page_source)
    driver.close()

    col_name = url.split(sep='/')[-1] + '_score'

    tabla = html.xpath('//*[@id="DataTables_Table_0"]')
    rows = tabla[0].xpath('./tbody//tr')

    for row in rows:
        aux = pd.DataFrame()
        aux['country'] = row.xpath('.//td/a/text()')
        aux['rank'] = row.xpath('./td[2]/text()')[-1].split()
        aux[col_name] = row.xpath('./td[3]/text()')[-1].split()

        df_final = pd.concat([df_final, aux])
    
    

se ha sacado regular por culpa del rango (habría que quitarlo o meter un merge por country en vez de un concat) pero da igual, lo arreglamos aquí en vez de volver a scrapear todo, es más rápido

In [113]:
# arreglamos el dataframe y juntamos con el que sacamos original
environment = df_final.drop('rank', axis = 1).groupby('country').first().reset_index()
environment = pd.merge(df, environment, on = 'country')

In [118]:
environment

Unnamed: 0,country,rank,epi_score,eco_score,bdh_score,hlt_score,air_score,usd_score,rec_score,ocp_score,cch_score
0,Denmark,1,77.90,61.30,76.90,85.50,80.50,99.90,35.40,39.10,92.40
1,United Kingdom,2,77.70,62.30,81.50,83.90,78.60,100.00,34.20,24.40,91.50
2,Finland,3,76.50,62.00,71.10,93.40,93.50,100.00,35.40,46.50,83.60
3,Malta,4,75.20,68.20,72.90,76.50,73.20,99.50,10.50,59.70,82.30
4,Sweden,5,72.70,60.60,68.80,93.10,94.00,99.60,39.70,44.80,75.40
...,...,...,...,...,...,...,...,...,...,...,...
175,Pakistan,176,24.60,37.80,37.40,11.40,5.70,20.90,13.80,10.20,16.90
176,Bangladesh,177,23.10,29.40,37.40,18.10,14.40,28.00,15.70,14.60,18.80
177,Viet Nam,178,20.10,22.10,27.90,35.10,26.50,55.70,46.00,12.30,10.10
178,Myanmar,179,19.40,20.20,21.80,21.60,16.90,32.30,12.30,12.50,17.30


In [117]:
environment.to_excel('data/processed/environmental_variables.xlsx')

# Restaurants

In [141]:
url = 'https://www.tasteatlas.com/best/cuisines'

In [146]:
driver = webdriver.Firefox()
driver.get(url)
time.sleep(5)

# we find the button to see the complete list and click on it
try:
    button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "section-button.with-border"))
    )
    button.click()
except:
    print("Error: Button not found (Option 1).")


html = lxml.html.document_fromstring(driver.page_source)
driver.close()

Error: Button not found (Option 2).


In [147]:
rows = html.xpath('//*[@id="BestCuisines"]//a[@class="content-holder content"]')
len(rows)

100

In [149]:
rows[0].xpath('.//@href')

['italy']

In [160]:
countries = []
rankings = []

ranking = 1
for row in rows:
    country = row.xpath('.//@href')
    countries.extend(country)
    rankings.append(ranking)
    ranking += 1

cuisine = pd.DataFrame({'Country': countries, 'Ranking': rankings})
cuisine

Unnamed: 0,Country,Ranking
0,italy,1
1,japan,2
2,greece,3
3,portugal,4
4,china,5
...,...,...
95,northern-ireland,96
96,the-bahamas,97
97,dominican-republic,98
98,wales,99


In [161]:
cuisine.to_excel('data/processed/cuisines_variable.xlsx')

# Global Peace & Global Terrorism Index
Produced by the Institute for Economics and Peace (IEP), the __Global Peace Index (GPI)__ is the world’s leading measure of global peacefulness. This report presents the most comprehensive data-driven analysis to-date on trends in peace, its economic value, and how to develop peaceful societies. The Global Peace Index covers 99.7% of the world’s population, and is calculated using 23 qualitative and quantitative indicators from highly respected sources, and measures the state of peace across three domains:

- the level of Societal Safety and Security,
- the extent of Ongoing Domestic and International Conflict,
- and the degree of Militarisation.

The __Global Terrorism Index (GTI)__ is a composite measure made up of four indicators: incidents, fatalities, injuries and hostages. To measure the impact of terrorism, a five-year weighted average is applied.



In [None]:
# fuente sacada de internet, no cuadra con el dato de la web
gpi = pd.read_stata('./data/raw/data_peace.dta')

In [None]:
gpi.query('year == 2023').sort_values(by = 'peace').head(20)

Unnamed: 0,year,country,peace,id
2507,2023,Iceland,1.0,63.0
2535,2023,Mauritius,1.0,91.0
2572,2023,Singapore,1.0,128.0
2601,2023,Uruguay,1.0,157.0
2532,2023,Malaysia,1.005,88.0
2546,2023,NewZealand,1.009,102.0
2462,2023,Botswana,1.018,18.0
2512,2023,Ireland,1.028,68.0
2469,2023,Canada,1.037,25.0
2583,2023,Switzerland,1.046,139.0


In [None]:
url_peace = 'https://www.visionofhumanity.org/maps/#/'
url_terr = 'https://www.visionofhumanity.org/maps/global-terrorism-index/#/'

In [None]:
year_option

NameError: name 'year_option' is not defined

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Replace with the actual website URL 
url = "https://www.visionofhumanity.org/maps/#/"

# Specify your desired year
year = "2021" 

# Set up Selenium WebDriver (replace with your preferred browser)
driver = webdriver.Chrome()
driver.get(url)

# Option 1: Targeting by multiple classes
try:
    year_filter = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "div.voh-selector.voh-selector__three.FJdvw > div.a > div.kpJrqq"))
    )
except:
    print("Error: Could not locate year filter (Option 1).")

# Option 2: Targeting by text "Year" (try this if Option 1 fails)
try:
    year_filter = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//div[normalize-space()='Year']/ancestor::div[contains(@class, 'voh-selector')]"))
    )
except:
    print("Error: Could not locate year filter (Option 2).")

# Click the year filter (assuming it's a clickable element)
year_filter.click()

# Click on the desired year (how to do this depends on the element type)
# Example 1: If it's a dropdown
year_filter.click()
year_option = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, f"//div[contains(@class,'fzsyuK') and text()='{year}']"))
)
year_option.click()

# Wait for the data table to update (several strategies can be used)
try:
    data_table = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "dataTable"))  # Assuming the table has an ID
    )
    print("Data table updated!")
except:
    print("Error: Data table might not have updated.") 

# ... (You can add more sophisticated wait logic if needed)


TimeoutException: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6063AAD02+56930]
	(No symbol) [0x00007FF60631F602]
	(No symbol) [0x00007FF6061D42E5]
	(No symbol) [0x00007FF6062198ED]
	(No symbol) [0x00007FF606219A2C]
	(No symbol) [0x00007FF60625A967]
	(No symbol) [0x00007FF60623BCDF]
	(No symbol) [0x00007FF6062581E2]
	(No symbol) [0x00007FF60623BA43]
	(No symbol) [0x00007FF60620D438]
	(No symbol) [0x00007FF60620E4D1]
	GetHandleVerifier [0x00007FF606726F8D+3711213]
	GetHandleVerifier [0x00007FF6067804CD+4077101]
	GetHandleVerifier [0x00007FF60677865F+4044735]
	GetHandleVerifier [0x00007FF606449736+706710]
	(No symbol) [0x00007FF60632B8DF]
	(No symbol) [0x00007FF606326AC4]
	(No symbol) [0x00007FF606326C1C]
	(No symbol) [0x00007FF6063168D4]
	BaseThreadInitThunk [0x00007FFC206B53E0+16]
	RtlUserThreadStart [0x00007FFC21AE485B+43]


# Seasonality
The best time of the year to visit every country in the world by Frequent Miler.
The goal was to find months of overlapping agreement between all of these “authorities,” prioritizing optimal weather and moderate crowds.
- Lonely Planet, Rough Guides, Bradt, Fodor's, Travel + Leisure