In [75]:
import pandas as pd
import numpy as np
import json
import requests
import openpyxl

### SLEEP QUALITY AND DURATION DATA

In [59]:
# Load the two inputs: a local lookup table of country codes/names and the
# per-country sleep statistics served by the Sleep Cycle endpoint.
country_abbrevs = pd.read_csv('country_abbrev_list.csv')

# A timeout keeps the notebook from hanging forever if the server never answers,
# and raise_for_status() stops here on an HTTP error instead of building a
# DataFrame out of an error payload.
# NOTE(review): the name `re` collides with the stdlib regex module if that is
# ever imported; consider renaming once no other cell relies on it.
re = requests.get('https://sandman.sleepcycle.com/data/per-country-stats', timeout=30)
re.raise_for_status()
country_sleep_data = pd.DataFrame(re.json())

In [60]:
# Join the sleep statistics to the country lookup table on the upper-cased
# country code, then index the result by the full country name.
# The join is an inner merge, so rows without a matching code are dropped.
country_sleep_data['country'] = country_sleep_data['country'].str.upper()
sleep_by_country = (
    country_sleep_data
    .merge(country_abbrevs, left_on='country', right_on='Code')
    .drop(columns=['country', 'Code'])
    .set_index('Name')
)

In [61]:
# Cast every column to float, then rescale the two duration columns:
# sleep duration is divided by 3600 (seconds -> hours) and snore duration
# by 60 (seconds -> minutes); both are rounded to two decimals.
sleep_by_country = sleep_by_country.astype(float).assign(
    avg_duration=lambda frame: (frame['avg_duration'] / 3600).round(2),
    avg_snore_duration=lambda frame: (frame['avg_snore_duration'] / 60).round(2),
)

In [62]:
# Give the three kept metrics human-readable labels and discard the
# bedtime, wake-up and group-size columns.
sleep_by_country = (
    sleep_by_country
    .rename(columns={
        'avg_sleep_quality': 'Average Sleep Quality (0-1)',
        'avg_duration': 'Average Sleep Duration (Hours)',
        'avg_snore_duration': 'Average Snore Duration (Minutes)',
    })
    .drop(columns=['avg_bedtime', 'avg_wakeup', 'group_size'])
)

In [69]:
# Display the cleaned per-country sleep table (indexed by country name) as the cell output
sleep_by_country

Unnamed: 0_level_0,Average Sleep Quality (0-1),Average Sleep Duration (Hours),Average Snore Duration (Minutes)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Italy,0.748668,7.38,52.83
Switzerland,0.767681,7.49,42.45
United Kingdom,0.780650,7.67,46.21
Austria,0.763325,7.47,45.16
Israel,0.735570,7.24,37.35
...,...,...,...
United States,0.769510,7.56,42.06
Spain,0.748990,7.46,44.85
Indonesia,0.742885,7.34,39.82
Czech Republic,0.765598,7.46,46.05


### SOCIOECONOMIC DATA

In [99]:
# Read the socioeconomic dataset, discard the identifier/extra columns,
# keep only the rows whose year is 2010, and index the result by country.
# NOTE(review): the filter is a fixed year, not a per-country "latest year";
# a country with no 2010 row is dropped — confirm this is intended.
socioeconomic_data = (
    pd.read_csv('GLOB.SES.csv', encoding='latin1')
    .drop(columns=['unid', 'wbid', 'popshare', 'yrseduc'])
    .loc[lambda frame: frame['year'] == 2010]
    .set_index('country')
)

In [102]:
# Display the filtered socioeconomic table (indexed by country) as the cell output
socioeconomic_data

Unnamed: 0_level_0,year,SES,gdppc
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,2010,5.676400,1662.8035
Angola,2010,21.247763,6492.1768
Albania,2010,74.860367,9927.1758
United Arab Emirates,2010,89.092285,57406.7380
Argentina,2010,79.750809,18794.2700
...,...,...,...
Vietnam,2010,38.222027,4486.2612
Yemen,2010,19.492294,4481.5547
South Africa,2010,71.227959,12028.9340
Zambia,2010,27.127140,3263.3948


### WHO ENVIRONMENTAL DATA

In [121]:
# Read the WHO air-quality CSV; the file is decoded as latin1.
air_df = pd.read_csv(
    'data/who_2022_air_quality.csv',
    encoding='latin1',
)

In [122]:
# For each country, find the row label holding its largest measurement year,
# keep just those rows, index by country name, and discard the metadata columns.
# NOTE(review): idxmax yields one label per group, so each country is reduced
# to a single row (one city/locality) rather than all rows of its latest year.
latest_row_labels = air_df.groupby('WHO Country Name')['Measurement Year'].idxmax()
air_metadata_columns = [
    'WHO Region',
    'ISO3',
    'Version of the database',
    'Number and type of monitoring stations',
    'Reference',
    'Status',
    'Unnamed: 15',
    'Unnamed: 16',
]
air_df = (
    air_df.loc[latest_row_labels]
    .set_index('WHO Country Name')
    .drop(columns=air_metadata_columns)
)

In [131]:
# Display the cleaned air-quality table (indexed by WHO country name) as the cell output
air_df

Unnamed: 0_level_0,City or Locality,Measurement Year,PM2.5 (?g/m3),PM10 (?g/m3),NO2 (?g/m3),PM25 temporal coverage (%),PM10 temporal coverage (%),NO2 temporal coverage (%)
WHO Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Afghanistan,Kabul,2019,119.77,,,18.0,,
Albania,Vlore,2019,10.32,,,,,
Algeria,Algiers,2019,21.53,,,44.0,,
Andorra,Escaldes-Engordany,2019,,24.58,31.01,,,92.180365
Argentina,Buenos Aires,2019,,25.50,18.25,,97.2,87.500000
...,...,...,...,...,...,...,...,...
United States of America,Albuquerque (Nm),2019,,18.30,17.70,,,
Uruguay,Montevideo,2018,12.00,19.00,,,,
Uzbekistan,Tashkent,2019,41.96,,,81.0,,
Venezuela (Bolivarian Republic of),Caracas,2012,,45.00,,,,
