In [1]:
import json
from io import StringIO

import numpy as np
import openpyxl
import pandas as pd
import requests
from bs4 import BeautifulSoup

# SLEEP QUALITY AND DURATION DATA

In [2]:
# Importing the two datasets: a local country lookup CSV and the Sleep Cycle
# per-country statistics, which the endpoint returns as JSON.
country_abbrevs = pd.read_csv('data/country_abbrev_list.csv')

# The response is named `sleep_response` rather than `re` so the name of the
# standard-library `re` module is not shadowed. The timeout stops the cell from
# hanging forever on a stalled connection, and raise_for_status() reports an
# HTTP error here instead of as a confusing JSON/DataFrame failure further down.
sleep_response = requests.get('https://sandman.sleepcycle.com/data/per-country-stats', timeout=30)
sleep_response.raise_for_status()
country_sleep_data = pd.DataFrame(sleep_response.json())

In [3]:
# Display the per-country sleep statistics exactly as loaded from the JSON response.
country_sleep_data

Unnamed: 0,country,avg_sleep_quality,avg_duration,avg_snore_duration,avg_bedtime,avg_wakeup,group_size
0,ca,0.7706280786149319,27203.393388256663,2658.828667280861,-0.010468966378401922,0.30500672811254403,28646
1,co,0.7297271999818904,25615.27579824156,1437.7857592261742,-0.03228318759677625,0.2644591619912697,2161
2,in,0.7060974741852611,25093.229276895945,2137.2825717013548,0.004082686733507055,0.2950252965085311,1134
3,il,0.7406121077804889,26042.273445212242,2051.836718594944,0.017049587070040857,0.31956485420204445,2026
4,se,0.786575637584524,27685.027529392293,3116.2711398811175,-0.007667250763352499,0.3134636193388775,24156
...,...,...,...,...,...,...,...
57,cy,0.7502196809844399,27078.2886002886,2816.03259211972,0.0043422235917116225,0.31831668729306556,693
58,jp,0.6784701596828577,24236.974481437002,3323.9126377116636,0.00410389190726472,0.2842500281637383,100711
59,gb,0.7840466475722139,27963.86005062065,2777.4097512704925,-0.00660779543906787,0.31820536414364314,65981
60,pe,0.7117202666169559,25158.92667509482,1714.1829438347725,-0.010349246817979626,0.2795599722538905,791


In [4]:
# Attach full country names to the sleep statistics.
# The 'country' codes are upper-cased (in place on country_sleep_data) so they
# can be matched against the lookup table's 'Code' column. pd.merge's default
# inner join keeps only codes that appear in both frames.
country_sleep_data['country'] = country_sleep_data['country'].str.upper()
sleep_with_names = country_sleep_data.merge(
    country_abbrevs,
    left_on='country',
    right_on='Code',
)
# The two code columns are redundant once 'Name' becomes the index.
sleep_by_country = sleep_with_names.drop(columns=['country', 'Code']).set_index('Name')

In [5]:
# Cast every column to float, then rescale the two duration columns and round
# them to 2 decimals: sleep duration is divided by 3600 (seconds -> hours) and
# snore duration by 60 (seconds -> minutes).
# NOTE(review): assumes the source durations are in seconds -- confirm against the API.
SECONDS_PER_HOUR = 3600
SECONDS_PER_MINUTE = 60
sleep_by_country = sleep_by_country.astype(float).assign(
    avg_duration=lambda frame: frame['avg_duration'].div(SECONDS_PER_HOUR).round(2),
    avg_snore_duration=lambda frame: frame['avg_snore_duration'].div(SECONDS_PER_MINUTE).round(2),
)

In [6]:
# Keep only the sleep-quality and sleep-duration columns and give them readable
# names. The bedtime, wake-up, group-size and snore-duration columns are dropped.
unused_columns = ['avg_bedtime', 'avg_wakeup', 'group_size', 'avg_snore_duration']
readable_names = {
    'avg_sleep_quality': 'Average Sleep Quality (0-1)',
    'avg_duration': 'Average Sleep Duration (Hours)',
}
sleep_by_country = sleep_by_country.drop(columns=unused_columns).rename(columns=readable_names)

In [7]:
# Display the cleaned sleep table (indexed by country name).
sleep_by_country

Unnamed: 0_level_0,Average Sleep Quality (0-1),Average Sleep Duration (Hours)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Canada,0.770628,7.56
Colombia,0.729727,7.12
India,0.706097,6.97
Israel,0.740612,7.23
Sweden,0.786576,7.69
...,...,...
Cyprus,0.750220,7.52
Japan,0.678470,6.73
United Kingdom,0.784047,7.77
Peru,0.711720,6.99


# SOCIOECONOMIC DATA

In [8]:
# Importing and cleaning the dataset: drop the columns that are not used and
# keep only the most recent year available for each country.
socioeconomic_data = pd.read_csv('data/raw_socioeconomic_data.csv', encoding = 'latin1')
socioeconomic_data = socioeconomic_data.drop(['unid', 'wbid', 'popshare', 'yrseduc'], axis=1)

# Compare each row's year with that country's own latest year instead of
# hard-coding 2010. The result is the same when every country's latest year is
# 2010, but a country whose data ends earlier (or later) is no longer silently
# dropped or left on a stale year. A boolean mask keeps the original row order.
latest_year = socioeconomic_data.groupby('country')['year'].transform('max')
socioeconomic_data = socioeconomic_data[socioeconomic_data['year'] == latest_year]
socioeconomic_data = socioeconomic_data.set_index('country')

In [9]:
# Display the filtered socioeconomic table (indexed by country).
socioeconomic_data

Unnamed: 0_level_0,year,SES,gdppc
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,2010,5.676400,1662.8035
Angola,2010,21.247763,6492.1768
Albania,2010,74.860367,9927.1758
United Arab Emirates,2010,89.092285,57406.7380
Argentina,2010,79.750809,18794.2700
...,...,...,...
Vietnam,2010,38.222027,4486.2612
Yemen,2010,19.492294,4481.5547
South Africa,2010,71.227959,12028.9340
Zambia,2010,27.127140,3263.3948


# WHO ENVIRONMENTAL DATA

In [10]:
# Read the WHO air-quality CSV; the file is decoded as Latin-1.
who_air_quality_path = 'data/who_2022_air_quality.csv'
air_df = pd.read_csv(who_air_quality_path, encoding='latin1')

In [11]:
# Reduce the WHO table to one row per country: for each 'WHO Country Name'
# group, idxmax() gives the label of the row with the largest 'Measurement Year'.
# The result is then indexed by country and stripped of metadata and
# temporal-coverage columns.
latest_row_labels = air_df.groupby('WHO Country Name')['Measurement Year'].idxmax()
columns_to_discard = [
    'WHO Region',
    'ISO3',
    'Version of the database',
    'Number and type of monitoring stations',
    'City or Locality',
    'Reference',
    'Status',
    'Unnamed: 15',
    'Unnamed: 16',
    'PM25 temporal coverage (%)',
    'PM10 temporal coverage (%)',
    'NO2 temporal coverage (%)',
]
air_df = (
    air_df.loc[latest_row_labels]
    .set_index('WHO Country Name')
    .drop(columns=columns_to_discard)
)

In [12]:
# Display the cleaned WHO air-quality table (one row per country).
air_df

Unnamed: 0_level_0,Measurement Year,PM2.5 (?g/m3),PM10 (?g/m3),NO2 (?g/m3)
WHO Country Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,2019,119.77,,
Albania,2019,10.32,,
Algeria,2019,21.53,,
Andorra,2019,,24.58,31.01
Argentina,2019,,25.50,18.25
...,...,...,...,...
United States of America,2019,,18.30,17.70
Uruguay,2018,12.00,19.00,
Uzbekistan,2019,41.96,,
Venezuela (Bolivarian Republic of),2012,,45.00,


# New Environmental Dataset (Wikipedia air pollution table; used in the merge below instead of the WHO data)

In [13]:
# Scrape the first 'wikitable' from Wikipedia's list of countries by air
# pollution and keep a single pollution column, indexed by country.
# The response is named `air_page` rather than `re` so the name of the
# standard-library `re` module is not shadowed. The timeout and
# raise_for_status() make network and HTTP failures explicit at this line.
air_page = requests.get('https://en.wikipedia.org/wiki/List_of_countries_by_air_pollution', timeout=30)
air_page.raise_for_status()
soup = BeautifulSoup(air_page.text, 'html.parser')
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    # find() returns None when nothing matches; without this check the literal
    # string "None" would be handed to read_html and fail with an obscure error.
    raise ValueError("No table with class 'wikitable' was found on the air pollution page")

# Passing a literal HTML string to read_html is deprecated (it emits a
# FutureWarning), so the markup is wrapped in a file-like StringIO object.
air_quality_df = pd.read_html(StringIO(str(table)))[0].drop(columns=['Rank', 'Loss of life expectancy (years)'])
# Whatever the last remaining column is called on the page, give it a stable name.
air_quality_df = air_quality_df.rename(columns={air_quality_df.columns[-1]: 'Air Pollution (PM 2.5)'}).set_index('Country')

  air_quality_df = pd.read_html(str(table))[0].drop(columns=['Rank', 'Loss of life expectancy (years)'])


In [14]:
# Display the scraped air-pollution table (indexed by country).
air_quality_df

Unnamed: 0_level_0,Air Pollution (PM 2.5)
Country,Unnamed: 1_level_1
Bangladesh,54.17
India,41.39
Nepal,39.18
Qatar,39.16
Pakistan,38.90
...,...
Cook Islands,1.13
Greenland,1.11
Marshall Islands,1.02
Federated States of Micronesia,0.94


# Merging our 3 Datasets

In [15]:
# Combine the three tables on their country-name indexes. Both joins are left
# joins anchored on the sleep data, so a country missing from the socioeconomic
# or air-pollution table gets NaN in those columns rather than being removed.
sleep_with_socioeconomic = sleep_by_country.merge(
    socioeconomic_data,
    left_index=True,
    right_index=True,
    how='left',
)
merged_df = sleep_with_socioeconomic.merge(
    air_quality_df,
    left_index=True,
    right_index=True,
    how='left',
).drop(columns=['year'])

In [16]:
# Preview the first rows of the merged table.
merged_df.head()

Unnamed: 0_level_0,Average Sleep Quality (0-1),Average Sleep Duration (Hours),SES,gdppc,Air Pollution (PM 2.5)
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Canada,0.770628,7.56,93.772118,40699.355,6.04
Colombia,0.729727,7.12,59.636009,10900.504,16.23
India,0.706097,6.97,22.542639,4404.5376,41.39
Israel,0.740612,7.23,86.517677,29599.793,14.32
Sweden,0.786576,7.69,91.959633,42942.563,4.86


In [51]:
# Map each column name of the merged table to that column's describe() summary.
df_dict = {col: merged_df[col].describe() for col in merged_df.columns}

In [52]:
# Assemble the per-column summaries into one table and remove the count and
# quartile rows.
pd.DataFrame(df_dict).drop(index=['count', '25%', '50%', '75%'])

Unnamed: 0,Average Sleep Quality (0-1),Average Sleep Duration (Hours),SES,gdppc,Air Pollution (PM 2.5)
mean,0.74935,7.381935,77.829817,28981.52236,13.212982
std,0.029136,0.278726,17.27471,17941.554663,7.206984
min,0.67847,6.73,22.542639,4404.5376,3.36
max,0.804231,7.91,97.551964,89727.039,41.39


In [53]:
# Correlation (Series.corr, default method) between the 'gdppc' column
# (presumably GDP per capita) and average sleep quality.
gdppc_values = merged_df['gdppc']
sleep_quality_values = merged_df['Average Sleep Quality (0-1)']
gdppc_values.corr(sleep_quality_values)

0.4134036099512685

In [57]:
# Correlation (Series.corr, default method) between the air-pollution column
# and average sleep duration.
air_pollution_values = merged_df['Air Pollution (PM 2.5)']
sleep_duration_values = merged_df['Average Sleep Duration (Hours)']
air_pollution_values.corr(sleep_duration_values)

-0.6052384965956255