In [2]:
pip install pycountry



In [3]:
pip install wbdata

Collecting wbdata
  Downloading wbdata-0.3.0-py3-none-any.whl (14 kB)
Installing collected packages: wbdata
Successfully installed wbdata-0.3.0


In [4]:
import wbdata
import pandas as pd
import datetime
from geopy.geocoders import Nominatim
import pycountry
import json
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random

In [5]:
# World Bank indicator code -> column label used for that series in the notebook.
indicators = dict([
    ('NY.GDP.PCAP.KD', 'GDP per capita'),
    ('SH.PRV.SMOK', 'Prevalence of total tobacco use'),
    ('SH.PRV.SMOK.FE', 'Tobacco use by Females'),
    ('SH.PRV.SMOK.MA', 'Tobacco use by males'),
    ('SH.ALC.PCAP.LI', 'Prevalence of total alcohol use'),
    ('SH.ALC.PCAP.FE.LI', 'Alcohol use by females'),
    ('SH.ALC.PCAP.MA.LI', 'Alcohol use by males'),
])

In [None]:
# southeast_asia(in case) = [
#     'BRN', 'KH' , 'ID', 'LA', 'MY', 'MM', 'PH', 'SG', 'TH', 'VN'
# ]

# Download all indicators and turn the index levels (country, date) into ordinary columns.
# NOTE(review): country=[] is presumably meant as "no country filter" - confirm that
# wbdata returns every country for an empty list.
df = wbdata.get_dataframe(indicators, country=[]).reset_index()
years_of_interest = ['2000', '2005', '2010', '2015', '2019']

# Keep only the snapshot years (dates are compared as strings) and order the rows by
# country, then year. A plain two-key sort gives that ordering without
# groupby().apply(), whose handling of the grouping column is deprecated in recent pandas.
df = (
    df[df['date'].isin(years_of_interest)]
    .sort_values(['country', 'date'], ascending=True)
    .reset_index(drop=True)
)
df

In [None]:
# List the column labels of the filtered frame (one per indicator plus country/date).
df.columns

In [None]:
# Summary statistics for the numeric indicator columns.
df.describe()

In [None]:
# Number of missing values in each column
print(df.isna().sum())

In [None]:
# Lookup from pycountry's country name to its ISO alpha-3 code
country_name_to_code = dict(
    (country.name, country.alpha_3) for country in pycountry.countries
)

# New column with the code for each row; names pycountry does not know get None
df['country_code'] = df['country'].apply(country_name_to_code.get)

In [None]:
# Manual ISO alpha-3 codes for World Bank country names that differ from the
# names used by pycountry (so the automatic lookup returned no code for them).
country_codes = {
    'Bahamas, The':'BHS',
    'Bolivia' :'BOL',
    'British Virgin Islands':'VGB',
    'Congo, Dem. Rep.':'COD',  # alpha-3; the two-letter 'CD' would not match the map's ISO_A3 keys
    'Congo, Rep.' :'COG',
    "Cote d'Ivoire" :'CIV',
    'Egypt, Arab Rep.':'EGY',
    'Gambia, The':'GMB',
    'Hong Kong SAR, China':'HKG',
    'Iran, Islamic Rep.':'IRN',
    "Korea, Dem. People's Rep.":'PRK',
    'Korea, Rep.':'KOR',
    # NOTE(review): Kosovo has no official ISO code - confirm 'XXK' matches the
    # ISO_A3 value used for it in countries.geojson.
    'Kosovo':'XXK',
    'Kyrgyz Republic' :'KGZ',
    'Lao PDR' :'LAO',
    'Macao SAR, China':'MAC',
    'Micronesia, Fed. Sts.':'FSM',
    'Moldova':'MDA',
    'Slovak Republic' :'SVK',
    'St. Kitts and Nevis' :'KNA',
    'St. Lucia':'LCA',
    'St. Vincent and the Grenadines' :'VCT',
    'Tanzania':'TZA',
    'Turkiye' :'TUR',
    'Venezuela, RB':'VEN',
    'Virgin Islands (U.S.)' :'VIR',
    'West Bank and Gaza':'PSE',
    'Yemen, Rep.':'YEM'  # Add more countries as needed
}

# Write the manual code onto every row of the matching country
# (this also overrides any code the automatic lookup may have produced).
for country, code in country_codes.items():
    df.loc[df['country'] == country, 'country_code'] = code

In [None]:
# Labels for the four income bands, lowest to highest
income_levels = ['Low', 'Lower-middle', 'Upper-middle', 'High']

# Band edges in the units of 'GDP per capita'. The top band is open-ended so that
# very rich economies are labelled 'High' instead of falling outside the last bin.
# NOTE(review): the thresholds look like World Bank GNI-per-capita cut-offs - confirm
# they are appropriate for the constant-price GDP per capita series used here.
bins = [0, 1086, 4255, 13205, np.inf]

# Assign each row to its band (intervals are right-inclusive: (0, 1086], (1086, 4255], ...)
df['Income Level'] = pd.cut(df['GDP per capita'], bins=bins, labels=income_levels)
df

In [None]:
# Put identifiers and income first, then tobacco, then alcohol, with the ISO code last
df = df.loc[:, [
    'country',
    'date',
    'GDP per capita',
    'Income Level',
    'Prevalence of total tobacco use',
    'Tobacco use by Females',
    'Tobacco use by males',
    'Prevalence of total alcohol use',
    'Alcohol use by females',
    'Alcohol use by males',
    'country_code',
]]
df

In [None]:
# Missing values per column after adding country codes and income levels
print(df.isna().sum())

In [None]:
# Names that still have no country code after both lookups
rows_without_code = df['country_code'].isna()
testing_missing = df.loc[rows_without_code, 'country'].unique()
print(testing_missing)

In [None]:
# Rows without a country code, kept aside as an independent copy for later plots
missing_country = df.loc[df['country_code'].isna()]
missing_country_codes = missing_country.copy()

missing_country_codes

In [None]:
# Drop rows that have no country code or no total tobacco figure.
# .copy() makes the result an independent frame, so the columns assigned to df
# further down do not trigger pandas' SettingWithCopyWarning.
df_cleaned = df.dropna(subset=['country_code', 'Prevalence of total tobacco use']).copy()
df = df_cleaned
df

In [None]:
# Remaining missing values per column after the cleaning step
print(df.isna().sum())

In [None]:
# Rows that did not fall into any income band
income_level_null = df.loc[df['Income Level'].isna()]
print(income_level_null)

In [None]:
# Rows without a country code, ordered by year; the regional aggregates listed
# below are plotted from this frame.
sorted_missing_country_codes = missing_country_codes.sort_values('date', ascending=True)
country_names = [
    'Africa Eastern and Southern',
    'Africa Western and Central',
    'East Asia & Pacific',
    'Europe & Central Asia',
    'Latin America & Caribbean',
    'Middle East & North Africa',
    'North America',
    'South Asia',
    'Sub-Saharan Africa',
    'Arab World',
]

countries_to_plot = sorted_missing_country_codes.country  # Add more countries as needed

fig, ax = plt.subplots(figsize=(11, 7))

# Draw one labelled line per region on the shared axes
for country_name in country_names:
    country_data = sorted_missing_country_codes[sorted_missing_country_codes['country'] == country_name]
    sns.lineplot(
        data=country_data,
        x='date',
        y='Prevalence of total tobacco use',
        marker='o',
        label=country_name,
        ax=ax,
    )

ax.set_xlabel('Year')
ax.set_ylabel('Smokers % of adult population')
ax.set_title('Trend of Smokers % of adult population over the Years')
ax.grid(True)
ax.legend()  # legend tells the regions apart
plt.show()

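### Explaining the decrease

Source: https://www3.paho.org/hq/index.php?option=com_content&view=article&id=13496:who-report-finds-dramatic-increase-in-life-saving-tobacco-control-policies-in-last-decade&Itemid=0&lang=en#gsc.tab=0

Likely driver: more tobacco control policies. The WHO report linked above found a dramatic increase in life-saving tobacco control policies over the last decade.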

In [None]:
# One frame per snapshot year, taken from the cleaned data ('date' holds year strings)
df_2000 = df_cleaned.loc[df_cleaned['date'].eq('2000')]
df_2005 = df_cleaned.loc[df_cleaned['date'].eq('2005')]
df_2010 = df_cleaned.loc[df_cleaned['date'].eq('2010')]
df_2015 = df_cleaned.loc[df_cleaned['date'].eq('2015')]
df_2019 = df_cleaned.loc[df_cleaned['date'].eq('2019')]

In [None]:
# Load GeoJSON data from file into a variable
with open('countries.geojson', 'r') as geojson_file:
    geojson_data = json.load(geojson_file)

# px.set_mapbox_access_token("your_mapbox_token")

fig = px.choropleth_mapbox(
    df_2019,
    geojson=geojson_data,
    locations="country_code",
    featureidkey="properties.ISO_A3",
    color="Prevalence of total tobacco use",
    color_continuous_scale="sunset",
    range_color=(10, 50),
    title="Tobacco Use by Country",
    mapbox_style="carto-positron",
    center={"lat": 0, "lon": 0},
    zoom=1,
)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

## General graphs to make (refer to Happiness Report) (Toshi)

### Heatmap

In [None]:
# Correlate the numeric columns only: df also holds text and categorical columns
# (country, date, country_code, Income Level), which DataFrame.corr() refuses to
# convert on pandas >= 2.0.
correlation_matrix = df.select_dtypes(include='number').corr()
plt.figure(figsize=(10,6))
# Styling of the numbers written inside each heatmap cell
annot_kws={'fontsize': 10,
           'color':"k",
           'alpha': 0.8,
           'verticalalignment':'center'}
#matrix = np.triu(correlation_matrix) + mask=matrix
sns.heatmap(correlation_matrix, annot=True, annot_kws = annot_kws, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Same heatmap for the rows without a country code (regional aggregates).
# Only numeric columns are correlated; the frame's text and categorical columns
# would make DataFrame.corr() raise on pandas >= 2.0.
correlation_matrix = sorted_missing_country_codes.select_dtypes(include='number').corr()
plt.figure(figsize=(10,6))
# Styling of the numbers written inside each heatmap cell
annot_kws={'fontsize': 10,
           'color':"k",
           'alpha': 0.8,
           'verticalalignment':'center'}
#matrix = np.triu(correlation_matrix) + mask=matrix
sns.heatmap(correlation_matrix, annot=True, annot_kws = annot_kws, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap for Regions')
plt.show()

### Histogram

In [None]:
# Distribution of total tobacco prevalence across all country-year rows
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='Prevalence of total tobacco use', bins=10, kde=True, ax=ax)
ax.set_title('Prevalence of total tobacco use distribution')
ax.set_xlabel('% of adult population')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Snapshot years present in the data, in chronological order (unique() alone
# returns them in row order, which depends on how df happens to be sorted)
years = sorted(df['date'].unique())

# One histogram panel per year. squeeze=False always returns a 2-D array of axes,
# so indexing also works when only a single year is present.
fig, axes = plt.subplots(nrows=1, ncols=len(years), figsize=(15, 5), squeeze=False)
axes = axes.ravel()

for i, year in enumerate(years):
    ax = axes[i]
    sns.histplot(data=df[df['date'] == year], x='Prevalence of total tobacco use', bins=8, kde=True, ax=ax)
    ax.set_title(f'Year {year}')
    ax.set_xlabel('Smokers % of adult population')
    # Fixed axis limits keep the panels directly comparable
    ax.set_xlim(0, 60)
    ax.set_ylabel('Count')
    ax.set_ylim(0, 50)

plt.tight_layout()
plt.show()

In [None]:
# Tobacco prevalence over time for a single country
country_name = 'Myanmar'
country_data = df.loc[df['country'] == country_name]

fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=country_data, x='date', y='Prevalence of total tobacco use', marker='o', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Smokers % of adult population')
ax.set_title(f'Trend of Smokers % of adult population over the Years for {country_name}')
ax.grid(True)
plt.show()

In [None]:
# All countries that survived the cleaning step
all_countries = df['country'].unique()

# Pick up to 10 countries with a fixed seed so a re-run draws the same lines;
# capping the sample size avoids a ValueError when fewer than 10 countries remain.
sample_size = min(10, len(all_countries))
random_countries = random.Random(42).sample(all_countries.tolist(), sample_size)
countries_to_plot = random_countries


plt.figure(figsize=(10, 6))

# Draw one labelled line per sampled country on the same axes
for country_name in countries_to_plot:
    country_data = df[df['country'] == country_name]
    sns.lineplot(data=country_data, x='date', y='Prevalence of total tobacco use', marker='o', label=country_name)

plt.xlabel('Year')
plt.ylabel('Smokers % of adult population')
plt.title('Trend of Smokers % of adult population over the Years')
plt.grid(True)
plt.legend()  # Add a legend to distinguish countries
plt.show()

### Pairplot

In [None]:
# Scatter-matrix of GDP against every tobacco and alcohol indicator
sns.pairplot(
    df.loc[:, [
        'GDP per capita',
        'Prevalence of total tobacco use',
        'Tobacco use by Females',
        'Tobacco use by males',
        'Prevalence of total alcohol use',
        'Alcohol use by females',
        'Alcohol use by males',
    ]],
    height=2,
)

plt.suptitle('Pairwise Relationships')
plt.show()

[FARIHA] : HYPOTHESIS part
## Hypotheses:
1. Richer countries consume more tobacco. (GDP and tobacco)
2. Countries where more people consume tobacco also have higher alcohol consumption.
3. Richer countries have more equal tobacco consumption between males and females.

# Richer countries consume more tobacco. (GDP and tobacco)

In [None]:
# Raw line through every row (no aggregation, no median) - the result is noisy
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(data=df, x='GDP per capita', y='Prevalence of total tobacco use', errorbar=None, ax=ax)
ax.set_xlabel('GDP per capita')
ax.set_ylabel('Prevalence of total tobacco use')
ax.set_title('Trend of GDP and Tobbaco User')
plt.show()

In [None]:
# Every country-year row as one point: GDP per capita against tobacco prevalence
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='GDP per capita', y='Prevalence of total tobacco use', ax=ax)
ax.set_xlabel('GDP per capita')
ax.set_ylabel('Prevalence of total tobacco use')
ax.set_title('Trend of GDP and Tobbaco User')
plt.show()

In [None]:
fig, axes = plt.subplots(1, len(years), figsize=(15, 4))

# One GDP-vs-tobacco scatter panel per snapshot year
for ax, year in zip(axes, years):
    data_year = df.loc[df['date'] == year]
    sns.scatterplot(data=data_year, x='GDP per capita', y='Prevalence of total tobacco use', ax=ax)
    ax.set(
        xlabel='GDP per capita',
        ylabel='Prevalence of total tobacco use',
        title=f'Trend for Year {year}',
    )

# Prevent neighbouring panels from overlapping
plt.tight_layout()
plt.show()

In [None]:
# Yearly medians across countries: GDP per capita and total tobacco prevalence
median_gdp_per_capita = df.groupby('date')['GDP per capita'].agg('median')
median_tobacco_consumption = df.groupby('date')['Prevalence of total tobacco use'].agg('median')

# One point per year: median GDP on x, median tobacco prevalence on y
sns.lineplot(x=median_gdp_per_capita, y=median_tobacco_consumption).set_title('GDP and Tobacco Use Relation')

In [None]:
# Create a scatter plot
plt.scatter(x=median_tobacco_consumption, y=median_gdp_per_capita)
# Label
plt.xlabel('Tobacco')
plt.ylabel('GDP')
plt.title('GDP and Tobacco Use Relation')
plt.show()
#trend is not easy to define

In [None]:
# Pearson correlation between the yearly median series (one value per snapshot year)
cor1 = median_tobacco_consumption.corr(median_gdp_per_capita, method='pearson')
cor1

In [None]:
# GDP vs tobacco prevalence for 2015 only.
# 'date' holds year strings, so the filter must compare against '2015';
# comparing with the integer 2015 matches no rows and leaves the plot empty.
plt.figure(figsize=(10, 6))
da = df[df['date'] == '2015']
plt.scatter(data=da, x='GDP per capita', y='Prevalence of total tobacco use')
plt.xlabel('GDP per capita')
plt.ylabel('Prevalence of total tobacco use')
plt.title('2015 GDP and Tobbaco User')
plt.show()

They have a high negative correlation: the higher the GDP, the lower the tobacco consumption. This is likely because people with a higher level of education and income are more aware of the health risks of tobacco. In addition, richer countries might have stronger regulation, such as tobacco taxes and restrictions on smoking in public places. Do we need to prove this?

# Countries with more people consuming tobacco also have more people drinking more alcohol.


In [None]:
# Yearly medians across countries: total alcohol use and total tobacco prevalence
median_alcohol = df.groupby('date')['Prevalence of total alcohol use'].agg('median')
median_tobacco_consumption = df.groupby('date')['Prevalence of total tobacco use'].agg('median')

# One point per year: median alcohol use on x, median tobacco prevalence on y
sns.lineplot(x=median_alcohol, y=median_tobacco_consumption).set_title('Alcohol and Tobacco Use Relation')

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))
# Fetch the same indicators for the East Asia & Pacific aggregate only.
# NOTE(review): 'EAP' may be the World Bank aggregate that excludes high-income
# economies rather than the whole region - confirm against wbdata's country list.
dfeap = wbdata.get_dataframe(indicators, country ='EAP')
sns.lineplot(
    data=dfeap,
    x='Prevalence of total alcohol use',
    y='Prevalence of total tobacco use',
    errorbar=None,
    ax=ax,
)
ax.set_xlabel('Alcohol use')
ax.set_ylabel('Tobacco_use')
ax.set_title('Alcohol and Tobbaco User in East Asia & Pacific')
plt.show()

In [None]:
# Every country-year row as one point: alcohol use against tobacco prevalence
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='Prevalence of total alcohol use', y='Prevalence of total tobacco use', ax=ax)
ax.set_xlabel('Prevalence of total alcohol use')
ax.set_ylabel('Prevalence of total tobacco use')
ax.set_title('Trend of Alcohol and Tobbaco User')
plt.show()

In [None]:
fig, axes = plt.subplots(1, len(years), figsize=(15, 4))

# One alcohol-vs-tobacco scatter panel per snapshot year
for ax, year in zip(axes, years):
    data_year = df.loc[df['date'] == year]
    sns.scatterplot(data=data_year, x='Prevalence of total alcohol use', y='Prevalence of total tobacco use', ax=ax)
    ax.set(
        xlabel='Prevalence of total alcohol use',
        ylabel='Prevalence of total tobacco use',
        title=f'Trend for Year {year}',
    )

# Prevent neighbouring panels from overlapping
plt.tight_layout()
plt.show()

Conclusion: A loose positive correlation between prevalence of total tobacco use and alcohol use.

# Richer countries have more equal tobacco consumption between males and females.


In [None]:
# Split rows into the richest and poorest quartile of GDP per capita
column_name = 'GDP per capita'
quartiles = df[column_name].quantile([0.25, 0.5, 0.75])
# Top quartile: at or above the 75th percentile
high_income = df.loc[df[column_name] >= quartiles[0.75]]
#middle_income != high_income & low_income
# Bottom quartile: at or BELOW the 25th percentile (">=" would select the top 75% instead)
low_income = df.loc[df[column_name] <= quartiles[0.25]]

In [None]:
# Calculate the mean and standard deviation of female and male tobacco use in high income countries
hftobacco_mean = high_income['Tobacco use by Females'].mean()
hftobacco_std = high_income['Tobacco use by Females'].std()
hmtobacco_mean = high_income['Tobacco use by males'].mean()
hmtobacco_std = high_income['Tobacco use by males'].mean()

# Calculate the mean and standard deviation of female and male tobacco use in low income countries
lftobacco_mean = low_income['Tobacco use by Females'].mean()
lftobacco_std = low_income['Tobacco use by Females'].std()
lmtobacco_mean = low_income['Tobacco use by males'].mean()
lmtobacco_std = low_income['Tobacco use by males'].std()

In [None]:
# Report the male and female means for each income group, high income first
for heading, male_mean, female_mean in (
    ("In high income countries: ", hmtobacco_mean, hftobacco_mean),
    ("In low income countries", lmtobacco_mean, lftobacco_mean),
):
    print(heading)
    print("Mean of tobacco use for males:", male_mean)
    print("Mean of tobacco use for females:", female_mean)

In [None]:
# Size of the male/female gap in mean tobacco use for each income group
absolute_difference_low = abs(lmtobacco_mean - lftobacco_mean)
absolute_difference_high = abs(hmtobacco_mean - hftobacco_mean)

print("absolute difference in low income countries:", absolute_difference_low)
print("absolute difference in high income countries:", absolute_difference_high)


In [None]:
# Female plus male prevalence per row (missing if either side is missing)
df['total_tobacco_consumption'] = df['Tobacco use by Females'].add(df['Tobacco use by males'])

In [None]:
# Female and male tobacco use in the same chart: one stacked bar per row,
# positioned at that row's GDP per capita.
da = df.sort_values(by='GDP per capita', ascending=False)
female_use = da['Tobacco use by Females']
male_use = da['Tobacco use by males']

plt.figure(figsize=(10, 6))
# Each gender is its own labelled series, so the colours carry meaning and the
# legend has entries to show.
plt.bar(da['GDP per capita'], female_use, color='red', label='Females')
# Males are stacked on top of females; a missing female value counts as 0 for the
# base so the male bar is still drawn.
plt.bar(da['GDP per capita'], male_use, bottom=female_use.fillna(0), color='blue', label='Males')

plt.xlabel('GDP per capita')
plt.ylabel('Total Tobacco Consumption')
plt.title('Total Tobacco Consumption by GDP and Gender')
plt.legend(loc='upper left')

# Show the plot
plt.show()

In [None]:
# Side-by-side distributions of male (left) and female (right) tobacco prevalence
fig, (ax_males, ax_females) = plt.subplots(1, 2, figsize=(10, 5))

sns.histplot(data=df, x='Tobacco use by males', bins=10, kde=True, ax=ax_males)
ax_males.set_title('Prevalence of tobacco use by males distribution')
ax_males.set_xlabel('% of males population')
ax_males.set_ylabel('Count')

sns.histplot(data=df, x='Tobacco use by Females', bins=10, kde=True, color='orange', ax=ax_females)
ax_females.set_title('Prevalence of tobacco use by females distribution')
ax_females.set_xlabel('% of females population')
ax_females.set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Gender gap per row: male prevalence minus female prevalence (negative when women use more)
df['Difference in tobacco use by gender (males - females)'] = df['Tobacco use by males'].sub(df['Tobacco use by Females'])

In [None]:
# Display the frame, now including the derived total and gender-difference columns.
df

In [None]:
# Every country-year row as one point: GDP per capita against the male-female gap
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='GDP per capita', y='Difference in tobacco use by gender (males - females)', ax=ax)
ax.set_xlabel('GDP per capita')
ax.set_ylabel('Difference in tobacco use by gender')
ax.set_title('Trend of GDP per capita and Tobbaco Use by Gender')
plt.show()

In [None]:
fig, axes = plt.subplots(1, len(years), figsize=(20, 4))

# One GDP-vs-gender-gap scatter panel per snapshot year
for ax, year in zip(axes, years):
    data_year = df.loc[df['date'] == year]
    sns.scatterplot(data=data_year, x='GDP per capita', y='Difference in tobacco use by gender (males - females)', ax=ax)
    ax.set(
        xlabel='GDP per capita',
        ylabel='Difference in tobacco use by gender',
        title=f'Trend for Year {year}',
    )

# Prevent neighbouring panels from overlapping
plt.tight_layout()
plt.show()

In [None]:
# Restrict to rows labelled 'High' in the Income Level column
countries_income = df.loc[df['Income Level'] == 'High']

fig, axes = plt.subplots(1, len(years), figsize=(20, 4))

# One GDP-vs-gender-gap scatter panel per snapshot year, high income rows only
for ax, year in zip(axes, years):
    data_year = countries_income.loc[countries_income['date'] == year]
    sns.scatterplot(data=data_year, x='GDP per capita', y='Difference in tobacco use by gender (males - females)', ax=ax)
    ax.set(
        xlabel='GDP per capita',
        ylabel='Difference in tobacco use by gender',
        title=f'Trend for Year {year}',
    )

# Prevent neighbouring panels from overlapping
plt.tight_layout()
plt.show()

In [None]:
# Rows where female prevalence exceeds male prevalence (negative male-minus-female gap)
higher_female_tobacco_use = df.loc[df['Difference in tobacco use by gender (males - females)'].lt(0)]
higher_female_tobacco_use

In [None]:
# Male-minus-female tobacco prevalence for the rows without a country code
# (the regional aggregates listed in country_names)
sorted_missing_country_codes['Difference in tobacco use by gender (males - females)'] = sorted_missing_country_codes['Tobacco use by males'] - sorted_missing_country_codes['Tobacco use by Females']
plt.figure(figsize=(11, 7))

# Draw one labelled line per region on the same axes
for country_name in country_names:
    country_data = sorted_missing_country_codes[sorted_missing_country_codes['country'] == country_name]
    sns.lineplot(data=country_data, x='date', y='Difference in tobacco use by gender (males - females)', marker='o', label=country_name)

plt.xlabel('Year')
# The y axis is the gender gap, not overall smoking prevalence, so label it as such
plt.ylabel('Difference in tobacco use by gender (males - females)')
plt.title('Trend of gender difference in tobacco use over the Years')
plt.grid(True)
plt.legend()  # Add a legend to distinguish regions
plt.show()