In [None]:
import pandas as pd

In [None]:
# Input files. The paths are relative to the working directory; replace them
# with your actual file locations.
file1_path = "IHME-GBD_2021_DATA-5503ed84-1.csv"  # Risk factors
file2_path = "IHME-GBD_2021_DATA-6f8f7ceb-1.csv"  # Cause of death

file_air_pollution = "air-pollution.csv"
file_co2 = "co2 Emission Africa.csv"
file_haq = "IHME_GBD_2019_HAQ_1990_2019_DATA.CSV"

# Load every file exactly once.
df_risk = pd.read_csv(file1_path)
df_cause = pd.read_csv(file2_path)
df_air = pd.read_csv(file_air_pollution)
df_co2 = pd.read_csv(file_co2)
df_haq = pd.read_csv(file_haq)

# Second set of names for the air-pollution and CO2 data. They are independent
# copies of the frames loaded above, so the same two CSV files are not parsed
# twice and changes to one frame never leak into the other.
air_pollution_df = df_air.copy()
co2_emission_df = df_co2.copy()

In [None]:
# Dataset 1: Health Burden (Cause of Death Only)
# Start from the cause-of-death frame and leave out the columns that are not
# needed ('sex', 'age', 'upper', 'lower').
df_health_burden = df_cause.drop(
    labels=['sex', 'age', 'upper', 'lower'],
    axis='columns',
)

df_health_burden.head()


In [None]:
# Dataset 2: Risk Factor Burden
# Drop the 'sex', 'age', 'upper' and 'lower' columns from the risk-factor
# frame, then expose the 'rei' column under the name 'risk_factor'.
df_risk_factor = (
    df_risk
    .drop(labels=['sex', 'age', 'upper', 'lower'], axis='columns')
    .rename(columns={'rei': 'risk_factor'})
)

df_risk_factor.head()

In [None]:
# Dataset 3: Contextual Dataset
# Names of the African countries, in alphabetical order.
# NOTE(review): name-based filters only match exact spellings (e.g.
# 'Cabo Verde', 'United Republic of Tanzania', and the typographic apostrophe
# in 'Côte d’Ivoire') -- confirm these against the spellings used in the data.
african_countries = [
    # A-B
    'Algeria', 'Angola',
    'Benin', 'Botswana', 'Burkina Faso', 'Burundi',
    # C-D
    'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad',
    'Comoros', 'Congo', 'Côte d’Ivoire',
    'Democratic Republic of the Congo', 'Djibouti',
    # E-G
    'Egypt', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia',
    'Gabon', 'Gambia', 'Ghana', 'Guinea', 'Guinea-Bissau',
    # K-M
    'Kenya',
    'Lesotho', 'Liberia', 'Libya',
    'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius',
    'Morocco', 'Mozambique',
    # N-S
    'Namibia', 'Niger', 'Nigeria',
    'Rwanda',
    'Sao Tome and Principe', 'Senegal', 'Seychelles', 'Sierra Leone',
    'Somalia', 'South Africa', 'South Sudan', 'Sudan',
    # T-Z
    'Togo', 'Tunisia',
    'Uganda', 'United Republic of Tanzania',
    'Zambia', 'Zimbabwe',
]


In [None]:
# Healthcare Access and Quality (HAQ) data restricted to African countries.

# Row filter: African locations, the 'Age-standardized' age group, the two
# indicators of interest, and years from 2000 onwards. Only the columns needed
# for the pivot are kept.
df_haq_filtered = df_haq.loc[
    df_haq['location_name'].isin(african_countries)
    & (df_haq['age_group_name'] == 'Age-standardized')
    & df_haq['indicator_name'].isin(['Chronic respiratory diseases', 'HAQ Index'])
    & (df_haq['year_id'] >= 2000),
    ['location_name', 'year_id', 'indicator_name', 'val'],
]

# Reshape from long to wide: one row per (country, year) and one column per
# indicator, then give the columns their final names.
df_haq_pivot = (
    df_haq_filtered
    .pivot_table(
        index=['location_name', 'year_id'],
        columns='indicator_name',
        values='val',
        aggfunc='mean',  # average the values if a country/year/indicator repeats
    )
    # The pivot labels the column axis 'indicator_name'; clear that label so
    # the result looks like an ordinary flat table.
    .rename_axis(columns=None)
    .reset_index()
    .rename(columns={
        'location_name': 'Country',
        'year_id': 'Year',
        'Chronic respiratory diseases': 'Chronic_Respiratory(RSD)',
        'HAQ Index': 'HAQ_Index',
    })
)

# Preview the result; a bare last expression gets the notebook's rich table
# display instead of plain printed text.
df_haq_pivot.head()

In [None]:
# Columns of the final air-pollution table, in output order.
final_columns_order = [
    "Country",
    "Sub-Region",
    "Year",
    "Population",
    "GDP PER CAPITA (USD)",
    "Area (Km2)",
    "Population Density",
    "Total CO2 Emission excluding LUCF (Mt)",
    "Nitrogen Oxide",
    "Sulphur Dioxide",
    "Carbon Monoxide",
    "Organic Carbon",
    "NMVOCs",
    "Black Carbon",
    "Ammonia",
]

# Inner-join the CO2 and air-pollution frames on country and year, derive the
# population density (population divided by area), and keep only the columns
# listed in final_columns_order.
df_air_pol = (
    df_co2
    .merge(df_air, how="inner", on=["Country", "Year"])
    .assign(**{
        "Population Density": lambda merged: merged["Population"] / merged["Area (Km2)"],
    })
    .loc[:, final_columns_order]
)


df_air_pol.head()


In [None]:
# Attach the HAQ columns to the air-pollution table. The left join keeps every
# air-pollution row; rows with no matching country/year get NaN in the HAQ
# columns. Afterwards, any column whose name contains 'upper' or 'lower'
# (case-sensitive) is dropped.
df_socioeconomic = (
    df_air_pol
    .merge(df_haq_pivot, how='left', on=['Country', 'Year'])
    .loc[:, lambda merged: ~merged.columns.str.contains('upper|lower')]
)


df_socioeconomic.head()

In [None]:
# Write each final dataset to its own CSV file, without the row index.
for csv_name, dataset in (
    ("health_burden_dataset.csv", df_health_burden),
    ("risk_factor_dataset.csv", df_risk_factor),
    ("socioeconomic_dataset.csv", df_socioeconomic),
):
    dataset.to_csv(csv_name, index=False)