In [1]:
#!pip install selenium beautifulsoup4
#!pip install pycountry-convert
#!pip install Pandoc

In [2]:
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import re
import pandas as pd
import numpy as np
import pycountry_convert as pc

# UN-style country names mapped to the spelling that is handed to
# pycountry_convert. Keys are matched against the raw name first and then
# against the name with any parenthesised qualifier removed.
_SPECIAL_CASES = {
    "Côte d'Ivoire": "Ivory Coast",
    "Democratic People's Republic of Korea": "North Korea",
    "Republic of Korea": "South Korea",
    "Russian Federation": "Russia",
    "Syrian Arab Republic": "Syria",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "United Republic of Tanzania": "Tanzania",
    "United States of America": "United States",
    "Viet Nam": "Vietnam",
    "Iran (Islamic Republic of)": "Iran",
    "Bolivia (Plurinational State of)": "Bolivia",
    "Micronesia (Federated States of)": "Micronesia",
    "Lao People's Democratic Republic": "Laos",
    "Moldova, Republic of": "Moldova",
    "Palestine, State of": "Palestine",
    "Taiwan, Province of China": "Taiwan",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "China, Hong Kong Special Administrative Region": "Hong Kong",
    "China, Macao Special Administrative Region": "Macao",
    "Sint Maarten (Dutch part)": "Sint Maarten",
    "Czechia": "Czech Republic",
    "Swaziland": "Eswatini",
    "Myanmar": "Burma",
    "Cabo Verde": "Cape Verde",
    "Timor-Leste": "East Timor",
    "Türkiye": "Turkey",
    "Kosovo": "Kosovo"
}

# Continent codes returned by pycountry_convert mapped to display names.
_CONTINENT_NAMES = {
    "AF": "Africa",
    "AS": "Asia",
    "EU": "Europe",
    "NA": "North America",
    "SA": "South America",
    "OC": "Oceania",
    "AN": "Antarctica"
}


def get_continent(country_name):
    """Return the continent name for a country name, or "Unknown".

    Parameters
    ----------
    country_name : str
        Country name, possibly in the long UN form (e.g. "Russian Federation")
        or carrying a parenthesised qualifier (e.g. "Iran (Islamic Republic of)").
        Any non-string value (None, NaN from a DataFrame column, ...) yields
        "Unknown" instead of raising.

    Returns
    -------
    str
        One of the values of ``_CONTINENT_NAMES``, or "Unknown" when the name
        cannot be resolved.
    """
    # Missing values reach this function when it is applied to a DataFrame
    # column; they cannot be resolved, so answer like any other unknown name.
    if not isinstance(country_name, str):
        return "Unknown"

    # Remove content within parentheses and strip any extra whitespace
    stripped_name = re.sub(r"\(.*?\)", "", country_name).strip()

    # Prefer an exact special-case match on the raw name (this is what makes
    # the keys that contain parentheses reachable), then on the stripped name.
    standardized_name = _SPECIAL_CASES.get(
        country_name, _SPECIAL_CASES.get(stripped_name, stripped_name)
    )

    try:
        # name -> alpha-2 code -> continent code -> continent name
        country_code = pc.country_name_to_country_alpha2(standardized_name)
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return _CONTINENT_NAMES[continent_code]
    except Exception:
        # Any lookup failure is deliberately reported as "Unknown".
        return "Unknown"


def normalize_name(name):
    """Build a lowercase, hyphen-joined slug from a country display name.

    Steps, in order: drop any parenthesised text, lowercase, delete every
    character that is not a word character, whitespace or hyphen, drop the
    stop words "and", "of" and "the", and join the remaining words with "-".

    Example: "Antigua and Barbuda" -> "antigua-barbuda".
    """
    stop_words = {"and", "of", "the"}

    # Parenthesised qualifiers go first so their words never reach the slug.
    without_parens = re.sub(r"\(.*?\)", "", name)

    # Lowercase, then keep only word characters, whitespace and hyphens.
    cleaned = re.sub(r"[^\w\s-]", "", without_parens.lower())

    # Whitespace-split, filter stop words, hyphenate.
    return "-".join(token for token in cleaned.split() if token not in stop_words)



def _find_spec_value(soup, key_text):
    """Return the spec-value <div> that follows the spec-key <div> whose text is ``key_text``.

    Returns None when either the key <div> or its value sibling is absent.
    """
    key_div = soup.find("div", class_="projects-project-spec-key", string=key_text)
    if key_div is None:
        return None
    return key_div.find_next_sibling("div", class_="projects-project-spec-value")


def parse_program_data_block(html_str):
    """Extract spending figures from one program block of HTML.

    Parameters
    ----------
    html_str : str
        HTML fragment containing "projects-project-spec-key" /
        "projects-project-spec-value" <div> pairs.

    Returns
    -------
    dict
        "Total Spending", "UNFPA", "GOV", "NGO", "UN" as integer dollar
        amounts and "Core Resources", "Non-core Resources" as fractions
        (percentage / 100). Every key defaults to zero when its section is
        missing or cannot be parsed.
    """
    soup = BeautifulSoup(html_str, "html.parser")

    # Every expected key is present from the start so callers always get the
    # same set of columns.
    data = {
        "Total Spending": 0,
        "UNFPA": 0,
        "GOV": 0,
        "NGO": 0,
        "UN": 0,
        "Core Resources": 0.0,
        "Non-core Resources": 0.0
    }

    # 1. Total Spending: "$1,234" -> 1234
    total_div = _find_spec_value(soup, "Total Spending:")
    if total_div is not None:
        amount_text = total_div.get_text(strip=True).replace("$", "").replace(",", "")
        try:
            data["Total Spending"] = int(float(amount_text))
        except ValueError:
            # Unparseable amount: keep the default of 0.
            pass

    # 2. Implemented by: a dollar amount following each organisation label
    impl_div = _find_spec_value(soup, "Implemented by:")
    if impl_div is not None:
        impl_text = impl_div.get_text()
        for org in ("UNFPA", "GOV", "NGO", "UN"):
            found = re.search(rf"{org}\s*\$([\d,]+)", impl_text)
            if found:
                data[org] = int(found.group(1).replace(",", ""))

    # 3. Funded by: <span> elements read as (label, "(NN%)") pairs
    fund_div = _find_spec_value(soup, "Funded by:")
    if fund_div is not None:
        spans = fund_div.find_all("span")
        strip_chars = str.maketrans("", "", "()%")
        # Pairing even/odd positions; an unpaired trailing span is ignored.
        for label_span, percent_span in zip(spans[0::2], spans[1::2]):
            label = label_span.get_text(strip=True)
            percent_text = percent_span.get_text(strip=True).translate(strip_chars)
            try:
                share = float(percent_text) / 100.0
            except ValueError:
                continue
            if label in data:
                data[label] = share

    return data

def scrape_unfpa(year=2023, country='cameroon'):
    """Scrape one country's program spending for one year from the UNFPA transparency portal.

    Opens the country page in headless Chrome, dismisses the cookie popup if
    present, selects ``year`` in the "edit-year" dropdown, submits the form
    and parses the rendered page.

    Parameters
    ----------
    year : int or str
        Year to select; converted to ``str`` and stored that way in the result.
    country : str
        URL slug appended to ".../transparency-portal/unfpa-".

    Returns
    -------
    list of dict
        One dict per "program-child-wrapper" block: the fields produced by
        ``parse_program_data_block`` for the block's first
        "program-data-wrapper" plus "Program", "Country" and "Year".

    Raises
    ------
    Selenium exceptions when the year selector or submit button is missing,
    ``AttributeError`` when the page has no wrapper for ``year`` or a block has
    no title, and ``IndexError`` when a block has no "program-data-wrapper".
    Callers use these failures as the "no data" signal.
    """
    url = f"https://www.unfpa.org/data/transparency-portal/unfpa-{country}"
    year = str(year)

    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        chrome_options.add_argument(flag)
    driver = webdriver.Chrome(options=chrome_options)

    # try/finally guarantees the browser process is shut down even when the
    # page lacks the expected controls and a Selenium call raises.
    try:
        driver.get(url)
        time.sleep(1)  # Wait for initial page load

        # Close cookie popup if present
        try:
            cookie_popup = driver.find_element(By.ID, "cookies-popup")
            cookie_popup.find_element(By.CLASS_NAME, "popup-close").click()
            time.sleep(1)  # Let the DOM update
        except NoSuchElementException:
            pass

        # Select the year
        Select(driver.find_element(By.ID, "edit-year")).select_by_value(year)

        # Click submit, falling back to a JavaScript click when another
        # element intercepts the native one.
        submit_btn = driver.find_element(By.ID, "edit-submit")
        try:
            submit_btn.click()
        except ElementClickInterceptedException:
            driver.execute_script("arguments[0].click();", submit_btn)

        time.sleep(1)  # Wait for JS-rendered content to load
        page_source = driver.page_source
    finally:
        driver.quit()

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(page_source, "html.parser")
    year_wrapper = soup.find("div", class_=f"program-year-wrapper program-wrapper-{year}")

    programs = []
    for program_block in year_wrapper.find_all("div", class_="program-child-wrapper"):
        title = program_block.find("div", class_="program-parent-title").get_text(strip=True)
        data_wrappers = program_block.find_all("div", class_="program-data-wrapper")
        # get only total amount (not for sub-programs)
        data_dic = parse_program_data_block(str(data_wrappers[0]))
        data_dic['Program'] = title
        data_dic['Country'] = country
        data_dic['Year'] = year
        programs.append(data_dic)

    return programs

def scrape_unfpa_all(countries, year):
    """Scrape every country in ``countries`` for ``year`` and stack the results.

    Parameters
    ----------
    countries : iterable of str
        Country URL slugs passed one by one to ``scrape_unfpa``.
    year : int or str
        Year passed to ``scrape_unfpa``.

    Returns
    -------
    pandas.DataFrame
        Rows of all countries that could be scraped, with a fresh 0..n-1
        index. An empty DataFrame when ``countries`` is empty or nothing could
        be scraped (``pd.concat`` would raise on an empty list).
    """
    frames = []
    for country in countries:
        try:
            frames.append(pd.DataFrame(scrape_unfpa(year=year, country=country)))
        except Exception as exc:
            # Best effort per country. `Exception` (not a bare except) so that
            # Ctrl-C / kernel interrupt still stops a long scrape.
            print(f"No Data Found for: {country} ({type(exc).__name__})")

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames).reset_index(drop=True)
 
    
# Country display names. Each one is turned into a URL slug with
# normalize_name() to build the `countries` mapping right after this list.
# NOTE(review): presumably copied from a UN/UNFPA country listing — confirm the source.
# NOTE(review): 'State of Palestine1' ends in a digit that looks like a stray
# footnote marker; normalize_name() keeps digits, so its slug becomes
# 'state-palestine1'. Confirm against the portal URL (and any joins on the
# slug) before changing it.
country_names = [
        'Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bhutan', 'Bolivia (Plurinational State of)',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Central African Republic', 'Chad', 'Chile', 'China',
       'China, Hong Kong Special Administrative Region',
       'China, Macao Special Administrative Region', 'Colombia',
       'Comoros', 'Congo', 'Costa Rica', "Côte d'Ivoire", 'Croatia',
       'Cuba', 'Curaçao', 'Cyprus', 'Czechia',
       "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt',
       'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France',
       'French Guiana', 'French Polynesia', 'Gabon', 'Gambia', 'Georgia',
       'Germany', 'Ghana', 'Greece', 'Grenada', 'Guadeloupe', 'Guam',
       'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti',
       'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia',
       'Iran (Islamic Republic of)', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati',
       'Kuwait', 'Kyrgyzstan', "Lao People's Democratic Republic",
       'Latvia', 'Lebanon', 'Lesotho', 'Liberia', 'Libya', 'Lithuania',
       'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', 'Maldives',
       'Mali', 'Malta', 'Martinique', 'Mauritania', 'Mauritius', 'Mexico',
       'Micronesia (Federated States of)', 'Mongolia', 'Montenegro',
       'Morocco', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal',
       'Netherlands', 'New Caledonia', 'New Zealand', 'Nicaragua',
       'Niger', 'Nigeria', 'North Macedonia', 'Norway', 'Oman',
       'Pakistan', 'Panama', 'Papua New Guinea', 'Paraguay', 'Peru',
       'Philippines', 'Poland', 'Portugal', 'Puerto Rico', 'Qatar',
       'Republic of Korea', 'Republic of Moldova', 'Réunion', 'Romania',
       'Russian Federation', 'Rwanda', 'Saint Kitts and Nevis',
       'Saint Lucia', 'Saint Vincent and the Grenadines', 'Samoa',
       'San Marino', 'Sao Tome and Principe', 'Saudi Arabia', 'Senegal',
       'Serbia', 'Seychelles', 'Sierra Leone', 'Singapore',
       'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia',
       'Solomon Islands', 'Somalia', 'South Africa', 'South Sudan',
       'Spain', 'Sri Lanka', 'State of Palestine1', 'Sudan', 'Suriname',
       'Sweden', 'Switzerland', 'Syrian Arab Republic', 'Tajikistan',
       'Thailand', 'Timor-Leste', 'Togo', 'Tonga', 'Trinidad and Tobago',
       'Tunisia', 'Türkiye', 'Turkmenistan', 'Turks and Caicos Islands',
       'Tuvalu', 'Uganda', 'Ukraine', 'United Arab Emirates',
       'United Kingdom of Great Britain and Northern Ireland',
       'United Republic of Tanzania', 'United States of America',
       'United States Virgin Islands', 'Uruguay', 'Uzbekistan', 'Vanuatu',
       'Venezuela (Bolivarian Republic of)', 'Viet Nam', 'Western Sahara',
       'Yemen', 'Zambia', 'Zimbabwe']
# URL slug -> display name, used to restore readable names after scraping.
countries = {normalize_name(c): c for c in country_names}

# Scraping drives a headless browser once per country and year, so it is
# opt-in; by default the previously saved combined CSV is loaded instead.
run_code = False

if run_code:
    yearly_frames = []
    for year in range(2015, 2024):
        try:
            df_year = scrape_unfpa_all(countries.keys(), year)
            if df_year.empty:
                print(f"No data scraped for {year}")
                continue
            df_year['Country_Name'] = df_year.Country.map(countries)
            df_year.to_csv(f'../data/unfpa_fundings_{year}.csv', index=False)
        except Exception as exc:
            # Keep going with the remaining years, but say what went wrong
            # instead of dropping the year silently.
            print(f"Skipping {year}: {type(exc).__name__}: {exc}")
            continue
        yearly_frames.append(df_year)

    if not yearly_frames:
        raise RuntimeError("No UNFPA funding data could be scraped for any year")

    # `df` holds every scraped year, matching the shape of the combined file
    # read in the other branch (previously it was just the last year scraped).
    # NOTE(review): the combined CSV below is not written by this cell —
    # presumably assembled from the per-year files; confirm.
    df = pd.concat(yearly_frames).reset_index(drop=True)
else:
    df = pd.read_csv('../data/unfpa_fundings_2015_2023.csv')

In [3]:
# Spot-check the slug produced for a name containing an accent and an apostrophe.
normalize_name("Côte d'Ivoire")

'côte-divoire'

In [30]:
# Share of total spending implemented by each partner type.
# A zero total is treated as missing so the shares come out as NaN rather
# than inf (parse_program_data_block falls back to 0 when a total is absent).
total_spending = df['Total Spending'].replace(0, np.nan)
for partner in ['UNFPA', 'GOV', 'NGO', 'UN']:
    df[f'%{partner}'] = df[partner] / total_spending

# NOTE(review): `get_continent` is defined twice in this notebook; which
# definition runs here depends on cell execution order.
df['Continent'] = df['Country_Name'].map(get_continent)
df.to_excel('../data/unfpa_fundings_2015_2023.xlsx', index=False)

In [24]:
# NOTE(review): `dfi` is not assigned in any cell visible in this notebook, so
# this cell fails on a fresh kernel — define it (or load it) before this point.
dfi.head()

Unnamed: 0,Total Spending,UNFPA,GOV,NGO,UN,Core Resources,Non-core Resources,Program,Country,Year,Country_Name
0,29349486,13881713,0,15324658,143115,0.04,0.96,Ending gender-based violence and harmful pract...,afghanistan,2023,Afghanistan
1,66628129,29668983,0,35998398,960749,0.05,0.95,Ending preventable maternal deaths,afghanistan,2023,Afghanistan
2,4821761,2728702,0,2093059,0,0.0,1.0,Ending the unmet need for family planning,afghanistan,2023,Afghanistan
3,926933,515695,13894,397344,0,0.44,0.56,Ending gender-based violence and harmful pract...,albania,2023,Albania
4,11295,2259,169,8866,0,0.82,0.18,Ending preventable maternal deaths,albania,2023,Albania


In [28]:
# Share of total spending implemented by each partner type.
# A zero total is treated as missing so the shares come out as NaN rather
# than inf.
# NOTE(review): `dfi` is not assigned in any cell visible in this notebook.
dfi_total_spending = dfi['Total Spending'].replace(0, np.nan)
for partner in ['UNFPA', 'GOV', 'NGO', 'UN']:
    dfi[f'%{partner}'] = dfi[partner] / dfi_total_spending
dfi['Continent'] = dfi['Country_Name'].map(get_continent)

dfi.to_excel('../data/unfpa_temp.xlsx', index=False)
dfi.head()

Unnamed: 0,Total Spending,UNFPA,GOV,NGO,UN,Core Resources,Non-core Resources,Program,Country,Year,Country_Name,%UNFPA,%GOV,%NGO,%UN,Continent
0,29349486,13881713,0,15324658,143115,0.04,0.96,Ending gender-based violence and harmful pract...,afghanistan,2023,Afghanistan,0.47298,0.0,0.522144,0.004876,Asia
1,66628129,29668983,0,35998398,960749,0.05,0.95,Ending preventable maternal deaths,afghanistan,2023,Afghanistan,0.445292,0.0,0.540288,0.01442,Asia
2,4821761,2728702,0,2093059,0,0.0,1.0,Ending the unmet need for family planning,afghanistan,2023,Afghanistan,0.565914,0.0,0.434086,0.0,Asia
3,926933,515695,13894,397344,0,0.44,0.56,Ending gender-based violence and harmful pract...,albania,2023,Albania,0.556345,0.014989,0.428665,0.0,Europe
4,11295,2259,169,8866,0,0.82,0.18,Ending preventable maternal deaths,albania,2023,Albania,0.2,0.014962,0.784949,0.0,Europe


## Health Outcome

In [189]:
# Workbook downloaded straight from unfpa.org; sheet 'ICPD_Health_2024'.
path = 'https://www.unfpa.org/modules/custom/unfpa_global_sowp_portal/data-file/SWOP-Data-2024.xlsx'

# Short column names, applied positionally after the two unnamed columns are dropped.
columns = [
    'Country_Name',
    'MMR per 100,000',
    'MMR_LL', 'MMR_UL',
    'Births_Attended',
    'New_HIV',
    'Any_Con_All',
    'Any_Con_married',
    'Modern_Con_All',
    'Modern_Con_married',
    'Unmet_Need_Fam_plan_All',
    'Unmet_Need_Fam_plan_married',
    'PM_All',
    'Laws',
    'UHC_Index',
]

# NOTE(review): from here on `df` holds the health outcomes, replacing the
# funding table that earlier cells stored under the same name.
df = pd.read_excel(path, skiprows=6, sheet_name='ICPD_Health_2024')
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 16'])
df.columns = columns

# '-' marks a missing value in the sheet. Only the first 204 rows are kept
# (presumably the country rows, with aggregates/notes below — TODO confirm).
# .copy() makes the result an independent frame, so adding a column below
# does not trigger pandas' SettingWithCopyWarning on the head() slice.
df = df.replace('-', np.nan).head(204).copy()
df['Country'] = df['Country_Name'].map(normalize_name)

# NOTE(review): rebinds `countries` (earlier a slug -> name dict) to a
# name -> continent dict.
countries = {c: get_continent(c) for c in df.Country_Name.values}

df.to_csv('../data/unfpa_outcomes.csv', index=False)

In [174]:
# Preview the health-outcomes table with its slug column.
df.head()

Unnamed: 0,Country_Name,"MMR per 100,000",MMR_LL,MMR_UL,Births_Attended,New_HIV,Any_Con_All,Any_Con_married,Modern_Con_All,Modern_Con_married,Unmet_Need_Fam_plan_All,Unmet_Need_Fam_plan_married,PM_All,Laws,UHC_Index,Country
0,Afghanistan,620,406,1050,62,0.03,21,29,19,26,17,23,50,56.0,41,afghanistan
1,Albania,8,4,16,100,0.02,33,45,6,6,12,16,12,79.0,64,albania
2,Algeria,78,41,164,99,0.05,35,60,30,52,7,11,72,,74,algeria
3,Angola,222,148,330,50,0.44,17,18,16,17,26,35,37,62.0,37,angola
4,Antigua and Barbuda,21,11,36,99,,42,63,40,61,10,13,77,,76,antigua-barbuda


In [188]:
# Country lookup table: one row per key of `countries` (name -> continent).
dff = pd.DataFrame(list(countries.items()), columns=['Country_Name', 'Continent'])

# Derive the slug from this table's own names, so the column always lines up
# with the rows even if `df` contains a repeated country name.
dff['Country'] = dff['Country_Name'].map(normalize_name)
dff.to_csv('../data/unfpa_countries.csv', index=False)

In [186]:
# Preview the country / continent / slug lookup table.
dff.head()

Unnamed: 0,Country_Name,Continent,Country
0,Afghanistan,Asia,afghanistan
1,Albania,Europe,albania
2,Algeria,Africa,algeria
3,Angola,Africa,angola
4,Antigua and Barbuda,North America,antigua-barbuda


In [11]:
# Load the UNICEF global MMR workbook (2000-2024 per the file name) from the local data folder.
temp = pd.read_excel('../data/UNICEF_GLOBAL_MMR_2000-2024.xlsx')

In [13]:
import pycountry
import pycountry_convert as pc
import pandas as pd

# Sample list of country names.
# NOTE(review): no executable line in this cell reads this list (its only use
# is in the commented-out lines at the end of the cell), and assigning it
# rebinds `countries`, which earlier cells use as a dict.
countries = [
    "Afghanistan", "Albania", "Algeria", "Andorra", "Angola", "Antigua and Barbuda",
    "Argentina", "Armenia", "Australia", "Austria", "Azerbaijan", "Bahamas", "Bahrain",
    "Bangladesh", "Barbados", "Belarus", "Belgium", "Belize", "Benin", "Bhutan",
    "Bolivia (Plurinational State of)", "Bosnia and Herzegovina", "Botswana", "Brazil",
    "Brunei Darussalam", "Bulgaria", "Burkina Faso", "Burundi", "Cabo Verde", "Cambodia",
    "Cameroon", "Canada", "Central African Republic", "Chad", "Chile", "China",
    "Colombia", "Comoros", "Congo", "Cook Islands", "Costa Rica", "Côte d'Ivoire",
    "Croatia", "Cuba", "Cyprus", "Czechia", "Democratic People's Republic of Korea",
    "Democratic Republic of the Congo", "Denmark", "Djibouti", "Dominica",
    "Dominican Republic"
]

# Function to get continent
def get_continent(country_name):
    """Return the continent name for ``country_name``, or "Unknown".

    The name is resolved with ``pycountry.countries.lookup``; its alpha-2 code
    is then converted to a continent code and a continent name with
    ``pycountry_convert``. Any ordinary failure along the way (no matching
    country, no continent for the code, non-string input) yields "Unknown".

    NOTE(review): this shares its name with the ``get_continent`` defined near
    the top of the notebook. Once this cell runs it replaces that definition,
    so earlier cells behave differently depending on execution order —
    consider giving one of the two a distinct name.
    """
    try:
        # Try to find a matching country
        country = pycountry.countries.lookup(country_name)
        continent_code = pc.country_alpha2_to_continent_code(country.alpha_2)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed while this is applied to a long column.
        return "Unknown"

# Create DataFrame
#df = pd.DataFrame(countries, columns=["Country"])
#df["Continent"] = df["Country"].apply(get_continent)

In [17]:
# Add the continent, rename the source columns by name (not by position) and
# put the continent first. The column was previously written as 'Continet';
# it is spelled 'Continent' here, so update any consumer of the exported file
# that relied on the old spelling.
temp = (
    temp.assign(Continent=temp["Geographic area"].apply(get_continent))
    .rename(columns={"Geographic area": "Country", "TIME_PERIOD": "Year", "OBS_VALUE": "MMR"})
    [["Continent", "Country", "Year", "MMR"]]
)

In [18]:
# Write the reshaped table next to the source workbook, under a "-temp" file name.
temp.to_excel('../data/UNICEF_GLOBAL_MMR_2000-2024-temp.xlsx', index=False)

In [16]:
# NOTE(review): the output recorded for this cell still shows the source
# column names, i.e. it was produced before the rename cell above was run
# (execution count 16 vs 17); re-run top to bottom to refresh it.
temp.head()

Unnamed: 0,Geographic area,TIME_PERIOD,OBS_VALUE,Continent
0,Afghanistan,2000,1371.65,Asia
1,Afghanistan,2001,1310.95,Asia
2,Afghanistan,2002,1262.52,Asia
3,Afghanistan,2003,1201.38,Asia
4,Afghanistan,2004,1166.47,Asia
