# In [20]:
import csv
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

#Code modified from actor/url assignment

# Comparison page from which the individual country links are collected.
my_url = (
    "https://www.cia.gov/the-world-factbook/field/population/country-comparison"
)

# Request headers sent with every HTTP call; carries a browser-style User-Agent.
hdr = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/110.0.0.0 Safari/537.36'
    ),
}

def get_urls_from_single_page(my_url):
    """Collect the country-page URLs linked from a comparison page.

    Args:
        my_url: Address of the page whose country links should be harvested.

    Returns:
        A list of absolute URLs, one for every ``<a>`` element that carries
        the ``text-button content-table-link`` class and an ``href``
        attribute, in document order.

    Raises:
        requests.RequestException: If the page cannot be downloaded within
            the timeout or the server answers with an error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection,
    # and raise_for_status() stops an error page from being parsed as if it
    # were the real listing (which would silently produce zero links).
    page = requests.get(my_url, headers=hdr, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    # href=True skips anchors that have no target; concatenating a missing
    # href would otherwise fail on None.
    countries = soup.find_all(
        'a', class_="text-button content-table-link", href=True
    )

    # urljoin gives the same result as prefixing the host for root-relative
    # hrefs, and also copes with hrefs that are already absolute.
    return [
        urljoin("https://cia.gov", country['href']) for country in countries
    ]

def _through_first_paren(text):
    """Return *text* cut off right after its first ')', or unchanged if none."""
    head, paren, _ = text.partition(')')
    # partition() leaves `paren` empty when there is no ')', so no stray
    # bracket is appended to values that never contained one.
    return head + paren


def _field_after_heading(soup, label, clean):
    """Extract the paragraph following the ``h3.mt30`` heading titled *label*.

    Returns ``"No Data Provided"`` when no such heading exists, ``""`` when
    the heading has no following ``<p>`` sibling, and otherwise ``clean``
    applied to the paragraph's stripped text.
    """
    heading = soup.find("h3", class_="mt30", string=label)
    if heading is None:
        return "No Data Provided"
    paragraph = heading.find_next_sibling("p")
    if paragraph is None:
        return ""
    return clean(paragraph.text.strip())


def scrape_one_country(url):
    """Scrape the headline economic and population figures from one page.

    Args:
        url: Address of the country page to download.

    Returns:
        A list of five strings, in this order: the page title
        (``h1.hero-title``, or ``""`` if absent), then the values found under
        the headings "GDP (official exchange rate)", "Real GDP per capita",
        "Population" and "Population growth rate". A value is
        ``"No Data Provided"`` when its heading is missing and ``""`` when
        the heading has no following paragraph.

    Raises:
        requests.RequestException: If the page cannot be downloaded within
            the timeout.
    """
    # Bound the wait so one unresponsive page cannot stall the whole run.
    page = requests.get(url, headers=hdr, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    title = soup.find("h1", class_="hero-title")
    country_details = [title.text.strip() if title is not None else ""]

    # These three values are kept up to and including the first closing
    # parenthesis of the paragraph text.
    for label in (
        "GDP (official exchange rate)",
        "Real GDP per capita",
        "Population",
    ):
        country_details.append(
            _field_after_heading(soup, label, _through_first_paren)
        )

    # The growth rate is kept as a fixed-width prefix of the paragraph text.
    country_details.append(
        _field_after_heading(
            soup, "Population growth rate", lambda text: text[:19]
        )
    )

    return country_details

# Build the list of country-page URLs from the comparison page at my_url.
# NOTE: this is a module-level statement, so the HTTP request made by
# get_urls_from_single_page() happens as soon as this line executes.
url_list = get_urls_from_single_page(my_url)

def write_csv(url_list):
    """Scrape every URL in *url_list* and save the results to ``ciainfo.csv``.

    The file is (re)created in the current working directory with a header
    row followed by one row per URL: the first five values returned by
    ``scrape_one_country`` plus the URL itself. The function sleeps for one
    second before each scrape.

    Args:
        url_list: Iterable of country-page URLs to scrape.
    """
    header_row = [
        'Country',
        'Real GDP',
        'Real GDP Per Capita',
        'Total Population',
        'Population Growth Rate',
        'url',
    ]

    with open('ciainfo.csv', 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header_row)

        for country_url in url_list:
            # Pause between requests so the server is not hit back-to-back.
            time.sleep(1)

            details = scrape_one_country(country_url)

            # Five scraped columns first, then the source URL as the last one.
            row = [details[position] for position in range(5)]
            row.append(country_url)
            writer.writerow(row)
    
# Run the scrape. url_list was already built once above (right after
# scrape_one_country is defined), so it is reused here instead of downloading
# the comparison page a second time.
write_csv(url_list)