# Udacity Data Engineering Capstone Project

## Extract public Gapminder data

### Web scraping for Gapminder data

In [85]:
import os
import pandas as pd
import numpy as np
import json
import re
import requests
from bs4 import BeautifulSoup

In [86]:
# GitHub directory page that lists the Gapminder systema_globalis CSV files
start_url = 'https://github.com/Gapminder/gapminder-offline/tree/development/ddf--gapminder--systema_globalis'

In [87]:
# Download the HTML from start_url.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page be parsed as if it were the file listing.
downloaded_html = requests.get(start_url, timeout=30)
downloaded_html.raise_for_status()

In [88]:
# Parse the HTML with BeautifulSoup and create a soup object.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed (and to avoid bs4's
# GuessedAtParserWarning).
soup = BeautifulSoup(downloaded_html.text, 'html.parser')

In [89]:
# Collect the href of every anchor tag that has an href attribute and non-empty text
links_with_href = [
    anchor['href']
    for anchor in soup.find_all('a', href=True)
    if anchor.text
]

In [120]:
# Prefix prepended to the last three path segments of each scraped link
# when create_file_href_dataframe builds the download URL.
url_prefix = 'https://raw.githubusercontent.com/Gapminder/gapminder-offline/'

In [121]:
def create_file_href_dataframe(links_with_href, url_prefix):
    """
    Populate dataframe with list of CSV files to download from github/Gapminder repository

    A link is kept only if it ends in ``.csv`` or ``.CSV``, has at least three
    ``/``-separated segments, and its file name has at least three
    ``--``-separated parts. The third part of the file name is used as the
    indicator; the download URL is ``url_prefix`` followed by the last three
    path segments of the link.

    args:
        links_with_href: iterable of href strings scraped from the page
        url_prefix: string prepended to the last three path segments

    returns: pandas dataframe with columns 'indicator' and 'file_href'
    """
    # Raw string: `\.` must reach the regex engine as-is; in a normal string
    # literal it is an invalid escape sequence and triggers a warning.
    csv_pattern = re.compile(r'^.*\.(csv|CSV)$')

    records = []
    for link in links_with_href:
        if not csv_pattern.match(link):
            continue

        segments = link.split('/')
        if len(segments) < 3:
            # Too few path segments to rebuild the download URL; indexing
            # [-2] / [-3] on such a link would raise IndexError.
            continue
        grandparent_dir, parent_dir, file_name = segments[-3:]

        attrs = file_name.split('--')
        if len(attrs) < 3:
            # File name does not carry an indicator part.
            continue

        file_href = f"{url_prefix}{grandparent_dir}/{parent_dir}/{file_name}"
        records.append((attrs[2], file_href))

    # Passing columns explicitly keeps both columns present even when no
    # link matched.
    return pd.DataFrame(records, columns=['indicator', 'file_href'])

In [124]:
# Build the indicator -> download-link table from the scraped hrefs
df = create_file_href_dataframe(
    links_with_href=links_with_href,
    url_prefix=url_prefix,
)

In [125]:
df.head()

Unnamed: 0,indicator,file_href
0,adults_with_hiv_percent_age_15_49,https://raw.githubusercontent.com/Gapminder/ga...
1,age_at_1st_marriage_women,https://raw.githubusercontent.com/Gapminder/ga...
2,aged_15_24_employment_rate_percent,https://raw.githubusercontent.com/Gapminder/ga...
3,aged_15_24_unemployment_rate_percent,https://raw.githubusercontent.com/Gapminder/ga...
4,aged_15_64_labour_force_participation_rate_per...,https://raw.githubusercontent.com/Gapminder/ga...


In [126]:
df.loc[:,['file_href']]

Unnamed: 0,file_href
0,https://raw.githubusercontent.com/Gapminder/ga...
1,https://raw.githubusercontent.com/Gapminder/ga...
2,https://raw.githubusercontent.com/Gapminder/ga...
3,https://raw.githubusercontent.com/Gapminder/ga...
4,https://raw.githubusercontent.com/Gapminder/ga...
...,...
539,https://raw.githubusercontent.com/Gapminder/ga...
540,https://raw.githubusercontent.com/Gapminder/ga...
541,https://raw.githubusercontent.com/Gapminder/ga...
542,https://raw.githubusercontent.com/Gapminder/ga...


In [127]:
# Index the table by indicator name. The guard makes the cell safe to
# re-run: once 'indicator' is the index it is no longer a column, and an
# unconditional set_index would raise KeyError.
if 'indicator' in df.columns:
    df = df.set_index('indicator')

In [128]:
df.head()

Unnamed: 0_level_0,file_href
indicator,Unnamed: 1_level_1
adults_with_hiv_percent_age_15_49,https://raw.githubusercontent.com/Gapminder/ga...
age_at_1st_marriage_women,https://raw.githubusercontent.com/Gapminder/ga...
aged_15_24_employment_rate_percent,https://raw.githubusercontent.com/Gapminder/ga...
aged_15_24_unemployment_rate_percent,https://raw.githubusercontent.com/Gapminder/ga...
aged_15_64_labour_force_participation_rate_percent,https://raw.githubusercontent.com/Gapminder/ga...


## Download Gapminder CSV files

In [19]:
# Requires the third-party `wget` package; to install it into the kernel's environment, run: %pip install wget

Collecting wget
  Downloading wget-3.2.zip (10 kB)
Building wheels for collected packages: wget
  Building wheel for wget (setup.py): started
  Building wheel for wget (setup.py): finished with status 'done'
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9686 sha256=d65687fb645e8c352e9fd0e0790227a90f5c24a79822abc6728ae18b7b0fb8e8
  Stored in directory: c:\users\mwalb\appdata\local\pip\cache\wheels\bd\a8\c3\3cf2c14a1837a4e04bd98631724e81f33f462d86a1d895fae0
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [131]:
import wget
import csv

In [138]:
def download_csv_files(url_input_file, output_dir='./data'):
    """
    Reads list of href links to a csv file from the url_input_file. Then download each csv file using wget

    The first row of url_input_file is treated as a header and skipped. The
    URL is read from the second column of every following row; rows without
    a second column (for example blank lines) are ignored.

    args:
        url_input_file: csv file containing a list of href urls to a specific csv file
        output_dir: directory the files are downloaded into (created if missing)
    """
    # Make sure the target directory exists before downloading.
    # NOTE(review): wget.download appears to treat a non-directory `out` as a
    # file name rather than a folder - confirm against the wget package.
    os.makedirs(output_dir, exist_ok=True)

    with open(url_input_file, 'r', encoding='utf8', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        # Skip the header row; the default avoids StopIteration on an empty file.
        next(csv_reader, None)

        for row in csv_reader:
            # Blank or truncated rows carry no URL to download.
            if len(row) < 2 or not row[1].strip():
                continue
            filename = wget.download(row[1], output_dir)
            print(f"Downloaded {filename}")

In [142]:
# NOTE(review): 'remaining_files.csv' is not written by any cell visible in
# this notebook - presumably an export of the link table with the URL in the
# second column; confirm where it comes from so Restart & Run All works.
download_csv_files('remaining_files.csv')

100% [............................................................................] 478425 / 478425Downloaded ./data/ddf--datapoints--life_expectancy_male--by--geo--time (1).csv
100% [............................................................................] 647220 / 647220Downloaded ./data/ddf--datapoints--life_expectancy_years--by--geo--time.csv
100% [................................................................................] 8555 / 8555Downloaded ./data/ddf--datapoints--literacy_rate_adult_female_percent_of_females_ages_15_above--by--geo--time.csv
100% [................................................................................] 8541 / 8541Downloaded ./data/ddf--datapoints--literacy_rate_adult_male_percent_of_males_ages_15_and_above--by--geo--time.csv
100% [................................................................................] 8602 / 8602Downloaded ./data/ddf--datapoints--literacy_rate_adult_total_percent_of_people_ages_15_and_above--by--geo--time.csv
100% [