In [6]:
# Importing the required libraries
import pandas as pd
import requests
import sqlite3 as sql
import numpy as np
from datetime import datetime
from bs4  import BeautifulSoup as bs

In [7]:
# Code for ETL operations on Country-GDP data
def extract(url, table_attribs):
    ''' Download the page at `url` and scrape its country/GDP table into a
    dataframe, which is returned for further processing.

    Parameters:
        url: address of the page to download.
        table_attribs: column names for the result; the first receives the
            country name and the second the GDP text exactly as it appears
            in the page (conversion to numbers is left to `transform`).

    Returns:
        A dataframe with one row per scraped table row.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the server stops responding.
        IndexError: if the page has fewer than three tables, or a scraped
            row lacks a link in its first cell or a third cell.
    '''
    log_progress("Extraction started")

    response = requests.get(url, timeout=60)
    # Fail on 4xx/5xx here rather than trying to parse an error page.
    response.raise_for_status()
    soup = bs(response.text, "html.parser")
    log_progress("Extraction: data loaded by BeautifulSoup")

    # The data is read from the third <table> element of the page.
    rows = soup.find_all("table")[2].find_all("tr")

    records = []
    # The first three rows are not data rows and are skipped
    # (presumably header rows and a world total -- confirm against the page).
    for row in rows[3:]:
        cells = row.find_all("td")
        records.append({
            # str() turns the NavigableString into a plain string so the
            # dataframe does not keep the whole parse tree alive.
            table_attribs[0]: str(cells[0].find_all("a")[0].contents[0]),
            table_attribs[1]: str(cells[2].contents[0]),
        })

    # Build the frame once instead of concatenating row by row.
    df = pd.DataFrame(records, columns=table_attribs)
    log_progress("Extraction: dataframe returned")
    return df

def transform(df,column):
    ''' Convert the GDP text in `column` (thousands separated by commas,
    expressed in USD millions) to float USD billions rounded to 2 decimal
    places, and rename the column to "GDP_USD_billions".

    Parameters:
        df: dataframe holding the scraped data; it is modified in place.
        column: name of the column that holds the GDP text.

    Returns:
        The same dataframe object, after modification. Entries that cannot
        be parsed as a number (for example "—") become NaN.
    '''
    log_progress("Transformation started.")

    # Only the thousands separators need removing: anything that is still
    # not numeric afterwards is coerced to NaN by to_numeric.
    cleaned = df[column].str.replace(",", "", regex=False)
    millions = pd.to_numeric(cleaned, errors='coerce')
    # Write back to `column` itself, not to a hard-coded name, so the
    # function works for whatever column name the caller passes.
    df[column] = (millions / 1000).round(2)
    # In-place rename: callers may rely on `df` being updated without
    # using the return value.
    df.rename(columns={column: "GDP_USD_billions"}, inplace=True)
    log_progress("Transformation: returned dataframe")

    return df

def load_to_csv(df, csv_path):
    ''' Save the dataframe as a `CSV` file at the provided path (the index
    is written as the first column). Function returns nothing.

    Parameters:
        df: dataframe to save.
        csv_path: destination path of the CSV file.
    '''
    df.to_csv(csv_path)
    # Log only once the file has been written, so the log never reports a
    # file that a failed write did not produce.
    log_progress("Load phase: csv created")

def load_to_db(df, sql_connection, table_name):
    ''' Save the dataframe as a database table with the provided name,
    replacing the table if it already exists. Function returns nothing.

    Parameters:
        df: dataframe to save (its index is not written).
        sql_connection: either the path of the SQLite database file, or an
            already open sqlite3 connection.
        table_name: name of the table to (re)create.
    '''
    # A path means the connection is opened here and must be closed here;
    # a connection handed in by the caller is left open for the caller.
    owns_connection = not isinstance(sql_connection, sql.Connection)
    conn = sql.connect(sql_connection) if owns_connection else sql_connection
    try:
        df.to_sql(table_name, conn, if_exists='replace', index=False)
    finally:
        if owns_connection:
            conn.close()
    log_progress("Load phase: db created")

    
def run_query(query_statement, db_name):
    ''' Run the stated query against the SQLite database file `db_name`
    and print the result on the terminal. Function returns nothing.

    Parameters:
        query_statement: SQL text to execute.
        db_name: path of the SQLite database file.
    '''
    sql_connection = sql.connect(db_name)
    try:
        query_output = pd.read_sql(query_statement, sql_connection)
    finally:
        # Release the database handle even when the query fails.
        sql_connection.close()
    print(query_output)
    log_progress("SQL query executed.")


def log_progress(message, log_file="log_file.txt"):
    ''' Append `message`, prefixed with the current timestamp, as one line
    of the log file. Function returns nothing.

    Parameters:
        message: text to record for the current stage of the run.
        log_file: path of the log file; it is created if missing and
            always appended to.
    '''
    # Year-Monthname-Day-Hour:Minute:Second. `%b` is the standard directive
    # for the abbreviated month name; `%h` is a platform-specific synonym
    # that is not available everywhere.
    timestamp_format = '%Y-%b-%d-%H:%M:%S'
    timestamp = datetime.now().strftime(timestamp_format)
    with open(log_file, "a") as f:
        f.write(timestamp + ',' + message + '\n')

In [8]:
# Pipeline configuration

# Archived (web.archive.org) snapshot of the Wikipedia page that is scraped.
url = (
    "https://web.archive.org/web/20230902185326/"
    "https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"
)

# Column names of the extracted dataframe: country name, then GDP.
columns = ["Country", "GDP_USD_millions"]

# Load targets: SQLite database file, table inside it, and CSV file.
db_name = "./World_Economies.db"
table_name = "Countries_by_GDP"
csv_path = "./Countries_by_GDP.csv"

# Query run against the loaded table once the pipeline has finished.
query = f"SELECT * FROM '{table_name}'"

In [9]:
# Run the pipeline: extract -> transform -> load (CSV and database) -> query.
data = extract(url, columns)
# transform updates the dataframe in place and also returns it; rebinding
# the name makes the data flow explicit.
data = transform(data, columns[1])
# The CSV target is configured in `csv_path`, so write it as well as the
# database table.
load_to_csv(data, csv_path)
load_to_db(data, db_name, table_name)
run_query(query, db_name)

           Country  GDP_USD_billions
0    United States          26854.60
1            China          19373.59
2            Japan           4409.74
3          Germany           4308.85
4            India           3736.88
..             ...               ...
208       Anguilla               NaN
209       Kiribati              0.25
210          Nauru              0.15
211     Montserrat               NaN
212         Tuvalu              0.06

[213 rows x 2 columns]


In [10]:
# Alternative functions (disabled).
# The string literal below holds a second implementation of the ETL steps
# followed by a driver script. Because it is a bare string, none of it is
# executed; as the last expression of the cell it is only echoed back as the
# cell output.
# NOTE(review): it would not run as written in this notebook. It refers to
# `BeautifulSoup`, `sqlite3` and `table_attribs`, while the imports at the top
# bind those modules as `bs` and `sql`, and the column list is named `columns`.
"""
def extract(url, table_attribs):
    page = requests.get(url).text
    data = BeautifulSoup(page,'html.parser')
    df = pd.DataFrame(columns=table_attribs)
    tables = data.find_all('tbody')
    rows = tables[2].find_all('tr')
    for row in rows:
        col = row.find_all('td')
        if len(col)!=0:
            if col[0].find('a') is not None and '—' not in col[2]:
                data_dict = {"Country": col[0].a.contents[0],
                             "GDP_USD_millions": col[2].contents[0]}
                df1 = pd.DataFrame(data_dict, index=[0])
                df = pd.concat([df,df1], ignore_index=True)
    return df

def transform(df):
    GDP_list = df["GDP_USD_millions"].tolist()
    GDP_list = [float("".join(x.split(','))) for x in GDP_list]
    GDP_list = [np.round(x/1000,2) for x in GDP_list]
    df["GDP_USD_millions"] = GDP_list
    df=df.rename(columns = {"GDP_USD_millions":"GDP_USD_billions"})
    return df

def load_to_csv(df, csv_path):
    df.to_csv(csv_path)

def load_to_db(df, sql_connection, table_name):
    df.to_sql(table_name, sql_connection, if_exists='replace', index=False)

def run_query(query_statement, sql_connection):
    print(query_statement)
    query_output = pd.read_sql(query_statement, sql_connection)
    print(query_output)

def log_progress(message): 
    timestamp_format = '%Y-%h-%d-%H:%M:%S' # Year-Monthname-Day-Hour-Minute-Second 
    now = datetime.now() # get current timestamp 
    timestamp = now.strftime(timestamp_format) 
    with open("./etl_project_log.txt","a") as f: 
        f.write(timestamp + ' : ' + message + '\n')


#PROCESSO
###
###
        
log_progress('Preliminaries complete. Initiating ETL process')

df = extract(url, table_attribs)

log_progress('Data extraction complete. Initiating Transformation process')

df = transform(df)

log_progress('Data transformation complete. Initiating loading process')

load_to_csv(df, csv_path)

log_progress('Data saved to CSV file')

sql_connection = sqlite3.connect('World_Economies.db')

log_progress('SQL Connection initiated.')

load_to_db(df, sql_connection, table_name)

log_progress('Data loaded to Database as table. Running the query')

query_statement = f"SELECT * from {table_name} WHERE GDP_USD_billions >= 100"
run_query(query_statement, sql_connection)

log_progress('Process Complete.')

sql_connection.close()
"""

'\ndef extract(url, table_attribs):\n    page = requests.get(url).text\n    data = BeautifulSoup(page,\'html.parser\')\n    df = pd.DataFrame(columns=table_attribs)\n    tables = data.find_all(\'tbody\')\n    rows = tables[2].find_all(\'tr\')\n    for row in rows:\n        col = row.find_all(\'td\')\n        if len(col)!=0:\n            if col[0].find(\'a\') is not None and \'—\' not in col[2]:\n                data_dict = {"Country": col[0].a.contents[0],\n                             "GDP_USD_millions": col[2].contents[0]}\n                df1 = pd.DataFrame(data_dict, index=[0])\n                df = pd.concat([df,df1], ignore_index=True)\n    return df\n\ndef transform(df):\n    GDP_list = df["GDP_USD_millions"].tolist()\n    GDP_list = [float("".join(x.split(\',\'))) for x in GDP_list]\n    GDP_list = [np.round(x/1000,2) for x in GDP_list]\n    df["GDP_USD_millions"] = GDP_list\n    df=df.rename(columns = {"GDP_USD_millions":"GDP_USD_billions"})\n    return df\n\ndef load_to_csv(d