In [1]:
from ETL import ETL
from modules.Transformation import Transformation
from modules.DataSource import DataSourceConfig
from modules.Encoder import EncoderConfig
from modules.Loader import LoaderConfig
from modules.Utils import Utils
from ETL import FlowConfig

# Datasource config
# The source is a Wayback Machine snapshot (timestamp 20230902185326) of the
# Wikipedia page "List of countries by GDP (nominal)", so the scraped markup
# is pinned to that capture.

url = "https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"

# Flow config
# A flow names, by ID, the datasource / encoder / ingestion / control / loader
# elements that belong together.

flow_config = FlowConfig()

# These ID strings are repeated verbatim in the pipeline.create_* calls further
# down in this cell; keep both places in sync when renaming anything.
flow1 = flow_config.create_flow(
    ID_datasource=f'Website Scraping {url}',
    ID_encoder='Basic HTML Tags',
    ID_ingestion=f'Scraping Ingestion for {url}',
    ID_control='1 Hour Update',
    ID_loader='Json File Loader on export/list_of_countries_nominal_capital.json')

# Register the flow so that get_flow_config() (called later in this cell) includes it.
# NOTE(review): presumably add_flow is what makes flow1 part of get_flow_config() -- confirm in ETL.FlowConfig.
flow_config.add_flow(flow1)

# Define the custom scraping function

def scraping(soap, table_index=2):
    """Extract country / GDP pairs from one table body of a parsed HTML page.

    Parameters
    ----------
    soap : parsed document
        Object exposing ``find_all``.
        NOTE(review): presumably a BeautifulSoup tree handed over by the
        ingestion step -- confirm against the caller.
    table_index : int, optional
        Position of the target ``<tbody>`` among all ``<tbody>`` elements
        returned by ``find_all``. Defaults to 2, the value that used to be
        hard-coded.

    Returns
    -------
    list of dict
        One ``{"Country": ..., "GDP_USD_millions": ...}`` entry per accepted
        row, in document order. "Country" is the first child of the first
        link in cell 0; "GDP_USD_millions" is the first child of cell 2.

    Raises
    ------
    IndexError
        If the page has no ``<tbody>`` at ``table_index``.
    """
    min_cells = 3  # a usable row must reach index 2, where the GDP value is read
    tables = soap.find_all('tbody')
    data = []
    for row in tables[table_index].find_all('tr'):
        cells = row.find_all('td')
        # Rows with no <td> at all were already skipped. Rows with only one or
        # two cells used to raise IndexError on cells[2] when cell 0 held a
        # link; they are now skipped as well.
        if len(cells) < min_cells:
            continue
        country_link = cells[0].find('a')
        # The membership test on the cell is kept exactly as before.
        # NOTE(review): with BeautifulSoup this presumably matches only a
        # direct child equal to '—', not a dash embedded in longer text -- confirm.
        if country_link is None or '—' in cells[2]:
            continue
        data.append({"Country": country_link.contents[0],
                     "GDP_USD_millions": cells[2].contents[0]})
    return data

# Persist the scraping function to disk so it can be reused later.
# NOTE(review): the .pkl extension suggests pickle-based serialization -- confirm
# in modules.Utils; if so, only ever load such files from trusted sources.
Utils().save_function('scraping_functions/extract_data_test.pkl', scraping)

# Load the function back from the same path; the loaded object is the one
# passed to pipeline.create_ingestion further down in this cell.
custom_scraping = Utils().load_function('scraping_functions/extract_data_test.pkl')

# Pipeline object creation

pipeline = ETL("ETL Pipeline")

# Create pipeline elements
# Each ID passed below must match, character for character, the ID given to
# flow_config.create_flow earlier in this cell.

## Flow 1

pipeline.create_datasource(f'Website Scraping {url}', DataSourceConfig().scraping_source_config(url))
pipeline.create_encoder('Basic HTML Tags', EncoderConfig().html_encoder_config(['Country','GDP_USD_millions'], ['Country','GDP_USD_millions']))
pipeline.create_ingestion(f'Scraping Ingestion for {url}', custom_scraping)
# NOTE(review): this process ID is not named in the flow created earlier in
# this cell -- confirm how processes get attached to a flow.
pipeline.create_process("Transformation Basic for HTML", [Transformation().default_scraping_to_pivot_table])
# NOTE(review): 3600 presumably is the update interval in seconds implied by the
# '1 Hour Update' ID; the meaning of 5 is not visible here -- confirm in ETL.create_control.
pipeline.create_control('1 Hour Update',5,3600)
# The export path is a plain string literal: it has no placeholders, so the
# f-prefix it used to carry was dropped (the resulting string is unchanged).
pipeline.create_loader('Json File Loader on export/list_of_countries_nominal_capital.json', LoaderConfig().json_file_loader('export/list_of_countries_nominal_capital.json'))

# Set pipeline flow

pipeline.set_flow(flow_config.get_flow_config())

# Pipeline start

pipeline.start()

Linking Pipeline flows
Content has been successfully appended to the file.
Running Ingestion Scraping Ingestion for https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29...
Content has been successfully appended to the file.
Start extraction for Scraping Ingestion for https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29


In [1]:
import pandas as pd
from modules.Utils import Utils

# Read the JSON export back into a DataFrame for inspection. The path is the
# one the JSON file loader is configured with in the pipeline cell.
df = pd.DataFrame(Utils().read_json('export/list_of_countries_nominal_capital.json'))

# Bare last expression: the notebook renders the frame as a table.
df

Unnamed: 0,Country,GDP_USD_millions
0,United States,26854599
1,China,19373586
2,Japan,4409738
3,Germany,4308854
4,India,3736882
...,...,...
186,Marshall Islands,291
187,Palau,262
188,Kiribati,248
189,Nauru,151


In [3]:
# Display a second export as a DataFrame.
# NOTE(review): no pipeline visible in this notebook writes this file; it
# presumably comes from a separate run against www.mendo.care -- confirm.
pd.DataFrame(Utils().read_json('export/www.mendo.care.json'))

Unnamed: 0,text,url,date_time,ingestion_ID,a,h2,h3,h4,h5,h6,img,p
0,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:37:35.892864,Scraping Ingestion for http://www.mendo.care,,,,,,,,
1,,https://mendo.care/wp-content/uploads/2023/03/...,2023-11-01 19:37:35.892864,Scraping Ingestion for http://www.mendo.care,,,,,,,,
2,,,2023-11-01 19:37:35.883895,Scraping Ingestion for http://www.mendo.care,,,,,,,,
3,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:37:35.892864,Scraping Ingestion for http://www.mendo.care,,,,,,,,
4,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:37:35.891866,Scraping Ingestion for http://www.mendo.care,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
256,fashion and accessories,https://www.mendo.care/wp-content/uploads/2023...,2023-11-01 19:37:35.892864,Scraping Ingestion for http://www.mendo.care,,,,,,,fashion and accessories,
257,info@mendo.care,mailto:info@mendo.care,2023-11-01 19:37:35.886866,Scraping Ingestion for http://www.mendo.care,info@mendo.care,,,,,,,
258,info@mendo.care,mailto:info@mendo.care,2023-11-01 19:37:35.888866,Scraping Ingestion for http://www.mendo.care,info@mendo.care,,,,,,,
259,terms and conditions,https://www.mendo.care/terms-and-conditions,2023-11-01 19:37:35.888866,Scraping Ingestion for http://www.mendo.care,terms and conditions,,,,,,,
