In [1]:
from ETL import ETL
from modules.Transformation import Transformation
from modules.DataSource import DataSourceConfig
from modules.Encoder import EncoderConfig
from modules.Loader import LoaderConfig
from ETL import FlowConfig

# Encoder Config

tags = ["tr", "td"]

# Datasource Config

url = "https://web.archive.org/web/20230902185326/https://en.wikipedia.org/wiki/List_of_countries_by_GDP_%28nominal%29"
url2 = "http://www.google.com"
url3 = "https://edition.cnn.com"

# Shared element IDs: referenced by every flow and registered once on the pipeline.

ENCODER_ID = 'Basic HTML Tags'
CONTROL_ID = '1 Hour Update'
PROCESS_ID = "Transformation Basic for HTML"
# Presumably seconds (3600 s = 1 hour, matching CONTROL_ID); the meaning of the
# two numeric create_control arguments is not visible here -- TODO confirm.
ONE_HOUR = 3600


def element_ids(address):
    """Build the per-source element IDs and default export path for `address`.

    The same ID strings must be passed both to `FlowConfig.create_flow` and to
    the matching `pipeline.create_*` call, so they are derived once here.

    The export name is the address without its `scheme://` prefix. Splitting on
    "://" handles both `http://` and `https://`; a fixed `[7:]` slice only fits
    `http://` and leaves a leading "/" for `https://` addresses.
    """
    export_name = address.split("://", 1)[-1]
    export_path = f'export/{export_name}.json'
    return {
        'datasource': f'Website Scraping {address}',
        'ingestion': f'Scraping Ingestion for {address}',
        'loader': f'Json File Loader on {export_path}',
        'export_path': export_path,
    }


ids1 = element_ids(url)
ids2 = element_ids(url2)
ids3 = element_ids(url3)

# Flow Config

flow_config = FlowConfig()

# NOTE(review): flow1 carries no ID_process although the transformation is
# registered alongside the flow 1 elements below -- confirm this is intended.
flow1 = flow_config.create_flow(
    ID_datasource=ids1['datasource'],
    ID_encoder=ENCODER_ID,
    ID_ingestion=ids1['ingestion'],
    ID_control=CONTROL_ID,
    ID_loader=ids1['loader'])

flow2 = flow_config.create_flow(
    ID_datasource=ids2['datasource'],
    ID_encoder=ENCODER_ID,
    ID_ingestion=ids2['ingestion'],
    ID_control=CONTROL_ID,
    ID_loader=ids2['loader'],
    ID_process=PROCESS_ID)

flow3 = flow_config.create_flow(
    ID_datasource=ids3['datasource'],
    ID_encoder=ENCODER_ID,
    ID_ingestion=ids3['ingestion'],
    ID_control=CONTROL_ID,
    ID_loader=ids3['loader'])

flow_config.add_flow(flow1)
flow_config.add_flow(flow2)
flow_config.add_flow(flow3) # No transformation

# Pipeline Object Creation

pipeline = ETL("ETL Pipeline")

# Create Pipeline Elements

## Flow 1 (also registers the encoder, process and control shared by all flows)

pipeline.create_datasource(ids1['datasource'], DataSourceConfig().scraping_source_config(url))
pipeline.create_encoder(ENCODER_ID, EncoderConfig().html_encoder_config(tags, tags))
pipeline.create_ingestion(ids1['ingestion'])
# NOTE(review): the saved output of this cell shows an AttributeError for
# `scraping_to_pivot_table`; confirm the method name used here exists.
pipeline.create_process(PROCESS_ID, [Transformation().default_scraping_to_pivot_table])
pipeline.create_control(CONTROL_ID, ONE_HOUR, ONE_HOUR)
# The Wikipedia flow writes to a short fixed file name instead of the URL-derived path.
pipeline.create_loader(ids1['loader'], LoaderConfig().json_file_loader('export/wikipedia.json'))

## Flow 2

pipeline.create_datasource(ids2['datasource'], DataSourceConfig().scraping_source_config(url2))
pipeline.create_ingestion(ids2['ingestion'])
pipeline.create_loader(ids2['loader'], LoaderConfig().json_file_loader(ids2['export_path']))

## Flow 3

pipeline.create_datasource(ids3['datasource'], DataSourceConfig().scraping_source_config(url3))
pipeline.create_ingestion(ids3['ingestion'])
pipeline.create_loader(ids3['loader'], LoaderConfig().json_file_loader(ids3['export_path']))

# Set Pipeline flow

pipeline.set_flow(flow_config.get_flow_config())

# Pipeline Start

pipeline.start()

AttributeError: 'Transformation' object has no attribute 'scraping_to_pivot_table'

In [5]:
import pandas as pd
from modules.Utils import Utils

# Load the JSON export written by the Wikipedia flow and preview up to the
# first 500 rows (pandas truncates the rendered table on its own).
wikipedia_export = Utils().read_json('export/wikipedia.json')
df = pd.DataFrame(wikipedia_export)

df.head(500)

Unnamed: 0,text,url,date_time,ingestion_ID,td,tr
0,,,2023-11-01 23:27:43.753182,Scraping Ingestion for https://web.archive.org...,,
1,,,2023-11-01 23:27:43.763181,Scraping Ingestion for https://web.archive.org...,,
2,$50–100 billion$25–50 billion$5–25 billion< $5...,,2023-11-01 23:27:43.763181,Scraping Ingestion for https://web.archive.org...,$50–100 billion$25–50 billion$5–25 billion< $5...,
3,$750 billion – $1 trillion$500–750 billion$250...,,2023-11-01 23:27:43.763181,Scraping Ingestion for https://web.archive.org...,$750 billion – $1 trillion$500–750 billion$250...,
4,"(Nominal, Atlas method) per capita(PPP) per ca...",,2023-11-01 23:27:43.772182,Scraping Ingestion for https://web.archive.org...,"(Nominal, Atlas method) per capita(PPP) per ca...",
...,...,...,...,...,...,...
495,53954,,2023-11-01 23:27:43.767150,Scraping Ingestion for https://web.archive.org...,53954,
496,539223,,2023-11-01 23:27:43.764173,Scraping Ingestion for https://web.archive.org...,539223,
497,54622,,2023-11-01 23:27:43.768150,Scraping Ingestion for https://web.archive.org...,54622,
498,541,,2023-11-01 23:27:43.772182,Scraping Ingestion for https://web.archive.org...,541,


In [2]:
# Preview the export of the CNN flow (flow 3, configured without a transformation step).
cnn_export_path = 'export/edition.cnn.com.json'
pd.DataFrame(Utils().read_json(cnn_export_path))

Unnamed: 0,original_tag,mapping_tag,text,url,date_time,ingestion_ID
0,h2,h2,Hundreds of Gazans and foreign nationals leave...,,2023-11-01 19:37:35.705864,Scraping Ingestion for https://edition.cnn.com
1,h2,h2,More top stories,,2023-11-01 19:37:35.705864,Scraping Ingestion for https://edition.cnn.com
2,h2,h2,Featured,,2023-11-01 19:37:35.705864,Scraping Ingestion for https://edition.cnn.com
3,h2,h2,Featured Sections,,2023-11-01 19:37:35.706878,Scraping Ingestion for https://edition.cnn.com
4,h2,h2,CNN Business,,2023-11-01 19:37:35.706878,Scraping Ingestion for https://edition.cnn.com
...,...,...,...,...,...,...
475,img,img,Gyasi directs actor Angela Basset for the 2024...,https://media.cnn.com/api/v1/images/stellar/pr...,2023-11-01 19:37:35.721895,Scraping Ingestion for https://edition.cnn.com
476,img,img,Simit is the most popular street food in Istan...,https://media.cnn.com/api/v1/images/stellar/pr...,2023-11-01 19:37:35.721895,Scraping Ingestion for https://edition.cnn.com
477,img,img,hornussen story gallery,https://media.cnn.com/api/v1/images/stellar/pr...,2023-11-01 19:37:35.721895,Scraping Ingestion for https://edition.cnn.com
478,img,img,SAOTA beyond 1,https://media.cnn.com/api/v1/images/stellar/pr...,2023-11-01 19:37:35.721895,Scraping Ingestion for https://edition.cnn.com


In [3]:
# Preview the mendo.care export.
# NOTE(review): no flow in the pipeline cell writes this file (url2 points at
# google.com), so this relies on a file left over from an earlier run -- confirm.
mendo_export_path = 'export/www.mendo.care.json'
pd.DataFrame(Utils().read_json(mendo_export_path))

Unnamed: 0,text,url,date_time,ingestion_ID,a,h2,h3,h4,h5,h6,img,p
0,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:37:35.892864,Scraping Ingestion for http://www.mendo.care,,,,,,,,
1,,https://mendo.care/wp-content/uploads/2023/03/...,2023-11-01 19:37:35.892864,Scraping Ingestion for http://www.mendo.care,,,,,,,,
2,,,2023-11-01 19:37:35.883895,Scraping Ingestion for http://www.mendo.care,,,,,,,,
3,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:37:35.892864,Scraping Ingestion for http://www.mendo.care,,,,,,,,
4,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:37:35.891866,Scraping Ingestion for http://www.mendo.care,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
256,fashion and accessories,https://www.mendo.care/wp-content/uploads/2023...,2023-11-01 19:37:35.892864,Scraping Ingestion for http://www.mendo.care,,,,,,,fashion and accessories,
257,info@mendo.care,mailto:info@mendo.care,2023-11-01 19:37:35.886866,Scraping Ingestion for http://www.mendo.care,info@mendo.care,,,,,,,
258,info@mendo.care,mailto:info@mendo.care,2023-11-01 19:37:35.888866,Scraping Ingestion for http://www.mendo.care,info@mendo.care,,,,,,,
259,terms and conditions,https://www.mendo.care/terms-and-conditions,2023-11-01 19:37:35.888866,Scraping Ingestion for http://www.mendo.care,terms and conditions,,,,,,,
