In [1]:
from ETL import ETL
from modules.Transformation import Transformation
from modules.DataSource import DataSourceConfig
from modules.Encoder import EncoderConfig
from modules.Loader import LoaderConfig
from ETL import FlowConfig

# Encoder Config

# HTML tags passed (twice) to the HTML encoder config below.
# NOTE(review): the trailing "" entry looks unintentional — confirm before removing.
tags = ["h1", "h2", "h3", "h4", "h5", "h6", "p", "a", "img", ""]

# Datasource Config

url = "http://www.mendo.care"
url2 = "http://www.google.com"
url3 = "https://edition.cnn.com"
urls = [url, url2, url3]

# IDs of the elements that every flow shares.
ENCODER_ID = 'Basic HTML Tags'
CONTROL_ID = '1 Hour Update'
PROCESS_ID = "Transformation Basic for HTML"


def export_path(site_url):
    """Return the JSON export path for ``site_url``.

    The scheme is removed by splitting on "://" instead of slicing a fixed
    number of characters, so "http://" and "https://" URLs both map to
    ``export/<host...>.json``. A fixed ``[7:]`` slice leaves a leading "/"
    for "https://" URLs.
    """
    return f"export/{site_url.split('://', 1)[-1]}.json"


def datasource_id(site_url):
    """Return the datasource ID used for ``site_url``."""
    return f'Website Scraping {site_url}'


def ingestion_id(site_url):
    """Return the ingestion ID used for ``site_url``."""
    return f'Scraping Ingestion for {site_url}'


def loader_id(site_url):
    """Return the loader ID used for ``site_url``."""
    return f'Json File Loader on {export_path(site_url)}'


# Flow Config

flow_config = FlowConfig()

# One flow per site; all flows reference the same encoder, control and process IDs.
flows = [
    flow_config.create_flow(
        ID_datasource=datasource_id(site_url),
        ID_encoder=ENCODER_ID,
        ID_ingestion=ingestion_id(site_url),
        ID_control=CONTROL_ID,
        ID_loader=loader_id(site_url),
        ID_process=PROCESS_ID)
    for site_url in urls
]
# Keep the individual names available for any later cell that refers to them.
flow1, flow2, flow3 = flows

for flow in flows:
    flow_config.add_flow(flow)

# Pipeline Object Creation

pipeline = ETL("ETL Pipeline")

# Create Pipeline Elements

## Shared elements (registered once; every flow refers to them by the same ID)

pipeline.create_encoder(ENCODER_ID, EncoderConfig().html_encoder_config(tags, tags))
pipeline.create_process(PROCESS_ID, [Transformation().scraping_to_pivot_table])
# NOTE(review): the ID says "1 Hour" but the arguments are 10, 10 — confirm the units.
pipeline.create_control(CONTROL_ID, 10, 10)

## Per-site elements (datasource, ingestion, loader)

for site_url in urls:
    pipeline.create_datasource(datasource_id(site_url), DataSourceConfig().scraping_source_config(site_url))
    pipeline.create_ingestion(ingestion_id(site_url))
    pipeline.create_loader(loader_id(site_url), LoaderConfig().json_file_loader(export_path(site_url)))

# Set Pipeline flow

pipeline.set_flow(flow_config.get_flow_config())

# Pipeline Start

pipeline.start()

Linking Pipeline flows
Flow index: 1 saved and ready to be run.
Flow index: 2 saved and ready to be run.
Flow index: 3 saved and ready to be run.
Running Ingestion Scraping Ingestion for http://www.mendo.care...
Start extraction for Scraping Ingestion for http://www.mendo.care
Running Ingestion Scraping Ingestion for http://www.google.com...
Start extraction for Scraping Ingestion for http://www.google.com
Running Ingestion Scraping Ingestion for https://edition.cnn.com...
Start extraction for Scraping Ingestion for https://edition.cnn.com
Ingestion Scraping Ingestion for http://www.google.com successfully completed!
Running Transformation Transformation Basic for HTML for Ingestion Scraping Ingestion for http://www.google.com...
Running pivot_table transformation.
Trasformation Transformation Basic for HTML for ingestion Scraping Ingestion for http://www.google.com Done!
Loading data from Ingestion Scraping Ingestion for http://www.google.com...
DataFrame exported as JSON: export/www.

In [2]:
import pandas as pd
from modules.Utils import Utils

# Read the JSON export for www.google.com and display it as a DataFrame.
google_records = Utils().read_json('export/www.google.com.json')
pd.DataFrame(google_records)

Unnamed: 0,text,url,date_time,ingestion_ID,a,img,p
0,Anmelden,https://accounts.google.com/ServiceLogin?hl=de...,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,Anmelden,,
1,Bilder,https://www.google.com/imghp?hl=de&tab=wi,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,Bilder,,
2,Datenschutzerklärung,/intl/de/policies/privacy/,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,Datenschutzerklärung,,
3,Drive,https://drive.google.com/?tab=wo,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,Drive,,
4,Einstellungen,/preferences?hl=de,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,Einstellungen,,
5,English,http://www.google.com/setprefs?sig=0_YSBC3pon5...,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,English,,
6,Erweiterte Suche,/advanced_search?hl=de-CH&authuser=0,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,Erweiterte Suche,,
7,Français,http://www.google.com/setprefs?sig=0_YSBC3pon5...,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,Français,,
8,Gmail,https://mail.google.com/mail/?tab=wm,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,Gmail,,
9,Google,/images/branding/googlelogo/1x/googlelogo_whit...,2023-11-01 19:16:13.922966,Scraping Ingestion for http://www.google.com,,Google,


In [4]:
# Read the JSON export for edition.cnn.com and display it as a DataFrame.
cnn_records = Utils().read_json('export/edition.cnn.com.json')
pd.DataFrame(cnn_records)

Unnamed: 0,text,url,date_time,ingestion_ID,a,h2,h3,img,p
0,,#,2023-11-01 19:16:04.314989,Scraping Ingestion for https://edition.cnn.com,,,,,
1,,#,2023-11-01 19:16:04.316985,Scraping Ingestion for https://edition.cnn.com,,,,,
2,,https://edition.cnn.com,2023-11-01 19:16:04.314989,Scraping Ingestion for https://edition.cnn.com,,,,,
3,,https://edition.cnn.com,2023-11-01 19:16:04.316985,Scraping Ingestion for https://edition.cnn.com,,,,,
4,,https://facebook.com/CNN,2023-11-01 19:16:04.315984,Scraping Ingestion for https://edition.cnn.com,,,,,
...,...,...,...,...,...,...,...,...,...
437,‘He understood me’: Death of China’s former pr...,/2023/11/01/china/china-li-keqiang-death-xi-di...,2023-11-01 19:16:04.315984,Scraping Ingestion for https://edition.cnn.com,‘He understood me’: Death of China’s former pr...,,,,
438,‘I wish I was that clean.’ ‘Succession’ star o...,/2023/10/29/sport/fisher-stevens-david-beckham...,2023-11-01 19:16:04.315984,Scraping Ingestion for https://edition.cnn.com,‘I wish I was that clean.’ ‘Succession’ star o...,,,,
439,‘Opening a chest and just pouring money into i...,/2023/11/01/investing/munger-interview-buffett...,2023-11-01 19:16:04.315984,Scraping Ingestion for https://edition.cnn.com,‘Opening a chest and just pouring money into i...,,,,
440,‘She wasn’t too interested in talking’: Cop pu...,/videos/entertainment/2023/10/31/cop-pulls-ove...,2023-11-01 19:16:04.315984,Scraping Ingestion for https://edition.cnn.com,‘She wasn’t too interested in talking’: Cop pu...,,,,


In [5]:
# Read the JSON export for www.mendo.care and display it as a DataFrame.
mendo_records = Utils().read_json('export/www.mendo.care.json')
pd.DataFrame(mendo_records)

Unnamed: 0,text,url,date_time,ingestion_ID,a,h2,h3,h4,h5,h6,img,p
0,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:16:04.452987,Scraping Ingestion for http://www.mendo.care,,,,,,,,
1,,https://mendo.care/wp-content/uploads/2023/03/...,2023-11-01 19:16:04.452987,Scraping Ingestion for http://www.mendo.care,,,,,,,,
2,,,2023-11-01 19:16:04.446953,Scraping Ingestion for http://www.mendo.care,,,,,,,,
3,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:16:04.452987,Scraping Ingestion for http://www.mendo.care,,,,,,,,
4,,"data:image/svg+xml,%3Csvg%20xmlns='http://www....",2023-11-01 19:16:04.452987,Scraping Ingestion for http://www.mendo.care,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
237,fashion and accessories,https://www.mendo.care/wp-content/uploads/2023...,2023-11-01 19:16:04.452987,Scraping Ingestion for http://www.mendo.care,,,,,,,fashion and accessories,
238,info@mendo.care,mailto:info@mendo.care,2023-11-01 19:16:04.448953,Scraping Ingestion for http://www.mendo.care,info@mendo.care,,,,,,,
239,info@mendo.care,mailto:info@mendo.care,2023-11-01 19:16:04.449953,Scraping Ingestion for http://www.mendo.care,info@mendo.care,,,,,,,
240,terms and conditions,https://www.mendo.care/terms-and-conditions,2023-11-01 19:16:04.449953,Scraping Ingestion for http://www.mendo.care,terms and conditions,,,,,,,
