# Data Collection

🎯 Collect all text from https://www.aclcf.org website

In [1]:
# Import libraries
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import re
from collections import deque
from tqdm import tqdm

## Helper Functions

In [2]:
def collect_all_urls(base_url):
    """Collect all URLS from a website and its subpages"""
    
    # Set to store all collected URLs
    collected_urls = set()
    
    # Queue to manage the URLs to be crawled
    queue = deque([base_url])
    
    # Set ti jeeo track of visited URLs
    visited_urls = set([base_url])
    
    while queue:
        # Get the next URL to crawl
        url = queue.popleft()
        
        # Send request to the URL
        response = requests.get(url)
        content = response.text
        
        # Parse the HTML content
        soup = BeautifulSoup(content, "html.parser")
    
        # Extract all the links (anchor tags) from the page
        anchor_tags = soup.find_all('a')
        
        # Collect URLs from the current page
        for anchor in anchor_tags:
            href = anchor.get('href')
            if href is not None and href:
                
                # Normalize the URL by resolving relative URLs
                absolute_url = urljoin(url, href)
                
                # Parse the normalized URL
                parsed_url = urlparse(absolute_url)
                
                # Check if it's a valid URL and has an HTTP or HTTPS scheme
                if parsed_url.scheme in ('http', 'https') and parsed_url.netloc and parsed_url.netloc.startswith('www.aclcf.org'):
                    
                    # Print URL and add it to the list
                    print(f"absolute_url: {absolute_url}")
                    collected_urls.add(absolute_url)
                    
                    # Enqueue the URL for further crawling if it hasn't been visited
                    if absolute_url not in visited_urls:
                        queue.append(absolute_url)
                        visited_urls.add(absolute_url)    
    
    return collected_urls

In [3]:
def scrape_page_content(url):
    """Collect data from a single URL
    """
    
    # Send a GET request to the URL and parse the HTML response
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    # Extract teh desired content from the page
    text = soup.body.text.strip()
    
    # Remove non-ASCII characters
    text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\xff]','', text)
    
    text = url + "\n" + " | " + "\n" + text
    
    return text

In [4]:
def scrape_all_content(collected_urls):
    """
    Collect content from all connected urls and store it to different file
    """
    
    content = []
    for i, url in tqdm(enumerate(collected_urls)):
        
        # Scrape page content (text)
        text = scrape_page_content(url)
        
        # Get the filename from the URL
        file_path = os.path.join("../data/", str(i) + ".txt")
        
        # Write the scraped content to a file
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text)
            
    return None

# RUN

In [5]:
# Get all URLs
collected_urls = collect_all_urls(base_url="https://www.aclcf.org")

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/
absolute_url: https://www.aclcf.org/en/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/typhoon-project/
absolute_url: https://www.aclcf.org/typhoon-project/
absolute_url: https://www.aclcf.org/typhoon-project/
absolute_url: https://www.aclcf.org/typhoon-project/
absolute_url: https://www.aclcf.org/thalassa/seachange-greek-islands/
absolute_url: https://www.aclcf.org/thalassa/seachange-greek-islands/
absolute_url: https://www.aclcf.org

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/category/ekdilosis/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/ekdilosis/plastika-mias-chrisis-simera-kanonas-avrio-exeresi/
absolute_url: https://www.aclcf.org/ekdilosis/plastika-sti-georgia/
absolute_url: https://www.aclcf.org/ekdilosis/katharismi-akton-ke-meta-schediazontas-ta-epomena-vimata/
absolute_url: https://www.aclcf.org/ekdilosis/10th-vmw-global-table-from-alienatio

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/the-team/evi-lazou/
absolute_url: https://www.aclcf.org/the-team/evi-lazou/
absolute_url: https://www.aclcf.org/the-team/evi-lazou/
absolute_url: https://www.aclcf.org/the-team/evi-lazou/
absolute_url: https://www.aclcf.org/the-team/chloi-maria-laskaridi/
absolute_url: https://www.aclcf.org/the-team/chloi-maria-laskaridi/
absolute_url: htt

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/thalassa/seachange-greek-islands/
absolute_url: https://www.aclcf.org/sea/seachange-greek-islands-2/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolut

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/the-team/pegki-xirotagarou/
absolute_url: https://www.aclcf.org/the-team/peggy-xirotagarou/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/energo-os-politis-e

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/nea/energo-os-politis-ena-ekpedevtiko-programma-gia-ti-diamorfosi-ton-mathiton-trion-os-energon-politon/
absolute_url: https://www.aclcf.org/news/acting-as-a-citizen-an-educational-program-to-form-students-as-active-citizens/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ella

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/thalassa/programma-paraktion-ke-ipovrichion-katharismon/
absolute_url: https://www.aclcf.org/sea/coastal-and-underwater-cleanup-project/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalas

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/ekpedefsi/ekpedevtika-programmata/
absolute_url: https://www.aclcf.org/education/educational-programs/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absol

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/category/ekdilosis/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/events-en/single-use-plastics-today-a-rule-tomorrow-an-exemption/
absolute_url: https://www.aclcf.org/events-en/plastics-in-agriculture/
absolute_url: https://www.aclcf.org/events-en/coastal-cleanups-whats-next-planning-the-way-forward/
absolute_url: https://www.aclcf.o

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/ekdilosis/athens-refugee-workshop/
absolute_url: https://www.aclcf.org/events-en/athens-refugee-workshop-2/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/


absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/sinergasies/panepistimio-patras/
absolute_url: https://www.aclcf.org/collaborations/university-of-patras-laboratory-of-marine-geology-and-physical-oceanography-of-the-department-of-geology/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/alla-programmata/diafania/
absolute_url: https://www.aclcf.org/other-programs/transparency/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: ht

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/typhoon-project-2/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-gr

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/the-team/pegki-xirotagarou/
absolute_url: https://www.aclcf.org/the-team/peggy-xirotagarou/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece/
absolute_url: htt

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/nea/energo-os-politis-ena-ekpedevtiko-programma-gia-ti-diamorfosi-ton-mathiton-trion-os-energon-politon/
absolute_url: https://www.aclcf.org/news/acting-as-a-citizen-an-educational-program-to-form-students-as-active-citizens/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-col

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/%ce%b1%cf%84%ce%b1%ce%be%ce%b9%ce%bd%cf%8c%ce%bc%ce%b7%cf%84%ce%b1/adidas-end-plastic-waste/
absolute_url: https://www.aclcf.org/choris-katigoria-en/adidas-end-plastic-waste-2/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parou

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/thalassa/seachange-greek-islands/
absolute_url: https://www.aclcf.org/sea/seachange-greek-islands-2/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publi

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/nea/1032/
absolute_url: https://www.aclcf.org/news/eco-agents-a-new-series-of-environmental-podcasts/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publ

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/ekdilosis/katharismi-akton-ke-meta-schediazontas-ta-epomena-vimata/
absolute_url: https://www.aclcf.org/events-en/coastal-cleanups-whats-next-planning-the-way-forward/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece/
absolu

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/sinergasies/ipourgio-perivallontos-ke-energias-ellada-choris-plastika-mias-chrisis/
absolute_url: https://www.aclcf.org/collaborations/ministry-of-environment-and-energy-single-use-plastic-free-greece/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pand

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/alla-programmata/diafania/
absolute_url: https://www.aclcf.org/other-programs/transparency/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pa

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/nea/to-serifos-sunset-race-xana-presvevtis-tou-seachange-greek-islands/
absolute_url: https://www.aclcf.org/news/serifos-sunset-race-ambassador-of-the-seachange-greek-islands/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/nea/to-serifos-sunset-race-xana-presvevtis-tou-seachange-greek-islands/
absolute_url: https://www.aclcf.org/news/serifos-sunset-race-ambassador-of-the-seachange-greek-islands/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parous

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/%ce%b1%cf%84%ce%b1%ce%be%ce%b9%ce%bd%cf%8c%ce%bc%ce%b7%cf%84%ce%b1/mnimonio-sinergasias-kinofelous-idrimatos-athanasios-k-laskaridis-ke-ipourgiou-ethnikis-aminas/
absolute_url: https://www.aclcf.org/choris-katigoria-en/memorandum-of-cooperation-between-the-athanasios-c-laskaridis-charitable-foundation-and-the-ministry-of-national-defence/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/nea/trust-the-change-ena-neo-programma-ipostirixis-gia-perivallontika-ipefthines-epichirisis/
absolute_url: https://www.aclcf.org/news/trust-the-change-a-new-sustaining-program-for-environmentally-responsible-businesses/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collecti

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/nea/o-tifonas-taxidevi-stis-tesseris-gonies-tis-elladas/
absolute_url: https://www.aclcf.org/news/typhoon-sails-to-the-four-corners-of-greece/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-

absolute_url: https://www.aclcf.org/en
absolute_url: https://www.aclcf.org/about-en/the-foundation/
absolute_url: https://www.aclcf.org/category/sea/
absolute_url: https://www.aclcf.org/entrepreneurship/
absolute_url: https://www.aclcf.org/category/education/
absolute_url: https://www.aclcf.org/category/events-en/
absolute_url: https://www.aclcf.org/category/collaborations/
absolute_url: https://www.aclcf.org/category/other-programs/
absolute_url: https://www.aclcf.org/nea/diadiktiaki-imerida-to-perivallon-stin-epochi-tis-pandimias/
absolute_url: https://www.aclcf.org/news/online-workshop-the-environment-in-the-pandemic-era/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/supporters/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/team-en/
absolute_url: https://www.aclcf.org/category/news/
absolute_url: https://www.aclcf.org/news/presentation-of-collective-publication-pandemic-sea-and-man-in-greece/
absolute_url: h

absolute_url: https://www.aclcf.org
absolute_url: https://www.aclcf.org/about/foundation/
absolute_url: https://www.aclcf.org/category/thalassa/
absolute_url: https://www.aclcf.org/epichirimatikotita/
absolute_url: https://www.aclcf.org/ekpedefsi/
absolute_url: https://www.aclcf.org/ekdilosis/
absolute_url: https://www.aclcf.org/sinergasies/
absolute_url: https://www.aclcf.org/alla-programmata/
absolute_url: https://www.aclcf.org/nea/diadiktiaki-imerida-to-perivallon-stin-epochi-tis-pandimias/
absolute_url: https://www.aclcf.org/news/online-workshop-the-environment-in-the-pandemic-era/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/partners/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/team/
absolute_url: https://www.aclcf.org/category/nea/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-ekdosis-pandimia-thalassa-ke-anthropos-stin-ellada/
absolute_url: https://www.aclcf.org/nea/parousiasi-sillogikis-e

In [6]:
print(f"There are {len(collected_urls)} collected URLs")
collected_urls

There are 155 collected URLs


{'https://www.aclcf.org',
 'https://www.aclcf.org/',
 'https://www.aclcf.org/%ce%b1%cf%84%ce%b1%ce%be%ce%b9%ce%bd%cf%8c%ce%bc%ce%b7%cf%84%ce%b1/adidas-end-plastic-waste/',
 'https://www.aclcf.org/%ce%b1%cf%84%ce%b1%ce%be%ce%b9%ce%bd%cf%8c%ce%bc%ce%b7%cf%84%ce%b1/mnimonio-sinergasias-kinofelous-idrimatos-athanasios-k-laskaridis-ke-ipourgiou-ethnikis-aminas/',
 'https://www.aclcf.org/%ce%b1%cf%84%ce%b1%ce%be%ce%b9%ce%bd%cf%8c%ce%bc%ce%b7%cf%84%ce%b1/o-devteros-marathonios-kenotomias-anaptixis-liseon-ke-efarmogon-gia-mia-viosimi-kathimerinotita/',
 'https://www.aclcf.org/%ce%b1%cf%84%ce%b1%ce%be%ce%b9%ce%bd%cf%8c%ce%bc%ce%b7%cf%84%ce%b1/o-tifonas-sti-limno/',
 'https://www.aclcf.org/about-en/the-foundation/',
 'https://www.aclcf.org/about/foundation/',
 'https://www.aclcf.org/alla-programmata/',
 'https://www.aclcf.org/alla-programmata/1821/',
 'https://www.aclcf.org/alla-programmata/diafania/',
 'https://www.aclcf.org/alla-programmata/politistiki-klironomia/',
 'https://www.aclcf.org/all

In [7]:
# Collect and Store data from all pages
scrape_all_content(collected_urls)

155it [00:35,  4.39it/s]


# DEV

In [None]:
response = requests.get("https://www.aclcf.org/sinergasies/ipourgio-politismou-ke-athlitismou/")
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
soup

In [None]:
soup.body.text.strip()