In [11]:
import os
import re
import time
import requests
from urllib.parse import urljoin, urlparse
from collections import deque
from bs4 import BeautifulSoup
from tqdm import tqdm

visited_urls = set()  # Set to keep track of visited URLs

def url_to_filename(url):
    filename = re.sub(r'^(http|https)://', '', url)
    filename = filename.replace('/', '_')
    filename = re.sub(r'[^a-zA-Z0-9\-_]', '_', filename)
    max_length = 255
    return filename[:max_length]

def scrape_text_bfs(start_url, base_url, max_depth=4):
    queue = deque([(start_url, 0)])  # Queue stores tuples of (url, current_depth)

    while queue:
        url, depth = queue.popleft()
        print(f"Scraping {url} at depth {depth}")
        
        # Check if the URL has already been visited or if it exceeds max depth
        if url in visited_urls or depth > max_depth:
            continue
        
        # Mark the URL as visited
        visited_urls.add(url)

        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Failed to retrieve {url}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract and save text
        text = soup.get_text(separator=' ', strip=True)
        with open(os.path.join('/app/BFS', url_to_filename(url) + '.txt'), 'w') as f:
            f.write(text)

        # If the current depth is less than max_depth, find and add links to the queue
        if depth < max_depth:
            for link in soup.find_all('a', href=True):
                href = link['href']
                next_url = urljoin(base_url, href)
                if urlparse(next_url).netloc == urlparse(base_url).netloc and next_url not in visited_urls:
                    queue.append((next_url, depth + 1))
                    time.sleep(0.5)  # Sleep for 500ms to avoid hammering the server



In [9]:
def url_to_filename(url):
    # Remove the scheme (http, https, etc.)
    filename = re.sub(r'^(http|https)://', '', url)
    # Replace slashes with underscores
    filename = filename.replace('/', '_')
    # Remove or replace invalid characters, including periods
    filename = re.sub(r'[^a-zA-Z0-9\-_]', '_', filename)
    # Limit filename length to avoid issues with very long filenames
    max_length = 255  # Typical maximum filename length in many filesystems
    return filename[:max_length]

# Example usage
url = "http://example.com/some/page?name=example&param=value"
filename = url_to_filename(url)
print(filename)

example_com_some_page_name_example_param_value


In [31]:
del visited_urls

In [13]:

URLs = [
    "https://health.uoregon.edu/",
    "https://hr.uoregon.edu/",
    "https://counseling.uoregon.edu/",
    "https://safe.uoregon.edu/",
    "https://engage.uoregon.edu/",
    "https://advising.uoregon.edu/",
    "https://financialaid.uoregon.edu/",
    "https://fyp.uoregon.edu/",
    "https://housing.uoregon.edu/",
    "https://environment.uoregon.edu/"

]

for start_url in tqdm(URLs):
    visited_urls = set()
    scrape_text_bfs(start_url, start_url, max_depth=2)

  0%|          | 0/9 [00:00<?, ?it/s]

Scraping https://hr.uoregon.edu/ at depth 0
Scraping https://hr.uoregon.edu/#main-content at depth 1
Scraping https://hr.uoregon.edu/new-employees at depth 1
Scraping https://hr.uoregon.edu/employees at depth 1
Scraping https://hr.uoregon.edu/supervisors-hr-partners at depth 1
Scraping https://hr.uoregon.edu/careers-uo at depth 1
Scraping https://hr.uoregon.edu/about-hr at depth 1
Scraping https://hr.uoregon.edu/university-human-resources-directory at depth 1
Scraping https://hr.uoregon.edu/search at depth 1
Scraping https://hr.uoregon.edu/search at depth 1
Scraping https://hr.uoregon.edu/benefits at depth 1
Scraping https://hr.uoregon.edu/learn-about-your-benefits at depth 1
Scraping https://hr.uoregon.edu/uo-group-insurance at depth 1
Scraping https://hr.uoregon.edu/medical-insurance at depth 1
Scraping https://hr.uoregon.edu/dental-insurance at depth 1
Scraping https://hr.uoregon.edu/vision-insurance at depth 1
Scraping https://hr.uoregon.edu/life-insurance at depth 1
Scraping https

  self.handle_startendtag(tag, attrs)


Scraping https://hr.uoregon.edu/sites/default/files/classified-performance-appraisal-it.docx at depth 2
Scraping https://hr.uoregon.edu/content/managing-employee-performance-performance-evaluation-and-other-forms-feedback at depth 2
Scraping https://hr.uoregon.edu/seiu-performance-review-instructions-it at depth 2
Scraping https://hr.uoregon.edu/seiu-performance-review-instructions at depth 2
Scraping https://hr.uoregon.edu/faculty-performance-review at depth 2
Scraping https://hr.uoregon.edu/oa-performance-management at depth 2
Scraping https://hr.uoregon.edu/veterans at depth 2
Scraping https://hr.uoregon.edu/supervisor-toolkit at depth 2
Scraping https://hr.uoregon.edu/international-employee-relations at depth 2
Scraping https://hr.uoregon.edu/life-events-0 at depth 2
Scraping https://hr.uoregon.edu/separation-employee-action-and-resources at depth 2
Scraping https://hr.uoregon.edu/holiday-guidelines at depth 2
Scraping https://hr.uoregon.edu/workplace-climate at depth 2
Scraping ht

 11%|█         | 1/9 [6:07:53<49:03:10, 22073.80s/it]

Scraping https://hr.uoregon.edu/content/hr-partners-meeting-highlights-resources at depth 2
Scraping https://hr.uoregon.edu/contact-information at depth 2
Scraping https://hr.uoregon.edu/jobs/available-positions?utm_source=banner-module&utm_campaign=banner at depth 2
Scraping https://hr.uoregon.edu/sites/default/files/NEO%20Icon.png at depth 2
Scraping https://hr.uoregon.edu/contact-information at depth 2
Scraping https://hr.uoregon.edu/jobs/available-positions?utm_source=banner-module&utm_campaign=banner at depth 2
Scraping https://hr.uoregon.edu/sites/default/files/talent_acquisition-devrs2592_uo_a395241_4col_480_0_2.jpg at depth 2
Scraping https://hr.uoregon.edu/content/officers-administration-compensation-information at depth 2
Scraping https://hr.uoregon.edu/content/career-and-fixed-term-faculty-pay-actions at depth 2
Scraping https://hr.uoregon.edu/content/seiu-compensation-information at depth 2
Scraping https://hr.uoregon.edu/content/teamsters-compensation-information at depth 

 22%|██▏       | 2/9 [6:21:18<18:35:36, 9562.36s/it] 

Scraping http://counseling.uoregon.edu at depth 2
Scraping http://counseling.uoregon.edu at depth 2
Scraping https://safe.uoregon.edu/ at depth 0
Scraping https://safe.uoregon.edu/#main-content at depth 1
Scraping https://safe.uoregon.edu/search at depth 1
Scraping https://safe.uoregon.edu/search at depth 1
Scraping https://safe.uoregon.edu/info at depth 1
Scraping https://safe.uoregon.edu/rights at depth 1
Scraping https://safe.uoregon.edu/laws-resources at depth 1
Scraping https://safe.uoregon.edu/know-your-options at depth 1
Scraping https://safe.uoregon.edu/help at depth 1
Scraping https://safe.uoregon.edu/university at depth 1
Scraping https://safe.uoregon.edu/police at depth 1
Scraping https://safe.uoregon.edu/community-support at depth 1
Scraping https://safe.uoregon.edu/services at depth 1
Scraping https://safe.uoregon.edu/definitions at depth 1
Scraping https://safe.uoregon.edu/sexual-misconduct at depth 1
Scraping https://safe.uoregon.edu/sexual-exploitation at depth 1
Scrapi

 33%|███▎      | 3/9 [6:24:17<8:47:47, 5277.98s/it] 

Scraping https://safe.uoregon.edu/sexual-misconduct at depth 2
Scraping https://safe.uoregon.edu/sexual-exploitation at depth 2
Scraping https://safe.uoregon.edu/dating-violence at depth 2
Scraping https://safe.uoregon.edu/stalking at depth 2
Scraping https://safe.uoregon.edu/frequently-asked-questions at depth 2
Scraping https://safe.uoregon.edu/confidentiality at depth 2
Scraping https://safe.uoregon.edu/about at depth 2
Scraping https://safe.uoregon.edu/about at depth 2
Scraping https://safe.uoregon.edu/sexual-misconduct at depth 2
Scraping https://safe.uoregon.edu/sexual-exploitation at depth 2
Scraping https://safe.uoregon.edu/dating-violence at depth 2
Scraping https://safe.uoregon.edu/stalking at depth 2
Scraping https://safe.uoregon.edu/frequently-asked-questions at depth 2
Scraping https://safe.uoregon.edu/confidentiality at depth 2
Scraping https://safe.uoregon.edu/sexual-misconduct at depth 2
Scraping https://safe.uoregon.edu/sexual-exploitation at depth 2
Scraping https://s

 44%|████▍     | 4/9 [6:26:03<4:29:40, 3236.17s/it]

Failed to retrieve https://engage.uoregon.edu/student_employment/summerbridge: 599 Server Error:  for url: https://engage.uoregon.edu/student_employment/summerbridge
Scraping https://advising.uoregon.edu/ at depth 0
Scraping https://advising.uoregon.edu/#main-content at depth 1
Scraping https://advising.uoregon.edu/search at depth 1
Scraping https://advising.uoregon.edu/search at depth 1
Scraping https://advising.uoregon.edu/connect-advisors at depth 1
Scraping https://advising.uoregon.edu/explore-majors at depth 1
Scraping https://advising.uoregon.edu/oaa#contact at depth 1
Scraping https://advising.uoregon.edu/exploring-roadmap at depth 1
Scraping https://advising.uoregon.edu/prep-advising at depth 1
Scraping https://advising.uoregon.edu/questions-exploring-students at depth 1
Scraping https://advising.uoregon.edu/vitanavis at depth 1
Scraping https://advising.uoregon.edu/navigate-student at depth 1
Scraping https://advising.uoregon.edu/declare-or-change-major-0 at depth 1
Scraping h

 56%|█████▌    | 5/9 [6:28:49<2:21:55, 2128.88s/it]

Scraping https://advising.uoregon.edu/#what-if at depth 2
Scraping https://advising.uoregon.edu/#resources at depth 2
Scraping https://financialaid.uoregon.edu/ at depth 0
Scraping https://financialaid.uoregon.edu/#main-content at depth 1
Scraping https://financialaid.uoregon.edu/search at depth 1
Scraping https://financialaid.uoregon.edu/search at depth 1
Scraping https://financialaid.uoregon.edu/cost_of_attendance at depth 1
Scraping https://financialaid.uoregon.edu/apply at depth 1
Scraping https://financialaid.uoregon.edu/scholarships at depth 1
Scraping https://financialaid.uoregon.edu/aid_types at depth 1
Scraping https://financialaid.uoregon.edu/checklist at depth 1
Scraping https://financialaid.uoregon.edu/faq at depth 1
Scraping https://financialaid.uoregon.edu/events at depth 1
Scraping https://financialaid.uoregon.edu/contact at depth 1
Scraping https://financialaid.uoregon.edu/cost_of_attendance at depth 1
Scraping https://financialaid.uoregon.edu/apply at depth 1
Scraping 

 67%|██████▋   | 6/9 [6:34:11<1:15:43, 1514.66s/it]

Scraping https://fyp.uoregon.edu/ at depth 0
Scraping https://fyp.uoregon.edu/#main-content at depth 1
Scraping https://fyp.uoregon.edu/search at depth 1
Scraping https://fyp.uoregon.edu/search at depth 1
Scraping https://fyp.uoregon.edu/first-year-interest-groups-2 at depth 1
Scraping https://fyp.uoregon.edu/week-welcome-meetings at depth 1
Scraping https://fyp.uoregon.edu/what-fig at depth 1
Scraping https://fyp.uoregon.edu/how-join-figfye at depth 1
Scraping https://fyp.uoregon.edu/fall-2024-figs at depth 1
Scraping https://fyp.uoregon.edu/fig-frequently-asked-questions at depth 1
Scraping https://fyp.uoregon.edu/first-year-and-transfer-seminars-2 at depth 1
Scraping https://fyp.uoregon.edu/fyetransfer-communities-2 at depth 1
Scraping https://fyp.uoregon.edu/common-read at depth 1
Scraping https://fyp.uoregon.edu/sail at depth 1
Scraping https://fyp.uoregon.edu/about-first-year-programs at depth 1
Scraping https://fyp.uoregon.edu/work-first-year-experience-programs at depth 1
Scrap

 78%|███████▊  | 7/9 [6:39:22<37:22, 1121.00s/it]  

Scraping https://fyp.uoregon.edu/faculty-binder-materials at depth 2
Scraping https://fyp.uoregon.edu/content/fa-alumni-blog at depth 2
Scraping https://fyp.uoregon.edu/staff-directory-0 at depth 2
Scraping http://fyp.uoregon.edu/ at depth 2
Scraping https://fyp.uoregon.edu/content/fa-alumni-blog at depth 2
Scraping https://fyp.uoregon.edu/staff-directory-0 at depth 2
Scraping https://fyp.uoregon.edu/content/fa-alumni-blog at depth 2
Scraping https://fyp.uoregon.edu/staff-directory-0 at depth 2
Scraping http://fyp.uoregon.edu/ at depth 2
Scraping https://fyp.uoregon.edu/staff-directory-0 at depth 2
Scraping https://fyp.uoregon.edu/staff-directory-0 at depth 2
Scraping http://fyp.uoregon.edu/ at depth 2
Scraping http://fyp.uoregon.edu/ at depth 2
Scraping https://housing.uoregon.edu/ at depth 0
Scraping https://housing.uoregon.edu/#main-content at depth 1
Scraping https://housing.uoregon.edu/search at depth 1
Scraping https://housing.uoregon.edu/search at depth 1
Scraping https://housin

 89%|████████▉ | 8/9 [7:14:53<24:02, 1442.63s/it]

Failed to retrieve https://housing.uoregon.edu/starrezportalx: 404 Client Error: Not Found for url: https://housing.uoregon.edu/starrezportalx
Scraping https://housing.uoregon.edu/starrezportalx at depth 2
Scraping https://housing.uoregon.edu at depth 2
Scraping https://environment.uoregon.edu/ at depth 0
Scraping https://environment.uoregon.edu/#main-content at depth 1
Scraping https://environment.uoregon.edu/search at depth 1
Scraping https://environment.uoregon.edu/search at depth 1
Scraping https://environment.uoregon.edu/research-and-awards at depth 1
Scraping https://environment.uoregon.edu/news-feed at depth 1
Failed to retrieve https://environment.uoregon.edu/news-feed: 500 Server Error: Internal Server Error for url: https://environment.uoregon.edu/news-feed
Scraping https://environment.uoregon.edu/events-feed at depth 1
Scraping https://environment.uoregon.edu/our-team at depth 1
Scraping https://environment.uoregon.edu/ei-faculty-advisory-committee at depth 1
Scraping https:

100%|██████████| 9/9 [7:25:00<00:00, 2966.70s/it]

Scraping https://environment.uoregon.edu/?page=53 at depth 2
Scraping https://environment.uoregon.edu/environmental-news at depth 2
Scraping https://environment.uoregon.edu/uos-cass-moseley-testifies-congress at depth 2





In [None]:

URLs = [
    "https://outdoorprogram.uoregon.edu/",
    "https://studentlife.uoregon.edu/",
    "https://rec.uoregon.edu/",
    "https://health.uoregon.edu/",
    "https://hr.uoregon.edu/",
    "https://counseling.uoregon.edu/",
    "https://safe.uoregon.edu/",
    "https://engage.uoregon.edu/",
    "https://advising.uoregon.edu/",
    "https://financialaid.uoregon.edu/",
    "https://fyp.uoregon.edu/",
    "https://housing.uoregon.edu/",
    "https://environment.uoregon.edu/"

]

"https://health.uoregon.edu/",