In [1]:
!pip install langchain
!pip install langchain_huggingface

Collecting langchain
  Downloading langchain-0.3.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.10 (from langchain)
  Downloading langchain_core-0.3.10-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.134-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.4.0,>=0.3.10->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading orjson-3.10.7-cp310-cp310

In [2]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import torch
from transformers import pipeline
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading

model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    max_new_tokens=128,
)

visited_urls = set()

visited_lock = threading.Lock()
file_lock = threading.Lock()
counter_lock = threading.Lock()

processed_links = 0
MAX_LINKS = 100

def process_text_with_llm(text, url):
    """
    Process the scraped text using the language model.
    """
    messages = [
        {"role": "system", "content": "You are a language model that cleans up and summarizes text without losing important information."},
        {"role": "user", "content": text},
    ]

    try:
        output = pipe(messages[1]["content"], max_new_tokens=256)
        cleaned_text = output[0]["generated_text"]

        with file_lock:
            with open('processed_text.txt', 'a', encoding='utf-8') as f:
                f.write(f"{cleaned_text.strip()}\n")
                f.write(f"{'-'*80}\n\n")

    except torch.cuda.OutOfMemoryError:
        torch.cuda.empty_cache()
        print(f"Out of memory when processing URL: {url}")
    except Exception as e:
        print(f"Error during LLM processing: {e}")

def scrape_text(url, max_depth=3, depth=0):
    """
    Recursively scrape text from the given URL and its child links.
    """
    global processed_links

    with counter_lock:
        if processed_links >= MAX_LINKS:
            return

    with visited_lock:
        if url in visited_urls or depth > max_depth:
            return
        visited_urls.add(url)

    print(f"Processing link {processed_links+1}/{MAX_LINKS}: {url}")

    try:
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; WebScraper/1.0)'}
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        encoding = response.encoding or 'utf-8'
        page_content = response.content.decode(encoding, errors='replace')

        soup = BeautifulSoup(page_content, 'html.parser')

    except requests.RequestException as e:
        print(f"Failed to retrieve {url}: {e}")
        return
    except UnicodeDecodeError as e:
        print(f"Encoding error at {url}: {e}")
        return

    page_text = soup.get_text(separator=' ', strip=True)
    page_text = page_text.encode('ascii', 'ignore').decode('utf-8')

    with counter_lock:
        processed_links += 1

    process_text_with_llm(page_text, url)
    with counter_lock:
        if processed_links >= MAX_LINKS:
            return

    links_to_scrape = []
    for link_tag in soup.find_all('a', href=True):
        href = link_tag['href']
        child_url = urljoin(url, href)

        if urlparse(child_url).netloc != urlparse(url).netloc:
            continue

        with counter_lock:
            if processed_links >= MAX_LINKS:
                break

        links_to_scrape.append(child_url)

    return links_to_scrape

def main(starting_url):
    with ThreadPoolExecutor(max_workers=3) as executor:
        futures = {executor.submit(scrape_text, starting_url): starting_url}

        while futures and processed_links < MAX_LINKS:
            for future in as_completed(futures):
                try:
                    new_links = future.result()
                    if new_links:
                        for link in new_links:
                            if processed_links >= MAX_LINKS:
                                break
                            futures[executor.submit(scrape_text, link)] = link

                except Exception as exc:
                    print(f"Error during scraping: {exc}")

                del futures[future]

if __name__ == "__main__":
    starting_url = input("Enter the URL to scrape: ")
    with open('processed_text.txt', 'w', encoding='utf-8') as f:
        f.write('')
    main(starting_url)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Enter the URL to scrape: https://www.cmu.edu/about/index.html
Processing link 1/100: https://www.cmu.edu/about/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 2/100: https://www.cmu.edu/
Processing link 2/100: https://www.cmu.edu/index.html
Processing link 2/100: https://www.cmu.edu/visit/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 5/100: https://www.cmu.edu/student-admission/index.html
Processing link 5/100: http://www.cmu.edu/leadership/
Processing link 5/100: http://www.cmu.edu/strategic-plan/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 8/100: https://www.cmu.edu/diversity/
Processing link 8/100: https://www.cmu.edu/about/cmu_fact_sheet_02.pdf
Processing link 8/100: http://www.cmu.edu/global/presence/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Token indices sequence length is longer than the specified maximum sequence length for this model (1422450 > 131072). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Out of memory when processing URL: https://www.cmu.edu/about/cmu_fact_sheet_02.pdf
Processing link 11/100: https://www.cmu.edu/about/awards.html


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Out of memory when processing URL: https://www.cmu.edu/about/awards.html
Processing link 12/100: https://www.cmu.edu/about/rankings.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 13/100: http://www.cmu.edu/brag
Processing link 13/100: https://www.cmu.edu/dietrich/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 15/100: http://www.cmu.edu/mcs


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 16/100: https://www.cmu.edu/events/index.html
Processing link 16/100: https://www.cmu.edu/jobs/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 18/100: https://www.cmu.edu/coronavirus/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 19/100: https://www.cmu.edu/directory-contact/index.html
Processing link 19/100: https://www.cmu.edu/feedback/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 21/100: https://www.cmu.edu/global/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 22/100: https://www.cmu.edu/health-safety/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 23/100: https://www.cmu.edu/news/
Processing link 23/100: https://www.cmu.edu/sitemap/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 25/100: https://www.cmu.edu/title-ix/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 26/100: https://www.cmu.edu/engage/alumni/
Processing link 26/100: https://www.cmu.edu/business-engagement/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 28/100: https://www.cmu.edu/faculty-staff/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 29/100: https://www.cmu.edu/current-students/index.html
Processing link 29/100: https://www.cmu.edu/legal/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 31/100: https://www.cmu.edu/social-media/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 32/100: https://www.cmu.edu/academics/index.html
Processing link 32/100: https://www.cmu.edu/academics/interdisciplinary-programs.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 34/100: https://www.cmu.edu/academics/learning-for-a-lifetime.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 35/100: https://www.cmu.edu/admission
Processing link 35/100: http://www.cmu.edu/graduate/admissions/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 37/100: https://www.cmu.edu/about/mission.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 38/100: https://www.cmu.edu/about/history.html
Processing link 38/100: https://www.cmu.edu/about/traditions.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 40/100: http://www.cmu.edu/diversity


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 41/100: https://www.cmu.edu/about/pittsburgh.html
Processing link 41/100: https://www.cmu.edu/visit/welcome-center.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Failed to retrieve https://www.cmu.edu/visit/welcome-center.html: 404 Client Error: Not Found for url: https://www.cmu.edu/visit/welcome-center.html
Processing link 41/100: https://www.cmu.edu/visit/maps-parking-transportation.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 43/100: https://www.cmu.edu/research/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 44/100: https://www.cmu.edu/research/centers-and-institutes.html
Processing link 44/100: https://www.cmu.edu/student-experience/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 46/100: https://www.cmu.edu/engage/give/opportunities/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 47/100: https://www.cmu.edu/#menu
Processing link 47/100: https://www.cmu.edu/news/stories/archives/2024/october/former-cmu-faculty-geoffrey-hinton-awarded-2024-nobel-prize-in-physics


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 49/100: https://www.cmu.edu/news/stories/archives/2024/april/three-cmu-students-awarded-2024-goldwater-scholarship


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 50/100: https://www.cmu.edu/news/stories/archives/2024/October/election-anxiety
Processing link 50/100: https://www.cmu.edu/regional-impact/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 52/100: https://www.cmu.edu/leadership/the-provost/provost-initiatives/sustainability.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 53/100: https://www.cmu.edu/strategic-plan/
Processing link 53/100: https://www.cmu.edu/visit/welcome-center/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 55/100: https://www.cmu.edu/index.html#menu


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 56/100: https://www.cmu.edu/admission/visit
Processing link 56/100: https://www.cmu.edu/visit/welcome-center


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 58/100: https://www.cmu.edu/visit/ambassadors.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 59/100: https://www.cmu.edu/global/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 60/100: http://www.cmu.edu/career/employers/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 61/100: http://www.cmu.edu/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 62/100: http://www.cmu.edu/leadership/index.html
Processing link 62/100: http://www.cmu.edu/leadership/board/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 64/100: http://www.cmu.edu/leadership/president/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 65/100: http://www.cmu.edu/leadership/senior-admin/index.html
Processing link 65/100: http://www.cmu.edu/leadership/the-provost/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 67/100: http://www.cmu.edu/leadership/deans/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 68/100: http://www.cmu.edu/leadership/deeper-conversations/index.html
Processing link 68/100: http://www.cmu.edu/leadership/assets/pdf/cmu-leadership-org-chart.pdf


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Out of memory when processing URL: http://www.cmu.edu/leadership/assets/pdf/cmu-leadership-org-chart.pdf
Processing link 70/100: https://www.cmu.edu/diversity/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 71/100: https://www.cmu.edu/regional-impact


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 72/100: http://www.cmu.edu/legal/


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 73/100: http://www.cmu.edu/leadership/board/chair.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 74/100: http://www.cmu.edu/leadership/board/officers-of-the-corporation.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 75/100: http://www.cmu.edu/leadership/board/voting-trustees.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 76/100: http://www.cmu.edu/leadership/board/emeriti-trustees.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 77/100: http://www.cmu.edu/leadership/board/office-of-the-board-of-trustees.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 78/100: http://www.cmu.edu/leadership/president/bio/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 79/100: http://www.cmu.edu/leadership/president/senior-staff/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 80/100: http://www.cmu.edu/leadership/president/campus-comms/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 81/100: http://www.cmu.edu/leadership/president/lecture-series/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 82/100: http://www.cmu.edu/leadership/president/lecture-series/index-old.html
Failed to retrieve http://www.cmu.edu/leadership/president/lecture-series/index-old.html: 404 Client Error: Not Found for url: https://www.cmu.edu/leadership/president/lecture-series/index-old.html
Processing link 82/100: http://www.cmu.edu/leadership/president/multimedia/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 83/100: http://www.cmu.edu/leadership/president/in-the-news/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 84/100: http://www.cmu.edu/leadership/president/past-pres/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 85/100: http://www.cmu.edu/leadership/president/contact/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 86/100: http://www.cmu.edu/leadership/the-provost/bio/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 87/100: http://www.cmu.edu/leadership/the-provost/provost-office/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 88/100: http://www.cmu.edu/leadership/the-provost/academic-leadership/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 89/100: http://www.cmu.edu/leadership/the-provost/provost-priorities/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 90/100: http://www.cmu.edu/leadership/the-provost/campus-comms/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 91/100: http://www.cmu.edu/leadership/the-provost/office-hours/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 92/100: http://www.cmu.edu/leadership/the-provost/open-searches/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 93/100: http://www.cmu.edu/leadership/the-provost/past-provosts/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 94/100: http://www.cmu.edu/leadership/the-provost/contact/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 95/100: http://www.cmu.edu/strategic-plan/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 96/100: http://www.cmu.edu/strategic-plan/#steering


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 97/100: http://www.cmu.edu/strategic-plan/assets/strategic-plan-2025.pdf


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Out of memory when processing URL: http://www.cmu.edu/strategic-plan/assets/strategic-plan-2025.pdf
Processing link 98/100: https://www.cmu.edu/leadership/senior-admin/vp-bios/oreilly-bio.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 99/100: https://www.cmu.edu/leadership/the-provost/academic-leadership/culyba-bio.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Processing link 100/100: https://www.cmu.edu/education-office/meet-the-vp/index.html


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
