In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [10]:
pip install requests



In [3]:
pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [64]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Store all extracted content
    scraped_data = {"text": [], "tables": [], "images": [], "links": [], "metadata": {}}

    # Extract text content (headings, paragraphs, list items)
    for tag in soup.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'li', 'span']):
        scraped_data['text'].append(tag.get_text(strip=True))

    # Extract table data
    for table in soup.find_all('table'):
        table_data = []
        for row in table.find_all('tr'):
            cells = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]
            table_data.append(cells)
        scraped_data['tables'].append(table_data)

    # Extract images
    for img in soup.find_all('img'):
        img_url = urljoin(url, img.get('src'))  # Handle relative URLs
        scraped_data['images'].append(img_url)

    # Extract hyperlinks
    for link in soup.find_all('a', href=True):
        full_url = urljoin(url, link['href'])  # Handle relative URLs
        scraped_data['links'].append({"text": link.get_text(strip=True), "url": full_url})

    for svg in soup.find_all('svg'):
        print(svg.prettify())

    # Extract metadata (title, description, keywords)
    scraped_data['metadata']['title'] = soup.title.string if soup.title else ""
    for meta in soup.find_all('meta'):
        if meta.get('name') == 'description':
            scraped_data['metadata']['description'] = meta.get('content', "")
        if meta.get('name') == 'keywords':
            scraped_data['metadata']['keywords'] = meta.get('content', "")

    return scraped_data

# Example usage with the given URLs
urls = [
    "https://www.washington.edu/",
    "https://www.stanford.edu/",
    "https://und.edu/"
]

website_data = {url: scrape_website(url) for url in urls}


<svg enable-background="new 0 0 18.776 51.062" focusable="false" height="51px" version="1.1" viewbox="0 0 18.776 51.062" width="19px" x="0px" xml:space="preserve" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" y="0px">
 <g>
  <path d="M3.537,7.591C3.537,3.405,6.94,0,11.128,0c4.188,0,7.595,3.406,7.595,7.591c0,4.187-3.406,7.593-7.595,7.593C6.94,15.185,3.537,11.778,3.537,7.591z M5.245,7.591c0,3.246,2.643,5.885,5.884,5.885c3.244,0,5.89-2.64,5.89-5.885c0-3.245-2.646-5.882-5.89-5.882C7.883,1.71,5.245,4.348,5.245,7.591z" fill="#FFFFFF">
  </path>
  <rect fill="#FFFFFF" height="7.622" transform="matrix(0.7066 0.7076 -0.7076 0.7066 11.7842 2.0922)" width="1.902" x="2.418" y="11.445">
  </rect>
 </g>
 <path d="M3.501,47.864c0.19,0.194,0.443,0.29,0.694,0.29c0.251,0,0.502-0.096,0.695-0.29l5.691-5.691l5.692,5.691c0.192,0.194,0.443,0.29,0.695,0.29c0.25,0,0.503-0.096,0.694-0.29c0.385-0.382,0.385-1.003,0-1.388l-5.692-5.691l5.692-5.692c0.385-0.385,0.385-1.005,0-1.388c-0.3

In [53]:
print(website_data)

{'https://www.washington.edu/': {'text': ['MyUW', '', 'Calendar', '', 'Directories', '', 'Libraries', '', 'UW Medicine', '', 'Maps', '', 'UW News', '', 'Helpful Links', 'Computing/IT', 'Workday HCM', 'Husky Card', 'UW Bothell', 'UW Tacoma', 'UW Facebook', 'UW Twitter', 'Students', 'Parents', 'Faculty & Staff', 'Alumni', 'AboutAbout the UWDiversityGlobal ImpactInnovationLeadershipMapsPopulation HealthSustainabilityVisit', 'About the UW', 'Diversity', 'Global Impact', 'Innovation', 'Leadership', 'Maps', 'Population Health', 'Sustainability', 'Visit', 'AcademicsAcademic calendarAcademic departmentsColleges and schoolsCourse descriptionsRegistrationStudent guideTime schedule', 'Academic calendar', 'Academic departments', 'Colleges and schools', 'Course descriptions', 'Registration', 'Student guide', 'Time schedule', 'ApplyAdmissionsFinancial AidContinuing educationMajorsStudent housingTransfer studentsTuition and feesUndocumented studentsUW Online', 'Admissions', 'Financial Aid', 'Continui

In [54]:
def prepare_chunks(scraped_data, chunk_size=500):
    combined_content = []

    # Combine all text content
    combined_content.extend(scraped_data['text'])

    # Convert table data into text
    for table in scraped_data['tables']:
        table_text = "\n".join([" | ".join(row) for row in table])
        combined_content.append(table_text)

    # Add metadata
    if 'title' in scraped_data['metadata']:
        combined_content.append(f"Title: {scraped_data['metadata']['title']}")
    if 'description' in scraped_data['metadata']:
        combined_content.append(f"Description: {scraped_data['metadata']['description']}")
    if 'keywords' in scraped_data['metadata']:
        combined_content.append(f"Keywords: {scraped_data['metadata']['keywords']}")

    # Create chunks of fixed size
    text = " ".join(combined_content)
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    return chunks

# Prepare chunks for each website
all_chunks = {}
for url, data in website_data.items():
    all_chunks[url] = prepare_chunks(data)


In [55]:
print(all_chunks)

{'https://www.washington.edu/': ['MyUW  Calendar  Directories  Libraries  UW Medicine  Maps  UW News  Helpful Links Computing/IT Workday HCM Husky Card UW Bothell UW Tacoma UW Facebook UW Twitter Students Parents Faculty & Staff Alumni AboutAbout the UWDiversityGlobal ImpactInnovationLeadershipMapsPopulation HealthSustainabilityVisit About the UW Diversity Global Impact Innovation Leadership Maps Population Health Sustainability Visit AcademicsAcademic calendarAcademic departmentsColleges and schoolsCourse descriptionsRegistrati', 'onStudent guideTime schedule Academic calendar Academic departments Colleges and schools Course descriptions Registration Student guide Time schedule ApplyAdmissionsFinancial AidContinuing educationMajorsStudent housingTransfer studentsTuition and feesUndocumented studentsUW Online Admissions Financial Aid Continuing education Majors Student housing Transfer students Tuition and fees Undocumented students UW Online News & EventsUW NewsFeatured storiesArts UW

In [56]:
drive_folder = '/content/drive/MyDrive/all-images'

In [58]:
from urllib.request import urlretrieve
import os
def download_images(image_urls, output_folder=drive_folder):
    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    for i, img_url in enumerate(image_urls):
        try:
            img_path = os.path.join(output_folder, f"image_{i}.jpg")
            urlretrieve(img_url, img_path)
            print(img_path)
        except Exception as e:
            print(f"Failed to download {img_url}: {e}")

# Download images for each website
for url, data in website_data.items():
    download_images(data['images'])



/content/drive/MyDrive/all-images/image_0.jpg
/content/drive/MyDrive/all-images/image_1.jpg
/content/drive/MyDrive/all-images/image_2.jpg
/content/drive/MyDrive/all-images/image_3.jpg
/content/drive/MyDrive/all-images/image_4.jpg
/content/drive/MyDrive/all-images/image_5.jpg
/content/drive/MyDrive/all-images/image_0.jpg
/content/drive/MyDrive/all-images/image_1.jpg
/content/drive/MyDrive/all-images/image_2.jpg
/content/drive/MyDrive/all-images/image_3.jpg
/content/drive/MyDrive/all-images/image_4.jpg
/content/drive/MyDrive/all-images/image_5.jpg
/content/drive/MyDrive/all-images/image_6.jpg
/content/drive/MyDrive/all-images/image_7.jpg
/content/drive/MyDrive/all-images/image_8.jpg
/content/drive/MyDrive/all-images/image_9.jpg
/content/drive/MyDrive/all-images/image_10.jpg
/content/drive/MyDrive/all-images/image_11.jpg
/content/drive/MyDrive/all-images/image_12.jpg
/content/drive/MyDrive/all-images/image_13.jpg
/content/drive/MyDrive/all-images/image_14.jpg
/content/drive/MyDrive/all-im