In [1]:
import requests
from bs4 import BeautifulSoup

def scrape_website(url, timeout=10):
    """Download a web page and extract its title, text, links and image sources.

    Args:
        url: Address of the page to fetch with an HTTP GET request.
        timeout: Seconds to wait for the server, forwarded unchanged to
            ``requests.get`` (a number or a ``(connect, read)`` tuple).
            Without a timeout, requests can wait forever on a host that
            never answers, which would hang the notebook kernel.

    Returns:
        A dict with the keys ``'title'``, ``'text_content'``, ``'links'`` and
        ``'images'``. ``'links'`` holds the ``href`` of every ``<a>`` tag that
        has one and ``'images'`` the ``src`` of every ``<img>`` tag that has
        one. Returns ``None`` (after printing a message) when the request
        raises a ``RequestException`` or the status code is not 200.
    """
    # Only the network call can raise RequestException, so keep the try narrow.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while making the request: {e}")
        return None

    # Anything other than a plain 200 is treated as a failed fetch.
    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

    # Parse the raw bytes so BeautifulSoup can detect the document encoding.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Pages without a <title> element get a placeholder instead of an error.
    title = soup.title.text if soup.title else "No title found"

    # All visible text, with one newline between adjacent text fragments.
    text_content = soup.get_text(separator='\n')

    # href=True / src=True match only tags that carry the attribute, so the
    # result lists never contain None for bare <a> or <img> tags.
    links = [anchor['href'] for anchor in soup.find_all('a', href=True)]
    images = [img['src'] for img in soup.find_all('img', src=True)]

    return {
        'title': title,
        'text_content': text_content,
        'links': links,
        'images': images,
    }

In [2]:
# Page to scrape: the "C Transformers" provider page of the LangChain docs.
url = 'https://python.langchain.com/docs/integrations/providers/ctransformers/'
# scrape_website returns a dict (title, text_content, links, images),
# or None if the request failed or the status code was not 200.
scraped_info = scrape_website(url)
# Prints the entire result, including the full page text, so the output is long.
print(scraped_info)

{'title': 'C Transformers | 🦜️🔗 LangChain', 'text_content': "\n\n\n\n\n\n\n\n\n\nC Transformers | 🦜️🔗 LangChain\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nSkip to main content\nComponents\nIntegrations\nGuides\nAPI Reference\nMore\nPeople\nVersioning\nContributing\nTemplates\nCookbooks\nTutorials\nYouTube\n🦜️🔗\nLangSmith\nLangSmith Docs\nLangServe GitHub\nTemplates GitHub\nTemplates Hub\nLangChain Hub\nJS/TS Docs\n💬\nSearch\nProviders\nProviders\nAnthropic\nAWS\nGoogle\nHugging Face\nMicrosoft\nOpenAI\nMore\nAcreom\nActiveloop Deep Lake\nAI21 Labs\nAim\nAINetwork\nAirbyte\nAirtable\nAlchemy\nAleph Alpha\nAlibaba Cloud\nAnalyticDB\nAnnoy\nAnyscale\nApache Doris\nApify\nArangoDB\nArcee\nArcGIS\nArgilla\nArthur\nArxiv\nAssemblyAI\nAstra DB\nAtlas\nAwaDB\nAZLyrics\nBagelDB\nBaichuan\nBaidu\nBanana\nBaseten\nBeam\nBeautiful Soup\nBibTeX\nBiliBili\nBittensor\nBlackboard\nBrave Search\nBreebs (Open Knowledge)\nBrowserless\nByteDance\nCassandra\nCerebriumAI\nChaindesk\nChroma\nClarifai\nClearML\nClickHo