In [10]:
pip install beautifulsoup4 requests selenium pandas lxml webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [None]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re

class UniversalScraper:
    """Heuristic single-page scraper driven by a list of element names.

    Each requested name is resolved with a user-supplied CSS selector, one of
    the specialised extractors (ratings, reviews, price, title, description,
    images, links) or a generic class/id lookup.  Results are collected in
    ``self.scraped_data`` next to the page ``url`` and ``domain``.
    """

    # URL schemes skipped when collecting links (compared case-insensitively).
    _SKIPPED_LINK_SCHEMES = ('javascript:', 'mailto:', 'tel:')

    # Names that can be used verbatim in `.name`, `#name` and `[data-name]`.
    _CSS_IDENT_RE = re.compile(r'-?[A-Za-z_][A-Za-z0-9_-]*')

    def __init__(self):
        """Create the HTTP session (with a browser-like User-Agent) and an empty result dict."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })
        self.scraped_data = {}

    def get_user_input(self):
        """Prompt for the scraping parameters on stdin.

        Returns:
            tuple: ``(url, elements, use_selenium, custom_selectors, output_file)``
            where ``elements`` is a list of lower-cased, non-empty, de-duplicated
            names and ``custom_selectors`` maps element name -> CSS selector.
        """
        print("=== Universal Web Scraper ===")
        url = input("Enter the URL to scrape: ").strip()

        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        print("\nWhat elements do you want to scrape? (comma separated)")
        print("Examples: ratings, reviews, price, title, description, images, links")
        raw_elements = input("Your elements: ").strip().lower().split(',')
        # Drop blanks (empty answer, trailing comma): an empty name would later
        # be turned into the invalid selector "." and abort the whole scrape.
        # dict.fromkeys de-duplicates while keeping the order typed by the user.
        elements = list(dict.fromkeys(e.strip() for e in raw_elements if e.strip()))

        print("\nDo you need to handle dynamic content? (JavaScript-rendered pages)")
        use_selenium = input("Use Selenium? (y/n): ").strip().lower() == 'y'

        custom_selectors = {}
        if input("Do you want to specify custom CSS selectors? (y/n): ").strip().lower() == 'y':
            for element in elements:
                selector = input(f"Enter CSS selector for {element} (or press Enter for auto-detection): ").strip()
                if selector:
                    custom_selectors[element] = selector

        output_file = input("Enter output filename (without extension): ").strip() or 'scraped_data'

        return url, elements, use_selenium, custom_selectors, output_file

    def scrape_website(self, url, elements, use_selenium, custom_selectors):
        """Fetch ``url`` and extract the requested elements.

        Args:
            url: Page to scrape.
            elements: Element names to extract.
            use_selenium: Render the page in headless Chrome instead of a plain GET.
            custom_selectors: Mapping of element name -> CSS selector.

        Returns:
            dict: ``self.scraped_data``.  Any exception raised while fetching or
            extracting is not propagated; its message is stored under ``'error'``.
        """
        # Start from a fresh dict so fields (or an 'error') from a previous
        # call cannot leak into this result, and an earlier returned dict is
        # not mutated behind the caller's back.
        self.scraped_data = {'url': url, 'domain': urlparse(url).netloc}

        try:
            if use_selenium:
                self.scrape_with_selenium(url, elements, custom_selectors)
            else:
                response = self.session.get(url, timeout=10)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')
                self.extract_elements(soup, elements, custom_selectors)

        except Exception as e:
            # Deliberate best effort: report the failure in the result.
            self.scraped_data['error'] = str(e)

        return self.scraped_data

    def scrape_with_selenium(self, url, elements, custom_selectors):
        """Load ``url`` in headless Chrome, scroll once, then extract from the rendered DOM.

        The driver is always quit, even when loading or extraction raises.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        # Chrome expects "width,height"; the "1920x1080" form is not parsed.
        chrome_options.add_argument("--window-size=1920,1080")

        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            # Give client-side scripts time to render
            time.sleep(3)

            # Scroll to bottom to trigger lazy-loaded content
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Parse the page source after JavaScript execution
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            self.extract_elements(soup, elements, custom_selectors)

        finally:
            driver.quit()

    def extract_elements(self, soup, elements, custom_selectors):
        """Store one entry in ``self.scraped_data`` per requested element name.

        A custom selector wins; otherwise a specialised extractor is used when
        one exists for the name, and the generic lookup is the last resort.
        ``custom_selectors`` may be ``None``.
        """
        custom_selectors = custom_selectors or {}
        specialised = {
            'ratings': self.extract_ratings,
            'reviews': self.extract_reviews,
            'price': self.extract_price,
            'title': self.extract_title,
            'description': self.extract_description,
            'images': self.extract_images,
            'links': self.extract_links,
        }

        for element in elements:
            if element in custom_selectors:
                # Use user-provided selector
                self.scraped_data[element] = self.extract_with_selector(soup, custom_selectors[element])
            elif element in specialised:
                # Auto-detect based on common patterns
                self.scraped_data[element] = specialised[element](soup)
            else:
                # Generic element extraction
                self.scraped_data[element] = self.extract_generic_element(soup, element)

    @staticmethod
    def _element_value(el, attr):
        """Return the element's ``content`` attribute (when ``attr == 'content'``) or its text.

        Falls back to the stripped text when the ``content`` attribute is
        missing or empty, so ``<span itemprop="price">19.99</span>`` is read
        the same way as ``<meta itemprop="price" content="19.99">``.
        """
        if attr == 'content':
            content = (el.get('content') or '').strip()
            if content:
                return content
        return el.get_text(strip=True)

    # Specialized extraction methods
    def extract_ratings(self, soup):
        """Collect rating-like values from common rating markup.

        Returns:
            list[dict] with ``value``, ``element`` (HTML) and ``selector`` keys,
            or the string ``"No ratings found"``.  Each element is reported
            once, under the first pattern that accepts it.
        """
        ratings = []
        seen = set()  # id() of elements already reported

        # Check common rating patterns
        patterns = [
            {'selector': '[itemprop="ratingValue"]', 'attr': 'content'},
            {'selector': '.rating', 'attr': 'text'},
            {'selector': '.star-rating', 'attr': 'text'},
            {'selector': '[class*="rating"]', 'attr': 'text'},
            {'selector': 'meta[property="og:rating"]', 'attr': 'content'}
        ]

        for pattern in patterns:
            for el in soup.select(pattern['selector']):
                # The patterns overlap (".rating" is also matched by
                # '[class*="rating"]'), so skip elements already taken.
                if id(el) in seen:
                    continue
                value = self._element_value(el, pattern['attr'])

                # Accept plain numbers, percentages, or text mentioning stars
                if value and (re.match(r'^\d+(\.\d+)?$', value) or '%' in value or 'star' in value.lower()):
                    seen.add(id(el))
                    ratings.append({
                        'value': value,
                        'element': str(el),
                        'selector': pattern['selector']
                    })

        return ratings if ratings else "No ratings found"

    def extract_reviews(self, soup):
        """Collect review-like containers and their text/author/rating/date.

        Returns:
            list[dict] with any of the keys ``text``, ``author``, ``rating``,
            ``date``, or the string ``"No reviews found"``.  Each container is
            processed once even when several patterns match it.
        """
        reviews = []
        seen = set()  # id() of containers already processed

        # Check common review patterns
        patterns = [
            {'selector': '[itemprop="review"]', 'text_selector': '[itemprop="reviewBody"]'},
            {'selector': '.review', 'text_selector': '.review-text'},
            {'selector': '.testimonial', 'text_selector': '.testimonial-content'},
            {'selector': '[class*="comment"]', 'text_selector': ''},
            {'selector': '[class*="review"]', 'text_selector': ''}
        ]

        for pattern in patterns:
            for container in soup.select(pattern['selector']):
                if id(container) in seen:
                    continue
                seen.add(id(container))
                review = {}

                # Extract review text; when the dedicated text node is absent
                # fall back to the container's own text (this is what the
                # catch-all patterns would have produced for it).
                text_el = None
                if pattern['text_selector']:
                    text_el = container.select_one(pattern['text_selector'])
                if text_el is None:
                    text_el = container
                text = text_el.get_text(' ', strip=True)
                if text:
                    review['text'] = text

                # Extract author if available
                author_el = container.select_one('[itemprop="author"], .review-author, .author, .user-name')
                if author_el is not None:
                    review['author'] = author_el.get_text(strip=True)

                # Extract rating if available (microdata may carry it in `content`)
                rating_el = container.select_one('[itemprop="ratingValue"], .review-rating, .rating-value')
                if rating_el is not None:
                    review['rating'] = self._element_value(rating_el, 'content')

                # Extract date if available (microdata may carry it in `content`)
                date_el = container.select_one('[itemprop="datePublished"], .review-date, .date')
                if date_el is not None:
                    review['date'] = self._element_value(date_el, 'content')

                if review:
                    reviews.append(review)

        return reviews if reviews else "No reviews found"

    # Other specialized extraction methods
    def extract_price(self, soup):
        """Return the first price-like value found.

        Returns:
            dict with ``value``, ``element`` (HTML) and ``selector`` keys for
            the first candidate containing a digit, or ``"No price found"``.
        """
        patterns = [
            {'selector': '[itemprop="price"]', 'attr': 'content'},
            {'selector': '.price', 'attr': 'text'},
            {'selector': '[class*="price"]', 'attr': 'text'},
            {'selector': 'meta[property="product:price"]', 'attr': 'content'}
        ]

        for pattern in patterns:
            for el in soup.select(pattern['selector']):
                value = self._element_value(el, pattern['attr'])

                if value and any(c.isdigit() for c in value):
                    return {
                        'value': value,
                        'element': str(el),
                        'selector': pattern['selector']
                    }

        return "No price found"

    def extract_title(self, soup):
        """Return the ``og:title`` content, else the ``<title>`` text, else ``"No title found"``."""
        og_title = soup.find('meta', property='og:title')
        if og_title and og_title.get('content'):
            return og_title['content']
        # get_text() also copes with a <title> that has nested markup, where
        # `.string` would be None.
        title = soup.title.get_text(strip=True) if soup.title else ''
        return title if title else "No title found"

    def extract_description(self, soup):
        """Return the meta description, else ``og:description``, else ``"No description found"``."""
        description = soup.find('meta', attrs={'name': 'description'})
        if description and description.get('content'):
            return description['content']
        og_description = soup.find('meta', property='og:description')
        if og_description and og_description.get('content'):
            return og_description['content']
        return "No description found"

    def extract_images(self, soup):
        """Return every image as ``{'src', 'alt', 'width', 'height'}`` with an absolute ``src``.

        ``data-src`` is used when ``src`` is missing.  URLs are resolved against
        ``self.scraped_data['url']`` when it is set.  Returns
        ``"No images found"`` when there are none.
        """
        base_url = self.scraped_data.get('url', '')
        images = []
        for img in soup.find_all('img'):
            src = img.get('src') or img.get('data-src') or ''
            if src:
                images.append({
                    'src': urljoin(base_url, src),
                    'alt': img.get('alt', ''),
                    'width': img.get('width', ''),
                    'height': img.get('height', '')
                })
        return images if images else "No images found"

    def extract_links(self, soup):
        """Return every navigable link as ``{'text', 'href'}`` with an absolute ``href``.

        ``javascript:``, ``mailto:`` and ``tel:`` targets are skipped regardless
        of letter case or leading whitespace.  Returns ``"No links found"``
        when there are none.
        """
        base_url = self.scraped_data.get('url', '')
        links = []
        for a in soup.find_all('a', href=True):
            href = a['href'].strip()
            if href and not href.lower().startswith(self._SKIPPED_LINK_SCHEMES):
                links.append({
                    'text': a.get_text(strip=True),
                    'href': urljoin(base_url, href)
                })
        return links if links else "No links found"

    # Generic extraction methods
    def extract_with_selector(self, soup, selector):
        """Extract text, HTML and attributes of everything matching ``selector``.

        Returns:
            A single dict when exactly one element matches, a list of dicts
            when several match, or a "No elements found ..." string.
        """
        elements = soup.select(selector)
        if not elements:
            return f"No elements found with selector: {selector}"

        results = [
            {
                'text': el.get_text(' ', strip=True),
                'html': str(el),
                'attributes': dict(el.attrs)
            }
            for el in elements
        ]

        return results[0] if len(results) == 1 else results

    def extract_generic_element(self, soup, element_name):
        """Look up ``element_name`` as a class, id or data attribute.

        Tries, in order, ``.name``, ``[class*="name"]``, ``#name``,
        ``[id*="name"]`` and ``[data-name]`` and returns the texts of the first
        selector that matches anything.  Selectors that would be syntactically
        invalid for this name are left out.

        Returns:
            list[str] of element texts, or ``"No <name> elements found"``.
        """
        not_found = f"No {element_name} elements found"
        name = (element_name or '').strip()
        if not name:
            return not_found

        # `.x`, `#x` and `[data-x]` need a valid CSS identifier; a name such as
        # "user reviews" would otherwise raise a selector syntax error (or
        # silently mean something else, like a descendant combinator).
        is_identifier = self._CSS_IDENT_RE.fullmatch(name) is not None
        # Inside a quoted attribute value only backslash and quote need escaping.
        quoted = name.replace('\\', '\\\\').replace('"', '\\"')

        selectors = []
        if is_identifier:
            selectors.append(f'.{name}')
        selectors.append(f'[class*="{quoted}"]')
        if is_identifier:
            selectors.append(f'#{name}')
        selectors.append(f'[id*="{quoted}"]')
        if is_identifier:
            selectors.append(f'[data-{name}]')

        for selector in selectors:
            elements = soup.select(selector)
            if elements:
                return [el.get_text(' ', strip=True) for el in elements]

        return not_found

    def save_to_json(self, data, filename):
        """Write ``data`` as pretty-printed UTF-8 JSON to ``<filename>.json``.

        A ``filename`` that already ends in ``.json`` is used as is instead of
        producing ``name.json.json``.  Values that are not JSON-serialisable
        are written via ``str()``.
        """
        path = filename if filename.lower().endswith('.json') else f'{filename}.json'
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=4, default=str)
        print(f"\nData successfully saved to {path}")

def main():
    """Run one interactive scrape: prompt for parameters, scrape, save to JSON.

    The result is always written to disk; when the scrape failed, the error
    recorded by ``scrape_website`` is printed as well so the failure is not
    hidden behind the "successfully saved" message.
    """
    scraper = UniversalScraper()
    url, elements, use_selenium, custom_selectors, output_file = scraper.get_user_input()
    scraped_data = scraper.scrape_website(url, elements, use_selenium, custom_selectors)
    if 'error' in scraped_data:
        print(f"\nScraping failed: {scraped_data['error']}")
    scraper.save_to_json(scraped_data, output_file)

# Start the interactive scraper when this code runs as the main module.
# The prompts in the cell output below show the guard also passes in this notebook.
if __name__ == "__main__":
    main()

=== Universal Web Scraper ===


Enter the URL to scrape:  https://skytraxratings.com/airlines/british-airways-rating



What elements do you want to scrape? (comma separated)
Examples: ratings, reviews, price, title, description, images, links


Your elements:  rating



Do you need to handle dynamic content? (JavaScript-rendered pages)


Use Selenium? (y/n):  y


In [14]:
import json

# Open and load the JSON file.
# JSON text is UTF-8 by convention; without an explicit encoding open() uses
# the platform default, which can mis-decode or reject non-ASCII characters.
with open('try.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# View the content
print(data)


{'url': 'https://bytexl.app/courses/3zhbr3tk5/dbms-mysql/module/435n4egxv/introduction-of-relational-algebra/topic/435n4mr53/read-now-joins-in-sql', 'domain': 'bytexl.app', 'metadata': {'title': 'ByteXL', 'description': 'ByteXL Application', 'keywords': '', 'og': {}}}
