In [1]:
from robots import Robots
from sitemap import Sitemap

In [6]:
# Site URL handed to Robots(...) and Sitemap(...) in the cells that follow.
url = "https://internshala.com/"

In [7]:
# Build a Robots object for the site. The recorded output of this cell shows a
# "Saved to MongoDB for <url>" message, so construction presumably fetches and
# persists robots data as a side effect -- TODO confirm in the robots module.
data = Robots(url)

Saved to MongoDB for https://internshala.com/


In [8]:
# Bare expression: displays the sitemap URLs held by the Robots object as the cell output.
data.sitemap_urls

['https://internshala.com/sitemap.xml']

In [7]:
# Create a Sitemap object starting from the same site URL.
# NOTE(review): `crawler` is not used by any later cell visible here, and what
# construction does is defined in the project-local sitemap module -- verify.
crawler = Sitemap(start_url=url)

In [9]:
import requests

In [17]:
# Fetch a sitemap for manual inspection. requests applies no timeout unless one
# is given, so an unresponsive server would block this cell indefinitely;
# bound the wait explicitly (seconds).
req = requests.get("https://amazon.com/sitemap.xml", timeout=10)

# Pattern identifier completed

In [10]:
import time
import re
from typing import List, Dict, Any, Tuple, Optional
from collections import Counter
from urllib.parse import urlparse

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

from bs4 import BeautifulSoup, Tag
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager


class WebsitePatternFinder:
    """
    A powerful tool to dynamically identify patterns on websites,
    particularly focused on detecting card elements and repetitive structures.
    """
    
    def __init__(self, headless: bool = True):
        """Prepare the Chrome options; no browser is launched here.

        Args:
            headless: When true, ``--headless`` is added to the Chrome arguments.
        """
        chrome_args = ["--headless"] if headless else []
        chrome_args += [
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
        ]

        self.options = Options()
        for chrome_arg in chrome_args:
            self.options.add_argument(chrome_arg)

        # Nothing is loaded yet: the driver is created by start_browser() and
        # the page state is filled in by load_page().
        self.driver = None
        self.soup = None
        self.current_url = None
        self.domain = None
        
    def start_browser(self):
        """Start a Chrome session with the configured options.

        If a session is already held in ``self.driver`` it is quit first, so
        calling this method twice does not orphan a running Chrome process.

        Returns:
            bool: True when the driver was created; False when Selenium raised
            a ``WebDriverException`` (the error is printed).
        """
        if getattr(self, "driver", None):
            # Release the previous session before its reference is replaced.
            self.close_browser()

        try:
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=self.options)
            return True
        except WebDriverException as e:
            print(f"Failed to start browser: {e}")
            return False
            
    def close_browser(self):
        """Close the browser if it's open."""
        if self.driver:
            self.driver.quit()
            self.driver = None
            
    def load_page(self, url: str, wait_time: int = 10) -> bool:
        """
        Load a webpage, scroll through it, and parse the rendered HTML.

        The page state (``soup``, ``current_url``, ``domain``) is cleared on
        entry and only set again once the whole load has succeeded. A failed
        load therefore leaves "no page loaded" rather than the previous page's
        soup paired with the new URL.

        Args:
            url: The URL to load
            wait_time: Maximum time in seconds to wait for the <body> element

        Returns:
            bool: True if page loaded successfully, False otherwise
        """
        # Forget the previous page first so the analysis methods cannot
        # silently run against stale content after a failed load.
        self.soup = None
        self.current_url = None
        self.domain = None

        if not self.driver:
            if not self.start_browser():
                return False

        try:
            self.driver.get(url)

            # Wait for the page to load
            WebDriverWait(self.driver, wait_time).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Scroll down in five equal steps to trigger lazy loading. The
            # height is measured once, before scrolling starts.
            height = self.driver.execute_script("return document.body.scrollHeight") or 0
            for i in range(1, 6):
                self.driver.execute_script(f"window.scrollTo(0, {height * i / 5});")
                time.sleep(0.5)

            # Scroll back to top
            self.driver.execute_script("window.scrollTo(0, 0);")

            # Parse the rendered DOM (after scrolling) with BeautifulSoup
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')

        except TimeoutException:
            print(f"Timeout while loading {url}")
            return False
        except WebDriverException as e:
            print(f"Error loading {url}: {e}")
            return False

        # Publish the new page state only after everything succeeded.
        self.current_url = url
        self.domain = urlparse(url).netloc
        self.soup = soup
        return True
            
    def find_card_patterns(self, min_occurrences: int = 3) -> List[Dict[str, Any]]:
        """
        Identify card-like patterns on the webpage.
        
        Args:
            min_occurrences: Minimum number of similar elements to consider as a pattern
            
        Returns:
            List of identified card patterns with their details
        """
        if not self.soup:
            print("No page loaded. Please load a page first.")
            return []
            
        # Strategy 1: Look for common container elements that appear multiple times
        candidates = []
        
        for tag_name in ['div', 'article', 'section', 'li']:
            elements = self.soup.find_all(tag_name)
            
            # Group elements by their class
            class_groups = {}
            for element in elements:
                if element.get('class'):
                    class_key = ' '.join(sorted(element.get('class')))
                    if class_key not in class_groups:
                        class_groups[class_key] = []
                    class_groups[class_key].append(element)
            
            # Add promising candidates
            for class_key, elements in class_groups.items():
                if len(elements) >= min_occurrences:
                    # Check if these elements have reasonable content
                    avg_depth = sum(len(str(el).split('<')) for el in elements) / len(elements)
                    avg_text_len = sum(len(el.get_text(strip=True)) for el in elements) / len(elements)
                    
                    if avg_depth > 5 and 20 < avg_text_len < 1000:
                        candidates.append({
                            'tag': tag_name,
                            'class': class_key,
                            'elements': elements,
                            'count': len(elements),
                            'avg_depth': avg_depth,
                            'avg_text_len': avg_text_len,
                            'score': avg_depth * min(len(elements), 10)
                        })
        
        # Sort candidates by score
        candidates.sort(key=lambda x: x['score'], reverse=True)
        
        # Process top candidates to extract patterns
        patterns = []
        
        for candidate in candidates[:5]:  # Process top 5 candidates
            pattern = self._analyze_card_pattern(candidate['elements'])
            if pattern:
                patterns.append({
                    'selector': f"{candidate['tag']}.{candidate['class'].replace(' ', '.')}",
                    'count': candidate['count'],
                    'pattern': pattern,
                    'sample_content': self._extract_sample_content(candidate['elements'][0], pattern),
                    'elements': candidate['elements'][:3]  # Include first 3 elements as examples
                })
        
        return patterns
    
    def _analyze_card_pattern(self, elements: List[Tag]) -> Dict[str, Any]:
        """
        Analyze a set of similar elements to identify their common structure.
        
        Args:
            elements: List of similar BeautifulSoup elements
            
        Returns:
            Dictionary describing the common pattern
        """
        # Collect common child tags
        child_tags = []
        child_classes = []
        field_candidates = {}
        
        for element in elements[:5]:  # Analyze first 5 elements
            # Get direct children
            children = element.find_all(recursive=False)
            child_tags.extend([child.name for child in children])
            
            # Get all descendants one level deeper with classes
            descendants = element.find_all()
            for desc in descendants:
                if desc.get('class'):
                    class_str = ' '.join(sorted(desc.get('class')))
                    child_classes.append(f"{desc.name}.{class_str}")
                    
                # Check for potential card fields
                if desc.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'img', 'a']:
                    text = desc.get_text(strip=True)
                    if desc.name not in field_candidates:
                        field_candidates[desc.name] = []
                    
                    # For images, get src attribute
                    if desc.name == 'img' and desc.get('src'):
                        field_candidates[desc.name].append({
                            'tag': desc.name,
                            'content': desc.get('src'),
                            'attrs': {k: v for k, v in desc.attrs.items() if k in ['class', 'alt']}
                        })
                    # For links, get href
                    elif desc.name == 'a' and desc.get('href'):
                        field_candidates[desc.name].append({
                            'tag': desc.name,
                            'content': text,
                            'href': desc.get('href'),
                            'attrs': {k: v for k, v in desc.attrs.items() if k in ['class']}
                        })
                    # For text elements
                    elif text and len(text) < 500:
                        field_candidates[desc.name].append({
                            'tag': desc.name,
                            'content': text,
                            'attrs': {k: v for k, v in desc.attrs.items() if k in ['class']}
                        })
        
        # Count occurrences
        tag_counts = Counter(child_tags)
        class_counts = Counter(child_classes)
        
        # Identify common structure
        common_tags = [tag for tag, count in tag_counts.items() if count >= len(elements) / 2]
        common_classes = [cls for cls, count in class_counts.items() if count >= len(elements) / 2]
        
        # Analyze field candidates to identify common fields
        fields = {}
        for tag, candidates in field_candidates.items():
            if len(candidates) >= len(elements) / 2:
                if tag == 'img':
                    fields['image'] = {'tag': tag, 'frequency': len(candidates) / len(elements)}
                elif tag == 'a':
                    fields['link'] = {'tag': tag, 'frequency': len(candidates) / len(elements)}
                elif tag.startswith('h'):
                    fields['title'] = {'tag': tag, 'frequency': len(candidates) / len(elements)}
                elif tag == 'p':
                    fields['description'] = {'tag': tag, 'frequency': len(candidates) / len(elements)}
                elif tag == 'span':
                    # Try to determine if it's price, date, or another attribute
                    contents = [c['content'] for c in candidates]
                    if any(re.search(r'\$|\€|\£|\¥', c) for c in contents):
                        fields['price'] = {'tag': tag, 'frequency': len(candidates) / len(elements)}
                    elif any(re.search(r'\d{1,2}[\/\-\.]\d{1,2}[\/\-\.]\d{2,4}', c) for c in contents):
                        fields['date'] = {'tag': tag, 'frequency': len(candidates) / len(elements)}
                    else:
                        fields['attribute'] = {'tag': tag, 'frequency': len(candidates) / len(elements)}
        
        # Check if this looks like a credit card or payment method section
        cc_indicators = ['credit', 'card', 'visa', 'mastercard', 'payment', 'cvv', 'expiry']
        cc_score = 0
        for element in elements:
            text = element.get_text(strip=True).lower()
            for indicator in cc_indicators:
                if indicator in text:
                    cc_score += 1
                    
        is_cc = cc_score >= len(elements) / 3
        
        # Check for price patterns
        price_pattern = False
        for element in elements:
            text = element.get_text(strip=True)
            if re.search(r'\$\d+\.?\d*|\d+\.?\d*\$|\€\d+\.?\d*|\d+\.?\d*\€', text):
                price_pattern = True
                break
        
        return {
            'common_tags': common_tags,
            'common_classes': common_classes[:5],  # Limit to top 5 for clarity
            'fields': fields,
            'is_card': len(fields) >= 2,  # Consider it a card if it has at least 2 identifiable fields
            'is_credit_card': is_cc,
            'has_price': price_pattern
        }
    
    def _extract_sample_content(self, element: Tag, pattern: Dict[str, Any]) -> Dict[str, str]:
        """Extract sample content from an element based on identified pattern."""
        sample = {}
        
        # Extract title
        if 'title' in pattern['fields']:
            title_tag = pattern['fields']['title']['tag']
            title_elem = element.find(title_tag)
            if title_elem:
                sample['title'] = title_elem.get_text(strip=True)
        
        # Extract image
        if 'image' in pattern['fields']:
            img_elem = element.find('img')
            if img_elem and img_elem.get('src'):
                sample['image'] = img_elem.get('src')
                if img_elem.get('alt'):
                    sample['image_alt'] = img_elem.get('alt')
        
        # Extract link
        if 'link' in pattern['fields']:
            link_elem = element.find('a')
            if link_elem and link_elem.get('href'):
                sample['link'] = link_elem.get('href')
                
        # Extract description
        if 'description' in pattern['fields']:
            desc_elem = element.find('p')
            if desc_elem:
                sample['description'] = desc_elem.get_text(strip=True)
                
        # Extract price
        if 'price' in pattern['fields'] or pattern['has_price']:
            # Try different approaches to find price
            price_text = None
            
            # Look for spans with currency symbols
            price_elem = element.find(string=re.compile(r'\$|\€|\£|\¥'))
            if price_elem:
                price_parent = price_elem.parent
                price_text = price_parent.get_text(strip=True)
            
            # If not found, look for elements with price-related classes
            if not price_text:
                for price_class in ['price', 'cost', 'amount']:
                    price_elem = element.find(class_=re.compile(price_class, re.I))
                    if price_elem:
                        price_text = price_elem.get_text(strip=True)
                        break
            
            if price_text:
                sample['price'] = price_text
        
        return sample
    
    def find_repetitive_elements(self, min_occurrences: int = 5) -> List[Dict[str, Any]]:
        """
        Find repetitive elements on the page beyond card-like patterns.
        
        Args:
            min_occurrences: Minimum number of occurrences to consider as repetitive
            
        Returns:
            List of repetitive element patterns
        """
        if not self.soup:
            print("No page loaded. Please load a page first.")
            return []
        
        patterns = []
        
        # Look for repetitive tag-class combinations
        elements_by_signature = {}
        
        for element in self.soup.find_all(True):  # All elements with tags
            # Skip very common elements that are rarely interesting
            if element.name in ['span', 'br', 'script', 'meta', 'link']:
                continue
                
            # Create a signature for this element type
            classes = element.get('class', [])
            class_str = ' '.join(sorted(classes)) if classes else ''
            signature = f"{element.name}:{class_str}"
            
            if signature not in elements_by_signature:
                elements_by_signature[signature] = []
            elements_by_signature[signature].append(element)
        
        # Analyze repetitive elements
        for signature, elements in elements_by_signature.items():
            if len(elements) >= min_occurrences:
                # Skip elements that are likely part of the page structure
                if elements[0].name in ['div', 'span'] and not elements[0].get('class'):
                    continue
                
                # Check content diversity - repetitive elements should have somewhat different content
                texts = [el.get_text(strip=True) for el in elements[:10]]
                unique_texts = set(texts)
                
                # Require at least 50% unique content in the samples
                if len(unique_texts) >= len(texts) * 0.5 or elements[0].name == 'img':
                    tag, class_str = signature.split(':')
                    
                    # Detect element type and role
                    element_type = self._detect_element_type(elements)
                    
                    patterns.append({
                        'selector': f"{tag}{('.' + class_str.replace(' ', '.')) if class_str else ''}",
                        'count': len(elements),
                        'type': element_type,
                        'samples': [self._extract_element_data(el) for el in elements[:3]]
                    })
        
        # Sort by count (most repetitive first)
        patterns.sort(key=lambda x: x['count'], reverse=True)
        
        return patterns[:10]  # Return top 10 patterns
    
    def _detect_element_type(self, elements: List[Tag]) -> str:
        """Detect the type of repetitive element based on its characteristics."""
        sample = elements[0]
        
        if sample.name == 'img':
            return 'image'
        elif sample.name == 'a':
            return 'link'
        elif sample.name in ['ul', 'ol']:
            return 'list'
        elif sample.name in ['table', 'tr']:
            return 'table'
        elif sample.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            return 'heading'
        elif sample.name == 'button':
            return 'button'
        elif sample.name == 'input':
            return f"input-{sample.get('type', 'text')}"
        elif sample.name == 'form':
            return 'form'
        elif sample.find('img') and sample.find('a'):
            return 'media-link'
        elif len(elements) > 10 and all(len(el.get_text(strip=True)) < 100 for el in elements[:10]):
            return 'list-item'
        else:
            return 'container'
    
    def _extract_element_data(self, element: Tag) -> Dict[str, str]:
        """Extract relevant data from an element based on its type."""
        data = {
            'text': element.get_text(strip=True)[:100] if len(element.get_text(strip=True)) > 0 else '[No text]'
        }
        
        # Add relevant attributes based on tag type
        if element.name == 'img':
            data['src'] = element.get('src', '')
            data['alt'] = element.get('alt', '')
        elif element.name == 'a':
            data['href'] = element.get('href', '')
        elif element.name == 'input':
            data['type'] = element.get('type', 'text')
            data['name'] = element.get('name', '')
        
        # Add classes if present
        if element.get('class'):
            data['classes'] = ' '.join(element.get('class'))
            
        return data
    
    def detect_form_fields(self) -> List[Dict[str, Any]]:
        """
        Detect form fields on the page, particularly focused on payment forms.
        
        Returns:
            List of form fields with their details
        """
        if not self.soup:
            print("No page loaded. Please load a page first.")
            return []
            
        forms = self.soup.find_all('form')
        
        form_data = []
        for i, form in enumerate(forms):
            fields = []
            
            # Find all input elements
            inputs = form.find_all(['input', 'select', 'textarea'])
            
            for inp in inputs:
                field_type = inp.name
                if inp.name == 'input':
                    field_type = inp.get('type', 'text')
                
                field = {
                    'tag': inp.name,
                    'type': field_type,
                    'name': inp.get('name', ''),
                    'id': inp.get('id', ''),
                    'placeholder': inp.get('placeholder', ''),
                    'required': 'required' in inp.attrs or inp.get('required') == 'required',
                    'classes': ' '.join(inp.get('class', [])),
                }
                
                # Check if this might be a card-related field
                is_card_field = False
                card_field_type = None
                
                # Check attributes for card-related keywords
                for attr in ['name', 'id', 'placeholder']:
                    value = field.get(attr, '').lower()
                    if not value:
                        continue
                        
                    # Check for card number indicators
                    if any(term in value for term in ['card', 'credit', 'cc']):
                        is_card_field = True
                        if any(term in value for term in ['number', 'num', 'no']):
                            card_field_type = 'card_number'
                            
                    # Check for CVV/CVC
                    elif any(term in value for term in ['cvv', 'cvc', 'security', 'verification']):
                        is_card_field = True
                        card_field_type = 'cvv'
                        
                    # Check for expiry date
                    elif any(term in value for term in ['exp', 'expiry', 'expiration']):
                        is_card_field = True
                        card_field_type = 'expiry'
                        
                    # Check for cardholder name
                    elif any(term in value for term in ['holder', 'name']) and any(term in value for term in ['card', 'credit', 'cc']):
                        is_card_field = True
                        card_field_type = 'cardholder_name'
                
                if is_card_field:
                    field['is_card_field'] = True
                    field['card_field_type'] = card_field_type
                
                fields.append(field)
            
            # Find labels associated with fields
            labels = form.find_all('label')
            for label in labels:
                if label.get('for'):
                    for field in fields:
                        if field['id'] == label.get('for'):
                            field['label'] = label.get_text(strip=True)
            
            # Detect form purpose
            form_text = form.get_text(strip=True).lower()
            form_purpose = 'general'
            
            if any(term in form_text for term in ['shipping', 'delivery', 'address']):
                form_purpose = 'shipping'
            elif any(term in form_text for term in ['payment', 'credit card', 'debit card', 'billing']):
                form_purpose = 'payment'
            elif any(term in form_text for term in ['login', 'sign in', 'signin']):
                form_purpose = 'login'
            elif any(term in form_text for term in ['register', 'sign up', 'signup', 'create account']):
                form_purpose = 'registration'
            elif any(term in form_text for term in ['search']):
                form_purpose = 'search'
            elif any(term in form_text for term in ['contact', 'message', 'feedback']):
                form_purpose = 'contact'
                
            # Check if this is a payment form
            payment_indicators = sum(1 for f in fields if f.get('is_card_field', False))
            is_payment_form = payment_indicators >= 2 or form_purpose == 'payment'
            
            form_data.append({
                'id': form.get('id', f'form_{i}'),
                'action': form.get('action', ''),
                'method': form.get('method', 'get'),
                'fields_count': len(fields),
                'fields': fields,
                'purpose': form_purpose,
                'is_payment_form': is_payment_form
            })
        
        return form_data
    
    def export_patterns_to_csv(self, file_path: str) -> bool:
        """
        Export detected patterns to a CSV file.
        
        Args:
            file_path: Path to save the CSV file
            
        Returns:
            bool: True if export was successful, False otherwise
        """
        if not self.soup:
            print("No patterns detected. Load a page and detect patterns first.")
            return False
            
        try:
            # Get patterns
            card_patterns = self.find_card_patterns()
            repetitive_elements = self.find_repetitive_elements()
            form_fields = self.detect_form_fields()
            
            # Prepare data for cards
            card_data = []
            for pattern in card_patterns:
                row = {
                    'pattern_type': 'card',
                    'selector': pattern['selector'],
                    'count': pattern['count'],
                    'is_credit_card': pattern['pattern'].get('is_credit_card', False),
                    'has_price': pattern['pattern'].get('has_price', False),
                    'fields': ','.join(pattern['pattern'].get('fields', {}).keys())
                }
                card_data.append(row)
                
            # Prepare data for repetitive elements
            element_data = []
            for element in repetitive_elements:
                row = {
                    'pattern_type': 'repetitive',
                    'selector': element['selector'],
                    'count': element['count'],
                    'element_type': element['type']
                }
                element_data.append(row)
                
            # Prepare data for forms
            form_data = []
            for form in form_fields:
                row = {
                    'pattern_type': 'form',
                    'form_id': form['id'],
                    'purpose': form['purpose'],
                    'is_payment_form': form['is_payment_form'],
                    'fields_count': form['fields_count']
                }
                form_data.append(row)
                
            # Combine all data
            all_data = card_data + element_data + form_data
            
            # Create DataFrame and export
            df = pd.DataFrame(all_data)
            df['url'] = self.current_url
            df['domain'] = self.domain
            
            df.to_csv(file_path, index=False)
            return True
            
        except Exception as e:
            print(f"Error exporting patterns: {e}")
            return False
    
    def generate_pattern_report(self) -> Dict[str, Any]:
        """
        Generate a comprehensive report of all patterns found on the page.
        
        Returns:
            Dictionary containing all pattern information
        """
        if not self.soup:
            print("No page loaded. Please load a page first.")
            return {}
            
        # Get patterns
        card_patterns = self.find_card_patterns()
        repetitive_elements = self.find_repetitive_elements()
        form_fields = self.detect_form_fields()
        
        # Count observed occurrences of different element types
        element_counts = {}
        for tag in self.soup.find_all(True):
            if tag.name not in element_counts:
                element_counts[tag.name] = 0
            element_counts[tag.name] += 1
        
        # Sort by frequency
        sorted_elements = sorted(element_counts.items(), key=lambda x: x[1], reverse=True)
        
        return {
            'url': self.current_url,
            'domain': self.domain,
            'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
            'cards': card_patterns,
            'repetitive_elements': repetitive_elements,
            'forms': form_fields,
            'element_frequency': dict(sorted_elements[:20])  # Top 20 elements by frequency
        }


# Example usage
if __name__ == "__main__":
    # Initialize pattern finder
    finder = WebsitePatternFinder(headless=True)

    # NOTE: this rebinds the notebook-level `url` set in an earlier cell.
    url = "https://internshala.com/internships/work-from-home-internships/"

    # try/finally guarantees the Chrome process is shut down even when one of
    # the analysis steps raises.
    try:
        if finder.load_page(url):
            # Find patterns
            print("Finding card patterns...")
            cards = finder.find_card_patterns()
            print(f"Found {len(cards)} card patterns.")

            print("\nFinding repetitive elements...")
            elements = finder.find_repetitive_elements()
            print(f"Found {len(elements)} repetitive element patterns.")

            print("\nDetecting form fields...")
            forms = finder.detect_form_fields()
            print(f"Found {len(forms)} forms.")

            # Generate report
            report = finder.generate_pattern_report()
            print("\nReport generated successfully.")

            # Export to CSV; only claim success when the export reports it
            # (export_patterns_to_csv prints the error itself on failure).
            if finder.export_patterns_to_csv("patterns.csv"):
                print("Patterns exported to patterns.csv")
    finally:
        # Close browser
        finder.close_browser()

Finding card patterns...
Found 5 card patterns.

Finding repetitive elements...
Found 10 repetitive element patterns.

Detecting form fields...
Found 5 forms.

Report generated successfully.
Patterns exported to patterns.csv
