In [1]:
import requests
from bs4 import BeautifulSoup
import logging
import time
import random
import re
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Set up logging
logging.basicConfig(level=logging.INFO)

# Google Sheets API setup
def setup_google_sheets(sheet_url, credentials_file):
    scope = ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/drive"]
    credentials = ServiceAccountCredentials.from_json_keyfile_name(credentials_file, scope)
    client = gspread.authorize(credentials)
    sheet = client.open_by_url(sheet_url)
    return sheet.sheet1

def scrape_phone_number(url):
    max_retries = 5
    base_delay = 10  # Initial delay in seconds

    for attempt in range(max_retries):
        try:
            # Validate URL
            if not url.startswith(('http://', 'https://')):
                logging.error(f"Invalid URL skipped: {url}")
                return 'Invalid URL'

            # Request URL
            response = requests.get(url)
            
            if response.status_code == 429:
                # Handle rate limit errors with exponential backoff
                delay = base_delay * (2 ** attempt) + random.uniform(1, 5)
                logging.warning(f"Rate limit hit for URL {url}. Retrying in {delay:.2f} seconds.")
                time.sleep(delay)
                continue

            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract phone numbers
            phone_numbers = set()
            phone_regex = re.compile(r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')
            
            # Search text within the page
            for text in soup.stripped_strings:
                phone_numbers.update(phone_regex.findall(text))
            
            # Search within href attributes
            for a_tag in soup.find_all('a', href=True):
                phone_numbers.update(phone_regex.findall(a_tag['href']))

            logging.info(f"Phone numbers found: {list(phone_numbers)}")

            return ', '.join(phone_numbers) if phone_numbers else 'Not Found'
        
        except requests.RequestException as e:
            logging.error(f"Request failed for URL {url}: {e}")
            return 'Error'
        
        # Delay to avoid hitting rate limits
        time.sleep(random.uniform(5, 10))

    return 'Error after retries'

def update_google_sheet(sheet, phone_numbers, start_row):
    cell_range = f'H{start_row}:H{start_row + len(phone_numbers) - 1}'
    cell_values = [[phone] for phone in phone_numbers]
    sheet.update(cell_range, cell_values, value_input_option='USER_ENTERED')

def batch_process_urls(sheet, batch_size=150):
    urls = sheet.col_values(6)[1:]  # Skip header
    total_urls = len(urls)
    logging.info(f"Total URLs to process: {total_urls}")

    for i in range(0, total_urls, batch_size):
        batch_urls = urls[i:i + batch_size]
        logging.info(f"Processing batch {i // batch_size + 1} with {len(batch_urls)} URLs")

        phone_numbers = []
        for url in batch_urls:
            phone_number = scrape_phone_number(url)
            phone_numbers.append(phone_number)
        
        # Update Google Sheet with phone numbers for this batch
        update_google_sheet(sheet, phone_numbers, start_row=i + 2)
        
        # Delay to avoid hitting rate limits
        time.sleep(random.uniform(10, 20))

if __name__ == "__main__":
    # Google Sheets details
    sheet_url = "https://docs.google.com/spreadsheets/d/1_W7-ZMmRIQFIQVuECr4E7oAg1jsAJA6wDct0i1Ni_Wc/edit?gid=944023105#gid=944023105"
    credentials_file = "C:\\Users\\username\\Downloads\\file_name.json"
    
    # Setup Google Sheets
    sheet = setup_google_sheets(sheet_url, credentials_file)

    # Process URLs in batches
    batch_process_urls(sheet, batch_size=150)
    
    logging.info("Google Sheet updated successfully.")


INFO:root:Total URLs to process: 753
INFO:root:Processing batch 1 with 150 URLs
INFO:root:Phone numbers found: ['(614) 866-7392', '4871104007', '4873717007']
INFO:root:Phone numbers found: ['303-750-0502', '(720) 802-6023', '(303) 750-0502', '(303) 429-9206']
INFO:root:Phone numbers found: ['1648458915', '(256) 929-1420', '5520024476', '2499014957', '1587774523']
INFO:root:Phone numbers found: ['1689246401', '1492321085', '9284867992', '1622475452']
INFO:root:Phone numbers found: ['(903) 438-9174', '(903) 919-3760', '1000639061', '(903) 517-2427']
INFO:root:Phone numbers found: ['954 743 2101', '(754) 205-4839', '(786) 763-1875', '(954) 743-2101']
INFO:root:Phone numbers found: ['(304) 241-5566']
INFO:root:Phone numbers found: ['1256171680']
INFO:root:Phone numbers found: ['(678) 388-1733', '180321-1925']
INFO:root:Phone numbers found: ['317-530-2381', '(317) 530-2381', '1000635890']
INFO:root:Phone numbers found: ['(612) 333-9953', '612-333-9953']
INFO:root:Phone numbers found: ['404 