In [3]:
import sys
!{sys.executable} -m pip install selenium webdriver-manager


Collecting selenium
  Downloading selenium-4.35.0-py3-none-any.whl.metadata (7.4 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting websocket-client~=1.8.0 (from selenium)
  Using cached websocket_client-1.8.0-py3-none-any.whl.metadata (8.0 kB)
Collecting sortedcontainers (from trio~=0.30.0->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting cffi>=1.14 (from trio~=0.30.0->selenium)
  Using cached cffi-1.17.1-cp313-cp313-win_amd64.whl.metadata (1.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto

In [22]:
import sys
!{sys.executable} -m pip install selenium webdriver-manager pandas

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import re

def scrape_hospitals_doctors(location, scroll_times=15, wait_time=3):
    """
    Scrape hospitals and doctors around a location from Google Maps using Selenium.
    Returns a DataFrame with name, rating, address, phone, hours, and other details.
    """
    # --- Configure Chrome ---
    options = Options()
    options.add_argument("--start-maximized")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    # options.add_argument("--headless")  # uncomment for headless mode
   
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
   
    try:
        # --- Open Google Maps with more specific search ---
        search_queries = [
            f"{location} hospitals",
            f"{location} doctors",
            f"{location} clinics",
            f"{location} medical centers"
        ]
        
        all_data = []
        
        for search_query in search_queries:
            print(f"Searching for: {search_query}")
            url = f"https://www.google.com/maps/search/{search_query.replace(' ', '+')}"
            driver.get(url)
            time.sleep(5)
            
            try:
                # Wait for results to load
                wait = WebDriverWait(driver, 10)
                wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '[role="feed"]')))
                
                # Find the scrollable results container
                results_container = driver.find_element(By.CSS_SELECTOR, '[role="feed"]')
                
                # Scroll to load more results
                last_height = driver.execute_script("return arguments[0].scrollHeight", results_container)
                
                for scroll_attempt in range(scroll_times):
                    print(f"Scrolling attempt {scroll_attempt + 1}/{scroll_times}")
                    driver.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", results_container)
                    time.sleep(wait_time)
                    
                    # Check if we've reached the bottom
                    new_height = driver.execute_script("return arguments[0].scrollHeight", results_container)
                    if new_height == last_height:
                        print("Reached bottom of results")
                        break
                    last_height = new_height
                
                # Extract all result cards
                cards = driver.find_elements(By.CSS_SELECTOR, '[role="feed"] > div > div')
                print(f"Found {len(cards)} potential cards")
                
                for i, card in enumerate(cards):
                    try:
                        # Skip sponsored content and ads
                        if card.find_elements(By.CSS_SELECTOR, '[aria-label="Sponsored"]'):
                            continue
                        
                        # Extract name (main heading)
                        name = None
                        name_elements = card.find_elements(By.CSS_SELECTOR, '.qBF1Pd, .fontHeadlineSmall')
                        if name_elements:
                            name = name_elements[0].text.strip()
                        
                        if not name:
                            continue
                            
                        # Extract rating and review count
                        rating = None
                        review_count = None
                        
                        # Method 1: Look for the rating number in span with class MW4etd
                        rating_elements = card.find_elements(By.CSS_SELECTOR, '.MW4etd')
                        if rating_elements:
                            try:
                                rating_text = rating_elements[0].text.strip().replace(',', '.')
                                print(f"DEBUG: Found rating element with text: '{rating_text}'")
                                if re.match(r'^\d+(\.\d+)?$', rating_text):
                                    rating = float(rating_text)
                                    print(f"DEBUG: Extracted rating: {rating}")
                            except Exception as e:
                                print(f"DEBUG: Error extracting rating: {e}")

                        # Extract review count
                        review_count = None
                        review_elements = card.find_elements(By.CSS_SELECTOR, '.UY7F9')
                        if review_elements:
                            try:
                                review_text = review_elements[0].text.strip()
                                review_match = re.search(r'\((\d+(?:,\d+)*)\)', review_text)
                                if review_match:
                                    review_count = int(review_match.group(1).replace(',', ''))
                            except Exception:
                                pass
                        
                        # Method 3: Fallback - look for aria-label with star rating
                        if rating is None:
                            star_elements = card.find_elements(By.CSS_SELECTOR, '[role="img"][aria-label*="star"]')
                            if star_elements:
                                aria_label = star_elements[0].get_attribute('aria-label')
                                if aria_label:
                                    # Extract rating number and review count
                                    rating_match = re.search(r'(\d+\.?\d*)\s*stars?', aria_label)
                                    review_match = re.search(r'(\d+(?:,\d+)*)\s*[Rr]eviews?', aria_label)
                                    
                                    if rating_match:
                                        rating = float(rating_match.group(1))
                                    if review_match and review_count is None:
                                        review_count = int(review_match.group(1).replace(',', ''))
                        
                        # Extract category/type
                        category = None
                        category_elements = card.find_elements(By.CSS_SELECTOR, '.W4Efsd span')
                        for elem in category_elements:
                            text = elem.text.strip()
                            if text and text not in ['·', ''] and not re.match(r'^\d', text):
                                category = text
                                break
                        
                        # Extract address
                        address = None
                        address_elements = card.find_elements(By.CSS_SELECTOR, '.W4Efsd')
                        for addr_elem in address_elements:
                            addr_text = addr_elem.text.strip()
                            if addr_text and '·' in addr_text:
                                # Split by · and look for address-like content
                                parts = addr_text.split('·')
                                for part in parts:
                                    part = part.strip()
                                    if (len(part) > 10 and 
                                        any(char.isdigit() for char in part) and 
                                        not part.startswith('0') and 
                                        'stars' not in part.lower()):
                                        address = part
                                        break
                                if address:
                                    break
                        
                        # Extract phone number
                        phone = None
                        phone_elements = card.find_elements(By.CSS_SELECTOR, '.UsdlK')
                        if phone_elements:
                            phone = phone_elements[0].text.strip()
                        
                        # Extract hours status
                        hours_status = None
                        hours_elements = card.find_elements(By.CSS_SELECTOR, '[style*="color: rgba(25,134,57"], [style*="color: rgba(220,54,46"]')
                        if hours_elements:
                            hours_status = hours_elements[0].text.strip()
                        
                        # Extract website
                        website = None
                        website_elements = card.find_elements(By.CSS_SELECTOR, 'a[data-value="Website"]')
                        if website_elements:
                            website = website_elements[0].get_attribute('href')
                        
                        # Extract review snippet
                        review_snippet = None
                        review_elements = card.find_elements(By.CSS_SELECTOR, '.ah5Ghc span')
                        if review_elements:
                            review_snippet = review_elements[0].text.strip().replace('"', '')
                        
                        # Only add if we have meaningful data
                        if name and (category or address or phone):
                            data_entry = {
                                "name": name,
                                "category": category,
                                "rating": rating,
                                "review_count": review_count,
                                "address": address,
                                "phone": phone,
                                "hours_status": hours_status,
                                "website": website,
                                "review_snippet": review_snippet,
                                "search_query": search_query
                            }
                            all_data.append(data_entry)
                            print(f"Extracted: {name} - {category} - Rating: {rating}")
                    
                    except Exception as e:
                        # Continue with next card if current one fails
                        continue
                
            except TimeoutException:
                print(f"Timeout waiting for results for query: {search_query}")
                continue
        
        # Create DataFrame and remove duplicates
        if all_data:
            df = pd.DataFrame(all_data)
            # Remove duplicates based on name and address
            df = df.drop_duplicates(subset=['name', 'address'], keep='first')
            df = df.reset_index(drop=True)
            print(f"Total unique results found: {len(df)}")
            return df
        else:
            print("No data found")
            return pd.DataFrame()
   
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return pd.DataFrame()
    
    finally:
        driver.quit()

def save_results(df, filename="hospitals_doctors_agadir.csv"):
    """Save results to CSV file"""
    if not df.empty:
        df.to_csv(filename, index=False, encoding='utf-8')
        print(f"Results saved to {filename}")
        
        # Print summary statistics
        print(f"\n=== SUMMARY ===")
        print(f"Total results: {len(df)}")
        print(f"With ratings: {df['rating'].notna().sum()}")
        print(f"With phone numbers: {df['phone'].notna().sum()}")
        print(f"With addresses: {df['address'].notna().sum()}")
        print(f"With websites: {df['website'].notna().sum()}")
        
        if 'category' in df.columns:
            print(f"\nCategories found:")
            print(df['category'].value_counts().head(10))
    else:
        print("No results to save")

# --- Run the scraper ---
if __name__ == "__main__":
    location = "elfara7 agadir"
    print(f"Starting scraping for location: {location}")
    
    # Run the scraper
    df_results = scrape_hospitals_doctors(location, scroll_times=20, wait_time=3)
    
    if not df_results.empty:
        # Display first 10 results
        print(f"\n=== FIRST 10 RESULTS ===")
        print(df_results.head(10).to_string())
        
        # Save to CSV
        save_results(df_results)
        
        # Optional: Filter only hospitals and clinics
        medical_keywords = ['hospital', 'clinic', 'medical', 'doctor', 'health', 'emergency']
        medical_df = df_results[df_results['category'].str.contains('|'.join(medical_keywords), case=False, na=False)]
        
        if not medical_df.empty:
            print(f"\n=== FILTERED MEDICAL FACILITIES ({len(medical_df)} results) ===")
            save_results(medical_df, "medical_facilities_agadir.csv")
    else:
        print("No results found. Try checking your internet connection or adjusting the search parameters.")

Starting scraping for location: elfara7 agadir
Searching for: elfara7 agadir hospitals
Scrolling attempt 1/20
Scrolling attempt 2/20
Reached bottom of results
Found 15 potential cards
DEBUG: Found rating element with text: '3.0'
DEBUG: Extracted rating: 3.0
Extracted: Hôpital Privé d'Agadir - (283) - Rating: 3.0
DEBUG: Found rating element with text: '3.1'
DEBUG: Extracted rating: 3.1
Extracted: AKDITAL AGADIR - Hôpital International d’Agadir - (575) - Rating: 3.1
DEBUG: Found rating element with text: '4.0'
DEBUG: Extracted rating: 4.0
Extracted: Clinique Argana - (122) - Rating: 4.0
DEBUG: Found rating element with text: '2.8'
DEBUG: Extracted rating: 2.8
Extracted: CLINIQUE AL HOUDA - (164) - Rating: 2.8
DEBUG: Found rating element with text: '3.0'
DEBUG: Extracted rating: 3.0
Extracted: Clinique Cheikh Saadi - (146) - Rating: 3.0
DEBUG: Found rating element with text: '2.1'
DEBUG: Extracted rating: 2.1
Extracted: Hôpital Hassan II Agadir - (177) - Rating: 2.1
DEBUG: Found rating el

In [24]:
df_results.columns

Index(['name', 'category', 'rating', 'review_count', 'address', 'phone',
       'hours_status', 'website', 'review_snippet', 'search_query'],
      dtype='object')

=== RATING EXTRACTION METHODS ===

Example: 3.0 stars 283 Reviews
  Method 1 (aria-label): Rating = 3.0, Reviews = 283
  Method 2 (hidden span): Rating = 3.0
  Method 3 (parentheses): Reviews = 283

Example: 4.4 stars 14 Reviews
  Method 1 (aria-label): Rating = 4.4, Reviews = 14
  Method 2 (hidden span): Rating = 4.4
  Method 3 (parentheses): Reviews = 14


Starting scraping for location: elfara7 agadir
Searching for: elfara7 agadir hospitals
Scrolling attempt 1/20
Reached bottom of results
Found 14 potential cards
Extracted: Hôpital Privé d'Agadir - (283)
Extracted: AKDITAL AGADIR - Hôpital International d’Agadir - (575)
Extracted: CLINIQUE AL HOUDA - (164)
Extracted: Clinique Cheikh Saadi - (146)
Extracted: Hôpital Hassan II Agadir - (177)
Extracted: CENTRE INTERNATIONAL D’ONCOLOGIE D’AGADIR - (14)
Extracted: Clinique Internationale - (163)
Extracted: Dr AMGHAR FATIHA Endocrinologue Diabétologue Nutritionniste - Agadir -عيادة أمراض الغدد،السكري ،التغذية- - (83)
Searching for: elfara