# WEEK 2: Data Scraping Automation

## What is Data Scraping Automation
Data scraping automation is the process of using software to extract information from websites by replicating human-like actions such as clicking, scrolling, typing, and navigating pages.

## Core Behaviour
- Click
- Scroll
- Input
- Hover
- Delay
- Other

## Google Maps Data Scraping A
https://www.google.com/maps

Task: Retrieve data for places that serve ramen in Malang City.

Output: Title, address, and URLs

In [None]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# Scrape Google Maps search results for ramen places near Malang and save
# name, address and place URL to ramen_malang.csv.
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=options)

url = "https://www.google.com/maps/search/ramen+dekat+malang/"

result = []
no_new_counter = 0
seen_links = set()

# try/finally guarantees the browser is closed even when a wait times out or
# an element lookup fails part-way through the scrape.
try:
    driver.get(url)

    # Block until at least one result card has been rendered.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="article"]'))
    )

    time.sleep(3.2)

    # Keep scrolling until 20 places are collected or four consecutive
    # passes produce no unseen place.
    while len(result) < 20 and no_new_counter < 4:
        items = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"]')
        new_found = 0

        for item in items:
            # find_elements returns [] instead of raising, so a card that
            # lacks the title or link element is skipped rather than
            # aborting the whole run.
            title_els = item.find_elements(By.CSS_SELECTOR, 'div.qBF1Pd')
            link_els = item.find_elements(By.CSS_SELECTOR, 'a.hfpxzc')
            if not title_els or not link_els:
                continue

            nama = title_els[0].text
            link = link_els[0].get_attribute('href')

            # The place URL is the de-duplication key across scroll passes.
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            # Heuristic: the first span whose text contains an Indonesian
            # address keyword is taken as the address.
            alamat = "Tidak Ditemukan"
            els = item.find_elements(By.CSS_SELECTOR, 'span:last-child')

            for el in els:
                txt = el.text.strip()
                if txt and any(kata in txt.lower() for kata in ['jl.', 'jln.' , 'jalan', 'no.', 'rt', 'rw', 'kota', 'kabupaten', 'provinsi']):
                    alamat = txt
                    break

            result.append({'nama': nama, 'alamat': alamat, 'link': link})
            new_found += 1

        if len(result) >= 20:
            break

        # Scroll the results feed to its bottom to trigger lazy loading.
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="feed"]'))
        )
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', container)
        time.sleep(1.4)

        ActionChains(driver).send_keys(Keys.END).perform()
        time.sleep(0.6)

        if new_found == 0:
            no_new_counter += 1
        else:
            no_new_counter = 0
finally:
    driver.quit()

with open('ramen_malang.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['nama', 'alamat', 'link'])
    writer.writeheader()
    writer.writerows(result)

## Google Maps Data Scraping B
https://www.google.com/maps

Task: Retrieve data for places that serve nasi ayam within 1 km of Alun-Alun Kota Malang.

Output: Title, address, coordinates, and URLs

In [None]:
import time
import csv
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import math

# Reference point (latitude, longitude) used both as the map centre in the
# search URL and as the origin for the 1 km distance filter below.
# NOTE(review): presumably Alun-Alun Kota Malang, per the task text - confirm.
center_lat = -7.9826145
center_lng = 112.6308113

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The haversine distance in kilometres, using an Earth radius of
        6371 km.
    """
    R = 6371
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # The latitudes are already in radians here; converting them a second
    # time would evaluate cos() on the wrong angle and distort the result.
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Clamp against floating-point overshoot so asin() never sees > 1.
    c = 2 * math.asin(min(1.0, math.sqrt(a)))
    return R * c

def extract_coordinates(url):
    """Pull the latitude/longitude pair out of a Google Maps place URL.

    The URL is searched for a ``!3d<lat>!4d<lng>`` segment whose two values
    are decimal numbers.

    Returns:
        A ``(lat, lng)`` tuple of floats, or ``(None, None)`` when the URL
        contains no such segment.
    """
    found = re.search(r"!3d([-+]?\d*\.\d+)!4d([-+]?\d*\.\d+)", url)
    if found is None:
        return None, None
    latitude, longitude = (float(part) for part in found.groups())
    return latitude, longitude

# Scrape Google Maps search results for nasi ayam places, keep those within
# 1 km of (center_lat, center_lng) and save them to
# nasi_ayam_malang_dekat_alun_alun.csv.
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=options)

url = f"https://www.google.com/maps/search/nasi+ayam+di+dekat+alun+alun+malang/@{center_lat},{center_lng},17z/data=!3m1!4b1?entry=ttu&g_ep=EgoyMDI1MTIwMS4wIKXMDSoASAFQAw%3D%3D"

result = []
final_result = []

no_new_counter = 0
seen_links = set()

# try/finally guarantees the browser is closed even when a wait times out or
# an element lookup fails part-way through the scrape.
try:
    driver.get(url)

    # Block until at least one result card has been rendered.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="article"]'))
    )
    time.sleep(3.2)

    # Keep scrolling until 20 places are collected or four consecutive
    # passes produce no unseen place.
    while len(result) < 20 and no_new_counter < 4:
        items = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"]')
        new_found = 0

        for item in items:
            # find_elements returns [] instead of raising, so a card that
            # lacks the title or link element is skipped rather than
            # aborting the whole run.
            title_els = item.find_elements(By.CSS_SELECTOR, 'div.qBF1Pd')
            link_els = item.find_elements(By.CSS_SELECTOR, 'a.hfpxzc')
            if not title_els or not link_els:
                continue

            nama = title_els[0].text
            link = link_els[0].get_attribute('href')

            # The place URL is the de-duplication key across scroll passes.
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            # Heuristic: the first span whose text contains an Indonesian
            # address keyword is taken as the address.
            alamat = "Tidak Ditemukan"
            els = item.find_elements(By.CSS_SELECTOR, 'span:last-child')

            for el in els:
                txt = el.text.strip()
                if txt and any(kata in txt.lower() for kata in ['jl.', 'jln.' , 'jalan', 'no.', 'rt', 'rw', 'kota', 'kabupaten', 'provinsi']):
                    alamat = txt
                    break

            result.append({'nama': nama, 'alamat': alamat, 'link': link})
            new_found += 1

        if len(result) >= 20:
            break

        # Scroll the results feed to its bottom to trigger lazy loading.
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="feed"]'))
        )
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', container)
        time.sleep(1.4)

        ActionChains(driver).send_keys(Keys.END).perform()
        time.sleep(0.6)

        if new_found == 0:
            no_new_counter += 1
        else:
            no_new_counter = 0
finally:
    driver.quit()

# Filter once, after scraping has finished. Running this inside the scroll
# loop would append the same place again on every pass and would skip the
# final batch whenever the loop exits through `break`.
for tempat in result:
    lat, lng = extract_coordinates(tempat['link'])
    if lat is None or lng is None:
        # No coordinates in the URL, so the distance cannot be computed.
        continue
    jarak = haversine(center_lat, center_lng, lat, lng)
    if jarak <= 1.0:
        final_result.append({
            'nama': tempat['nama'],
            'alamat': tempat['alamat'],
            'link': tempat['link'],
            'jarak_km': round(jarak, 2)
        })

with open('nasi_ayam_malang_dekat_alun_alun.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['nama', 'alamat', 'link', 'jarak_km'])
    writer.writeheader()
    writer.writerows(final_result)

## Google Maps Data Scraping C (Challenge)
https://www.google.com/maps

Task: Retrieve all reviews from the Malang Strudel branch that is closest to Malang Town Square.

Output: Username, photos included in the review, the user’s posting time, star rating, and review text.

In [None]:
import time
import re
import os
import requests
import math
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC

# Find the Malang Strudel search result closest to (CENTER_LAT, CENTER_LNG),
# open its review tab, and save up to 10 reviews (name, stars, text, date,
# downloaded photo paths) to ulasan_strudel_malang_town_square.csv.
from selenium.common.exceptions import WebDriverException

# Reference point for the "closest branch" selection, in decimal degrees.
CENTER_LAT = -7.957211
CENTER_LNG = 112.618347

# Earth radius in km used by the haversine distance below.
R = 6371

IMAGE_FOLDER = 'foto_ulasan_strudel'
os.makedirs(IMAGE_FOLDER, exist_ok=True)


def _first_text(parent, xpath):
    """Return the stripped text of the first XPath match under parent, or ''."""
    found = parent.find_elements(By.XPATH, xpath)
    return found[0].text.strip() if found else ''


options = Options()
options.add_argument('--start-maximized')
options.add_argument("disable-dev-shm-usage")

driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)

results = []
seen_links = set()
scroll_attempts_without_new = 0
max_attempts_without_new = 4

# Defined up front so the CSV step always has a list to write, even when no
# branch or no review is found.
hasil_ulasan = []

# try/finally guarantees the browser is closed even when a wait times out or
# an element lookup fails part-way through the scrape.
try:
    # Script injected into every new document to override a few navigator
    # properties before page scripts run.
    driver.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
        "source": """
        Object.defineProperty(navigator, 'webdriver', {get: () => false});
        window.navigator.chrome = { runtime: {} };
        Object.defineProperty(navigator, 'languages', {get: () => ['id-ID', 'id']});
        Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
      """
    })

    driver.get("https://www.google.com/maps/search/malang+strudel+dekat+malang+town+square/")
    time.sleep(6)
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="article"]'))
    )

    # Step 1: collect the place URL of every search result, scrolling the
    # feed until four consecutive passes yield nothing new.
    while scroll_attempts_without_new < max_attempts_without_new:
        items = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"]')
        new_found = 0

        for item in items:
            # Skip cards without a place link instead of raising.
            link_els = item.find_elements(By.CSS_SELECTOR, 'a.hfpxzc')
            if not link_els:
                continue
            link = link_els[0].get_attribute('href')

            if not link or link in seen_links:
                continue
            seen_links.add(link)

            results.append({'link': link})
            new_found += 1

        if new_found == 0:
            scroll_attempts_without_new += 1
        else:
            scroll_attempts_without_new = 0

        container = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="feed"]'))
        )
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', container)
        time.sleep(2)

    # Step 2: compute the haversine distance from the centre point for
    # every result whose URL carries a "!3d<lat>!4d<lng>" segment.
    candidates = []
    for r in results:
        match = re.search(r"!3d([-+]?\d*\.\d+)!4d([-+]?\d*\.\d+)", r['link'])
        if not match:
            continue
        lat = float(match.group(1))
        lng = float(match.group(2))

        lat1, lon1, lat2, lon2 = map(math.radians, [CENTER_LAT, CENTER_LNG, lat, lng])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
        c = 2 * math.asin(math.sqrt(a))
        distance = R * c

        candidates.append({**r, 'jarak_km': round(distance, 2)})

    # Step 3: scrape reviews exactly once, for the closest branch only, and
    # only after all distances are known. With no candidate at all, fall
    # through and write a header-only CSV instead of crashing in min().
    if candidates:
        closest = min(candidates, key=lambda x: x['jarak_km'])
        driver.get(closest['link'])
        time.sleep(2.1)

        review_tab = wait.until(
            EC.element_to_be_clickable((By.XPATH, '//button[.//div[contains(text(), "Ulasan")]]')))
        driver.execute_script("arguments[0].click();", review_tab)
        time.sleep(2)

        panel = driver.find_element(By.XPATH, '//div[contains(@class, "m6QErb") and contains(@class, "DxyBCb") and contains(@class, "kA9KIf") and contains(@class, "dS8AEf")]')

        # Scroll the review panel so more reviews get loaded.
        for _ in range(10):
            driver.execute_script('arguments[0].scrollBy(0,1400)', panel)
            time.sleep(1)

        # Expand truncated review texts.
        # NOTE(review): the original selector searched for "lainya", which
        # looks like a misspelling of the Indonesian "Lainnya" ("More") and
        # would never match; confirm the button label against the live page.
        for _ in range(10):
            buttons = panel.find_elements(By.XPATH, './/button[contains(., "Lainnya") or contains(., "lainnya")]')
            for btn in buttons:
                try:
                    driver.execute_script("arguments[0].click();", btn)
                    time.sleep(1)
                except WebDriverException:
                    # Best effort: a button may already be gone or stale
                    # after an earlier click; move on to the next one.
                    continue
            time.sleep(2)

        reviews = driver.find_elements(By.XPATH, './/div[contains(@class, "jftiEf") and contains(@class, "fontBodyMedium")]')[:10]

        for i, review in enumerate(reviews, start=1):
            # Missing fields (e.g. a rating-only review with no text) become
            # empty strings instead of aborting the run.
            nama = _first_text(review, './/div[contains(@class, "d4r55")]')

            rating_els = review.find_elements(By.XPATH, './/span[contains(@class, "kvMYJc")]//span')
            if rating_els:
                stars = rating_els[0].find_elements(By.XPATH, './/span[contains(@aria-label, "bintang")]')
            else:
                stars = []
            bintang_count = len(stars)

            ulasan_text = _first_text(review, './/span[contains(@class, "wiI7pd")]')
            tanggal = _first_text(review, './/span[contains(@class, "rsqaWe")]')

            foto_paths = []
            photo_buttons = review.find_elements(By.CSS_SELECTOR, 'div.KtCyie button.Tya61d')

            for idx, btn in enumerate(photo_buttons):
                # The photo URL is read from the button's inline style; skip
                # buttons that carry no url(...) value.
                style = btn.get_attribute('style') or ''
                url_match = re.search(r'url\("?(.+?)"?\)', style)
                if not url_match:
                    continue

                img_url = url_match.group(1)
                img_name = f"review_{i}_foto_{idx+1}.jpg"
                img_path = os.path.join(IMAGE_FOLDER, img_name)

                try:
                    img_response = requests.get(img_url, timeout=10)
                    # Do not save an HTTP error page as if it were an image.
                    img_response.raise_for_status()
                except requests.RequestException:
                    # One failed download should not lose the whole review.
                    continue

                with open(img_path, 'wb') as f:
                    f.write(img_response.content)
                foto_paths.append(img_path)

            hasil_ulasan.append({
                'nomor': i,
                'nama': nama,
                'bintang': bintang_count,
                'ulasan': ulasan_text,
                'tanggal': tanggal,
                'foto_path': "; ".join(foto_paths) if foto_paths else "Tidak ada foto"
            })
finally:
    driver.quit()

csv_filename = 'ulasan_strudel_malang_town_square.csv'
with open(csv_filename, 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['nomor', 'nama', 'bintang', 'ulasan', 'tanggal', 'foto_path'])
    writer.writeheader()
    writer.writerows(hasil_ulasan)


Cabang Malang Strudel terdekat ditemukan!
Jarak dari center point: 1.9 km
Link: https://www.google.com/maps/place/Malang+Strudel/data=!4m7!3m6!1s0x2dd6282680e35b09:0x8287e446862baa43!8m2!3d-7.9734574!4d112.623579!16s%2Fg%2F11c5zsdwcl!19sChIJCVvjgCYo1i0RQ6orhkbkh4I?authuser=0&hl=id&rclk=1

Memuat ulasan...
Membuka ulasan lengkap...

Memproses 10 review...

Review 1 (Yusuf Noufal Rahman): Ditemukan 2 foto
  ✓ Foto 1 disimpan: foto_ulasan_strudel\review_1_foto_1.jpg
  ✓ Foto 2 disimpan: foto_ulasan_strudel\review_1_foto_2.jpg
Review 2 (Ilman Alqarni): Ditemukan 2 foto
  ✓ Foto 1 disimpan: foto_ulasan_strudel\review_2_foto_1.jpg
  ✓ Foto 2 disimpan: foto_ulasan_strudel\review_2_foto_2.jpg
Review 3 (Ratih SWP): Ditemukan 4 foto
  ✓ Foto 1 disimpan: foto_ulasan_strudel\review_3_foto_1.jpg
  ✓ Foto 2 disimpan: foto_ulasan_strudel\review_3_foto_2.jpg
  ✓ Foto 3 disimpan: foto_ulasan_strudel\review_3_foto_3.jpg
  ✓ Foto 4 disimpan: foto_ulasan_strudel\review_3_foto_4.jpg
Review 4 (Chia Lara): 

## 