# WEEK 2: Data Scraping Automation

## What is Data Scraping Automation
Data scraping automation is the process of using software to extract information from websites by replicating human-like actions such as clicking, scrolling, typing, and navigating pages.

## Core Behaviour
- Click
- Scroll
- Input
- Hover
- Delay
- Other

## Google Maps Data Scraping A
https://www.google.com/maps

Task: Retrieve data for places that serve ramen in Malang City.

Output: Title, address, and URLs

In [None]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys

# Scrape Google Maps search results for ramen places near Malang and save
# name, address and place URL to ramen_malang.csv.

# Start Chrome maximized.
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=options)

url = "https://www.google.com/maps/search/ramen+dekat+malang/"

# Substrings that mark a span's text as a probable Indonesian address line.
address_keywords = ['jl.', 'jln.', 'jalan', 'no.', 'rt', 'rw', 'kota', 'kabupaten', 'provinsi']

result = []
no_new_counter = 0   # consecutive scroll passes that produced no new place
seen_links = set()   # place URLs already collected, used for de-duplication

try:
    driver.get(url)

    # Block until at least one result card has been rendered.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="article"]'))
    )
    time.sleep(3.2)

    # Stop once 20 places are collected or 4 passes in a row add nothing new.
    # A whole batch of cards is processed before the count is re-checked, so
    # slightly more than 20 rows may be collected.
    while len(result) < 20 and no_new_counter < 4:
        items = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"]')
        new_found = 0

        for item in items:
            # find_elements returns an empty list instead of raising, so a
            # card without a title or link node is skipped rather than
            # aborting the whole run.
            title_els = item.find_elements(By.CSS_SELECTOR, 'div.qBF1Pd')
            link_els = item.find_elements(By.CSS_SELECTOR, 'a.hfpxzc')
            if not title_els or not link_els:
                continue

            link = link_els[0].get_attribute('href')
            # get_attribute may return None; such a card cannot be identified.
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            nama = title_els[0].text

            # Take the first span whose text looks like an address.
            alamat = "Tidak Ditemukan"
            for el in item.find_elements(By.CSS_SELECTOR, 'span:last-child'):
                txt = el.text.strip()
                if txt and any(kata in txt.lower() for kata in address_keywords):
                    alamat = txt
                    break

            result.append({'nama': nama, 'alamat': alamat, 'link': link})
            new_found += 1

        if len(result) >= 20:
            break

        # Scroll the results feed to its bottom to trigger loading more cards.
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="feed"]'))
        )
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', container)
        time.sleep(1.4)

        # Also send END to the page in addition to the scripted scroll.
        ActionChains(driver).send_keys(Keys.END).perform()
        time.sleep(0.6)

        if new_found == 0:
            no_new_counter += 1
        else:
            no_new_counter = 0
finally:
    # Always release the browser, even when a wait times out or a lookup fails.
    driver.quit()

with open('ramen_malang.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['nama', 'alamat', 'link'])
    writer.writeheader()
    writer.writerows(result)

## Google Maps Data Scraping B
https://www.google.com/maps

Task: Retrieve data for places that serve nasi ayam within 1 km of Alun-Alun Kota Malang.

Output: Title, address, coordinates, and URLs

In [None]:
import time
import csv
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
import math

# Search centre in decimal degrees (latitude, longitude). It is interpolated
# into the Maps search URL and used as the origin of the distance filter.
# NOTE(review): presumably Alun-Alun Kota Malang, per the task statement - confirm.
center_lat, center_lng = -7.9826145, 112.6308113

def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.

    Returns:
        The haversine distance in kilometres, using an Earth radius of 6371 km.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(math.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # lat1/lat2 are already in radians here; converting them a second time
    # would push both cosines towards 1 and overstate east-west distances.
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    # Clamp so floating-point overshoot can never take asin out of its domain.
    c = 2 * math.asin(min(1.0, math.sqrt(a)))
    return R * c

def extract_coordinates(url):
    """Extract latitude and longitude from the ``!3d<lat>!4d<lng>`` part of a URL.

    Args:
        url: URL string to inspect. ``None`` or an empty string is accepted,
            since an element's ``href`` attribute may be missing.

    Returns:
        A ``(lat, lng)`` tuple of floats, or ``(None, None)`` when ``url`` is
        empty or does not contain the ``!3d...!4d...`` pattern.
    """
    if not url:
        return None, None

    # Accepts "-7.98", "112", "7." and ".5": integer-valued coordinates are
    # valid too, not only numbers that contain a decimal point.
    number = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)"
    match = re.search(rf"!3d({number})!4d({number})", url)
    if match:
        lat = float(match.group(1))
        lng = float(match.group(2))
        return lat, lng
    return None, None

# Scrape Google Maps search results for nasi ayam places around the search
# centre, keep those within 1 km of it, and save them to
# nasi_ayam_malang_dekat_alun_alun.csv.

# Start Chrome maximized.
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')
driver = webdriver.Chrome(options=options)

url = f"https://www.google.com/maps/search/nasi+ayam+di+dekat+alun+alun+malang/@{center_lat},{center_lng},17z/data=!3m1!4b1?entry=ttu&g_ep=EgoyMDI1MTIwMS4wIKXMDSoASAFQAw%3D%3D"

# Substrings that mark a span's text as a probable Indonesian address line.
address_keywords = ['jl.', 'jln.', 'jalan', 'no.', 'rt', 'rw', 'kota', 'kabupaten', 'provinsi']

result = []          # every unique place scraped, before distance filtering
final_result = []    # places within 1 km of the search centre
no_new_counter = 0   # consecutive scroll passes that produced no new place
seen_links = set()   # place URLs already collected, used for de-duplication

try:
    driver.get(url)

    # Block until at least one result card has been rendered.
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="article"]'))
    )
    time.sleep(3.2)

    # Stop once 20 places are collected or 4 passes in a row add nothing new.
    # The limit counts unfiltered places, so fewer than 20 may survive the
    # distance filter below.
    while len(result) < 20 and no_new_counter < 4:
        items = driver.find_elements(By.CSS_SELECTOR, 'div[role="article"]')
        new_found = 0

        for item in items:
            # find_elements returns an empty list instead of raising, so a
            # card without a title or link node is skipped rather than
            # aborting the whole run.
            title_els = item.find_elements(By.CSS_SELECTOR, 'div.qBF1Pd')
            link_els = item.find_elements(By.CSS_SELECTOR, 'a.hfpxzc')
            if not title_els or not link_els:
                continue

            link = link_els[0].get_attribute('href')
            # get_attribute may return None; such a card cannot be identified.
            if not link or link in seen_links:
                continue
            seen_links.add(link)

            nama = title_els[0].text

            # Take the first span whose text looks like an address.
            alamat = "Tidak Ditemukan"
            for el in item.find_elements(By.CSS_SELECTOR, 'span:last-child'):
                txt = el.text.strip()
                if txt and any(kata in txt.lower() for kata in address_keywords):
                    alamat = txt
                    break

            result.append({'nama': nama, 'alamat': alamat, 'link': link})
            new_found += 1

        if len(result) >= 20:
            break

        # Scroll the results feed to its bottom to trigger loading more cards.
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="feed"]'))
        )
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', container)
        time.sleep(1.4)

        # Also send END to the page in addition to the scripted scroll.
        ActionChains(driver).send_keys(Keys.END).perform()
        time.sleep(0.6)

        if new_found == 0:
            no_new_counter += 1
        else:
            no_new_counter = 0
finally:
    # Always release the browser, even when a wait times out or a lookup fails.
    driver.quit()

# Filter exactly once, after scraping has finished. Running this inside the
# scroll loop appended the same places again on every pass, and skipped the
# filter entirely whenever the loop exited through the `break` above.
for tempat in result:
    lat, lng = extract_coordinates(tempat['link'])
    if lat is None or lng is None:
        continue
    jarak = haversine(center_lat, center_lng, lat, lng)
    if jarak <= 1.0:
        final_result.append({
            'nama': tempat['nama'],
            'alamat': tempat['alamat'],
            'link': tempat['link'],
            'jarak_km': round(jarak, 2),
            # The task output asks for coordinates; they are appended after
            # the existing columns so the original column order is unchanged.
            'lat': lat,
            'lng': lng,
        })

with open('nasi_ayam_malang_dekat_alun_alun.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['nama', 'alamat', 'link', 'jarak_km', 'lat', 'lng'])
    writer.writeheader()
    writer.writerows(final_result)

## Google Maps Data Scraping C (Challenge)
https://www.google.com/maps

Task: Retrieve all reviews from the Malang Strudel branch that is closest to Malang Town Square.

Output: Username, photos included in the review, the user's posting time, star rating, and review text.

## 