In [None]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.by import By

In [None]:
def _card_text(card, selector, default=""):
    """Return the stripped text of the first element in *card* matching *selector*.

    Returns *default* when no such element exists or the card element has
    gone stale (the page re-rendered between lookup and read).
    """
    try:
        return card.find_element(By.CSS_SELECTOR, selector).text.strip()
    except (NoSuchElementException, StaleElementReferenceException):
        return default


def _card_link(card):
    """Return the href of the card's title link, or "" when it cannot be read."""
    try:
        anchor = card.find_element(By.CSS_SELECTOR, "a.cds-CommonCard-titleLink")
        return anchor.get_attribute("href")
    except (NoSuchElementException, StaleElementReferenceException):
        return ""


def _course_type(metadata):
    """Return the product type named in *metadata*, or None if none matches.

    The order matters: "Professional Certificate" and "Specialization" are
    tested before the more generic "Course".
    """
    for label in ("Professional Certificate", "Specialization", "Course"):
        if label in metadata:
            return label
    return None


def _module_description(item, tipe):
    """Extract the description text of one accordion *item* for product type *tipe*."""
    container = item.select_one(".css-15ekt44")
    if not container:
        return ""

    if tipe in ("Specialization", "Professional Certificate"):
        # Prefer the bullet list; fall back to the non-empty paragraphs.
        bullets = container.select("li")
        if bullets:
            return "; ".join(li.get_text(strip=True) for li in bullets)
        paragraphs = (p.get_text(strip=True) for p in container.select("p"))
        return " ".join(text for text in paragraphs if text)

    if tipe == "Course":
        first_element = container.find(recursive=False)
        if first_element:
            raw_text = first_element.get_text(strip=True)
            # Keep only the text preceding the "What's included" marker.
            return raw_text.split("What's included")[0].strip()

    return ""


def _scrape_detail(link, tipe, headers):
    """Fetch the detail page at *link* and return its parsed fields as a dict.

    The returned dict always contains the keys "Category", "Subcategory",
    "Description", "Skills", "Modules Name" and "Modules Description".
    Scraping is best effort: on any failure a warning is printed and the
    fields collected so far (empty defaults otherwise) are returned, so one
    broken page never aborts the run or leaks values from a previous course.
    """
    detail = {
        "Category": "",
        "Subcategory": "",
        "Description": "",
        "Skills": "",
        "Modules Name": [],
        "Modules Description": [],
    }

    try:
        response = requests.get(link, headers=headers, timeout=15)
        # Report HTTP errors instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Category and subcategory are the 3rd and 4th breadcrumb links.
        crumbs = [
            a.get_text(strip=True)
            for a in soup.select('nav[aria-label="Breadcrumbs"] ol li a')
        ]
        detail["Category"] = crumbs[2] if len(crumbs) >= 3 else ""
        detail["Subcategory"] = crumbs[3] if len(crumbs) >= 4 else ""

        # Description
        desc_tag = soup.find("p", class_="css-4s48ix")
        detail["Description"] = desc_tag.get_text(strip=True) if desc_tag else ""

        # Skills
        skill_texts = (
            s.get_text(strip=True) for s in soup.select("ul[class*='css-yk0mzy'] a")
        )
        detail["Skills"] = ", ".join(text for text in skill_texts if text)

        # Modules: when the first summary link starts with a number, use it
        # as an upper bound on how many accordion items are read.
        number = None
        first_block = soup.find("div", class_="css-dwgey1")
        if first_block:
            link_tag = first_block.find("a")
            if link_tag:
                words = link_tag.text.strip().split()
                if words and words[0].isdigit():
                    number = int(words[0])

        items = soup.select('[data-testid="accordion-item"]')
        if number is not None:
            items = items[:number]

        for item in items:
            heading = item.select_one("h3")
            name = heading.get_text(strip=True) if heading else ""
            mod_description = _module_description(item, tipe)
            # Append both together so the two lists stay aligned.
            detail["Modules Name"].append(name)
            detail["Modules Description"].append(mod_description)

    except Exception as e:
        print(f"⚠️ Gagal scraping detail dari {link}: {e}")

    return detail


data = []
visited = set()
scroll_attempts = 0
# Number of consecutive scroll passes without any new course before stopping.
max_scrolls = 10
headers = {"User-Agent": "Mozilla/5.0"}

driver = webdriver.Chrome()
try:
    driver.get("https://www.coursera.org/search?query=technology")
    time.sleep(2)

    while scroll_attempts < max_scrolls:
        cards = driver.find_elements(By.CSS_SELECTOR, "li.cds-9.cds-grid-item")
        found_new = False

        for card in cards:
            # A card without a title element is skipped entirely.
            title = _card_text(card, "h3.cds-CommonCard-title", default=None)
            if title is None:
                continue

            institution = _card_text(card, "p.cds-ProductCard-partnerNames")

            # Every pass re-reads all loaded cards; skip the ones already seen.
            key = (title, institution)
            if key in visited:
                continue
            visited.add(key)

            metadata = _card_text(card, "div.cds-CommonCard-metadata > p")
            rating = _card_text(card, "span.css-6ecy9b")
            link = _card_link(card)

            # Only the three recognised product types are kept.
            tipe = _course_type(metadata)
            if tipe is None:
                continue

            # Add the card fields plus the detail-page fields to the results.
            data.append({
                "Title": title,
                "Institution": institution,
                "Metadata": metadata,
                "Rating": rating,
                "Link": link,
                **_scrape_detail(link, tipe, headers),
            })
            found_new = True

        # Any new course resets the counter of consecutive empty passes.
        if found_new:
            scroll_attempts = 0
        else:
            scroll_attempts += 1

        # Jump to the bottom, then back up by 1500px before the next pass.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        driver.execute_script("window.scrollBy(0, -1500);")
        time.sleep(2)
finally:
    # Always close the browser, even when scraping fails midway.
    driver.quit()

df = pd.DataFrame(data)
# Make sure the output directory exists so the write cannot fail on it.
os.makedirs("data", exist_ok=True)
df.to_csv("data/courses_data_raw.csv", index=False, encoding="utf-8-sig")