In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3

# Silence urllib3's InsecureRequestWarning for the whole process; scrape_webpage
# below issues its request with verify=False.
# NOTE(review): this hides the warning for every request made in this session,
# not only the ones sent by scrape_webpage.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def scrape_webpage(url, content_selector):
    """Fetch *url* and extract the text of every element matching a CSS selector.

    For each matched element the function looks for an enclosing
    ``elementor-toggle-item`` and, if one exists, uses the text of its
    ``a.elementor-toggle-title`` anchor as the record title. Otherwise the
    title falls back to ``"Toggle <n>"``, where ``<n>`` is the 1-based
    position of the element among the matches.

    Progress, warnings and errors are reported with ``print``.

    Args:
        url: Address of the page to download.
        content_selector: CSS selector passed to ``BeautifulSoup.select``.

    Returns:
        A list of dicts with the keys ``source_url``, ``toggle_title``,
        ``raw_text`` and ``source_type``; elements whose text is empty are
        skipped. An empty list is returned when the request or parsing
        fails for any reason.
    """
    print(f"Scraping {url} dengan selector '{content_selector}'...")

    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        # NOTE(review): verify=False turns off TLS certificate verification
        # for this request - confirm this is still required for the target host.
        page = requests.get(url, headers=request_headers, timeout=15, verify=False)
        page.raise_for_status()

        matches = BeautifulSoup(page.content, 'html.parser').select(content_selector)
        if not matches:
            print(f"Warning: No relevant element '{content_selector}' di {url}")

        records = []
        for position, node in enumerate(matches, start=1):
            heading = f"Toggle {position}"  # default title
            try:
                # Walk item -> title container -> anchor, stopping at the
                # first level that is missing.
                toggle_item = node.find_parent(class_='elementor-toggle-item')
                title_box = toggle_item.find(class_='elementor-tab-title') if toggle_item else None
                anchor = title_box.find('a', class_='elementor-toggle-title') if title_box else None
                if anchor:
                    heading = anchor.get_text(strip=True)
            except Exception as title_error:
                print(f"Tidak bisa mendapatkan judul untuk bagian toggle: {title_error}")

            body_text = node.get_text(separator=' ', strip=True)
            if not body_text:
                continue

            records.append({
                'source_url': url,
                'toggle_title': heading,
                'raw_text': body_text,
                'source_type': 'website-toggle-content'
            })

        print(f"Berhasil mendapatkan {len(records)} item teks (dari toggle) dari {url}.")
        return records

    # SSLError is handled before the more general RequestException so that
    # it gets its own message.
    except requests.exceptions.SSLError as ssl_error:
        print(f"SSL Error saat scraping {url}: {ssl_error}")
        return []
    except requests.exceptions.RequestException as request_error:
        print(f"Error scraping {url}: {request_error}")
        return []
    except Exception as unexpected_error:
        print(f"Error tidak terduga saat scraping {url}: {unexpected_error}")
        return []

# Pages to scrape, as (url, CSS selector) pairs. The same URL may appear more
# than once with a different selector; each pair becomes one dict with the
# keys 'url' and 'selector'.
target_pages = [
    {'url': page_url, 'selector': css_selector}
    for page_url, css_selector in (
        (
            'https://penmaba.unj.ac.id/jalur-masuk-sarjana-diploma/',
            '.elementor-widget-text-editor .elementor-widget-container',
        ),
        (
            'https://penmaba.unj.ac.id/snbp/',
            '.elementor-toggle .elementor-tab-content',
        ),
        (
            'https://penmaba.unj.ac.id/snbp/',
            '.elementor-element.elementor-element-1f43bb67 > .elementor-widget-container',
        ),
        (
            'https://penmaba.unj.ac.id/sbmptn/',
            '.elementor-element.elementor-element-6995efd3 > .elementor-widget-container',
        ),
        (
            'https://penmaba.unj.ac.id/sbmptn/',
            '.elementor-toggle .elementor-tab-content',
        ),
    )
]


# Run every configured page/selector pair through the scraper and pool the
# resulting records into one list.
all_scraped_data = []
for page_info in target_pages:
    scraped_data = scrape_webpage(page_info['url'], page_info['selector'])
    # A falsy result contributes nothing to the pool.
    all_scraped_data.extend(scraped_data or [])

if not all_scraped_data:
    print("\nTidak ada data yang berhasil di-scrape dari website.")
else:
    df = pd.DataFrame(all_scraped_data)
    # Rows are considered duplicates when their raw_text is identical,
    # regardless of which URL or selector produced them.
    df_deduplicated = df.drop_duplicates(subset=['raw_text'])
    df_deduplicated.to_csv(
        'unj_website_data_deduplicated.csv',
        index=False,
        encoding='utf-8',
    )
    print(f"\nData setelah deduplikasi disimpan ke unj_website_data_deduplicated.csv. Jumlah total item: {len(df_deduplicated)}")

Scraping https://penmaba.unj.ac.id/jalur-masuk-sarjana-diploma/ dengan selector '.elementor-widget-text-editor .elementor-widget-container'...
Berhasil mendapatkan 3 item teks (dari toggle) dari https://penmaba.unj.ac.id/jalur-masuk-sarjana-diploma/.
Scraping https://penmaba.unj.ac.id/snbp/ dengan selector '.elementor-toggle .elementor-tab-content'...
Berhasil mendapatkan 10 item teks (dari toggle) dari https://penmaba.unj.ac.id/snbp/.
Scraping https://penmaba.unj.ac.id/snbp/ dengan selector '.elementor-element.elementor-element-1f43bb67 > .elementor-widget-container'...
Berhasil mendapatkan 1 item teks (dari toggle) dari https://penmaba.unj.ac.id/snbp/.
Scraping https://penmaba.unj.ac.id/sbmptn/ dengan selector '.elementor-element.elementor-element-6995efd3 > .elementor-widget-container'...
Berhasil mendapatkan 1 item teks (dari toggle) dari https://penmaba.unj.ac.id/sbmptn/.
Scraping https://penmaba.unj.ac.id/sbmptn/ dengan selector '.elementor-toggle .elementor-tab-content'...
Berha