<a href="https://colab.research.google.com/github/naufalnashif/scraping-syntax-repo/blob/main/scraping_syntax.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Scraping Syntax


### About

Extract data from Kampus Merdeka, Tokopedia, News Headlines, E-Commerce, and Playstore Reviews. Analyze trends with Python scripts. Demo: [Hugging Face](https://huggingface.co/naufalnashif).

#### Features
- **Kampus Merdeka Scraper:** Extracts MSIB internship opportunity listings (position, partner, location, registration dates).
- **Tokopedia Scraper:** Gathers product details and reviews.
- **News Headline Scraper:** Retrieves latest news headlines.
- **E-Commerce Scraper:** Extracts product details and prices.
- **Playstore Reviews Scraper:** Captures user feedback.

#### Usage
Use Python scripts for each platform. Adapt and integrate for your projects.

#### Application Demo
Check the live demo on [Hugging Face](https://huggingface.co/naufalnashif) for hands-on experience.

**Note:** Respect website policies and legal considerations.


## Scraping Kampus Merdeka

### Install dependencies

In [None]:
!pip install requests beautifulsoup4 pandas

### Import Dependencies

In [None]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd

In [None]:
# Maximum number of opportunities requested from the API in a single call.
jumlah = 1000

# Start from an empty frame so later cells still find a `df` when the
# request below fails.
df = pd.DataFrame()

try:
    url = f'https://api.kampusmerdeka.kemdikbud.go.id/magang/browse/opportunities?opportunity_type=MSIB&activity_type=&offset=25&limit={jumlah}'
    # Alternative endpoints:
    # url = f'https://api.kampusmerdeka.kemdikbud.go.id/mbkm/ref-kegiatans/msib-configs-no-auth?'
    # url = f'https://api.kampusmerdeka.kemdikbud.go.id/mbkm/ref-kegiatans/msib-configs-no-auth/upcoming'
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15"
    ]
    # Pick a random User-Agent for this request
    random_user_agent = random.choice(user_agents)
    # Send the chosen User-Agent with the HTTP request
    headers = {
        "User-Agent": random_user_agent,
        "Accept-Language": "en-US,en;q=0.5"
    }
    timeout = 10
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error body as data.
    response.raise_for_status()
    df = pd.DataFrame(response.json()['data'])

except requests.exceptions.RequestException as e:
    print(f"An error occurred: {e}")
except (KeyError, ValueError) as e:
    # The body was not valid JSON or had no 'data' key.
    print(f"Unexpected response format: {e}")

In [None]:
# Summarise the scraped listings: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     1000 non-null   object
 1   opportunity_type       1000 non-null   object
 2   name                   1000 non-null   object
 3   activity_type          1000 non-null   object
 4   location               1000 non-null   object
 5   months_duration        1000 non-null   int64 
 6   start_registration     1000 non-null   object
 7   end_registration       1000 non-null   object
 8   mitra_id               1000 non-null   object
 9   mitra_name             1000 non-null   object
 10  mitra_brand_name       1000 non-null   object
 11  logo                   1000 non-null   object
 12  published_time         1000 non-null   object
 13  credits_count          1000 non-null   int64 
 14  location_kotakab_code  1000 non-null   object
 15  participants_count    

In [None]:
# Bare expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,id,opportunity_type,name,activity_type,location,months_duration,start_registration,end_registration,mitra_id,mitra_name,mitra_brand_name,logo,published_time,credits_count,location_kotakab_code,participants_count,certified,start_duration,end_duration
0,d32bec2d-03f0-11ef-883f-0ead411fa680,MSIB,Research & Development,WFO,Kota Jakarta Timur,4,2024-06-01T00:00:00Z,2024-07-26T00:00:00Z,4727fda9-8233-4f13-879f-101bcc9947c6,PT Bintang Toedjoe,Bintang Toedjoe,https://storage.googleapis.com/kampusmerdeka_k...,2024-05-27T10:00:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z
1,2b236276-03f0-11ef-883f-0ead411fa680,MSIB,Engineering,WFO,Kota Jakarta Timur,4,2024-06-01T00:00:00Z,2024-07-26T00:00:00Z,4727fda9-8233-4f13-879f-101bcc9947c6,PT Bintang Toedjoe,Bintang Toedjoe,https://storage.googleapis.com/kampusmerdeka_k...,2024-05-27T10:00:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z
2,0cadce22-03ee-11ef-883f-0ead411fa680,MSIB,Quality Control,WFO,Kota Jakarta Timur,4,2024-06-01T00:00:00Z,2024-07-26T00:00:00Z,4727fda9-8233-4f13-879f-101bcc9947c6,PT Bintang Toedjoe,Bintang Toedjoe,https://storage.googleapis.com/kampusmerdeka_k...,2024-05-27T10:00:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z
3,fa19bda9-03ce-11ef-883f-0ead411fa680,MSIB,Human Resources Development,WFO,Kota Jakarta Timur,4,2024-06-01T00:00:00Z,2024-07-26T00:00:00Z,4727fda9-8233-4f13-879f-101bcc9947c6,PT Bintang Toedjoe,Bintang Toedjoe,https://storage.googleapis.com/kampusmerdeka_k...,2024-05-27T10:00:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z
4,f284cdea-03ee-11ef-883f-0ead411fa680,MSIB,Quality Assurance,WFO,Kota Jakarta Timur,4,2024-06-01T00:00:00Z,2024-07-26T00:00:00Z,4727fda9-8233-4f13-879f-101bcc9947c6,PT Bintang Toedjoe,Bintang Toedjoe,https://storage.googleapis.com/kampusmerdeka_k...,2024-05-27T10:00:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,edb7c56d-049b-11ef-bb03-ceeddaa1b367,MSIB,Andev,WFO,Kab. Bekasi,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,823ec9fb-a929-460a-8af6-8a90e0dd777a,Yayasan Insan Indonesia Kompeten,Gerakan Nasional Indonesia Kompeten (GNIK),https://storage.googleapis.com/kampusmerdeka_k...,2024-05-20T11:14:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z
996,b4cd5b51-0491-11ef-bb03-ceeddaa1b367,MSIB,Human Resources (HR),WFO,Kab. Bandung Barat,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,823ec9fb-a929-460a-8af6-8a90e0dd777a,Yayasan Insan Indonesia Kompeten,Gerakan Nasional Indonesia Kompeten (GNIK),https://storage.googleapis.com/kampusmerdeka_k...,2024-05-20T11:14:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z
997,75c36709-06f0-11ef-8ce9-8a7c35fd8d18,MSIB,Staff Quality Engineering,WFO,Kab. Bogor,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,823ec9fb-a929-460a-8af6-8a90e0dd777a,Yayasan Insan Indonesia Kompeten,Gerakan Nasional Indonesia Kompeten (GNIK),https://storage.googleapis.com/kampusmerdeka_k...,2024-05-20T11:14:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z
998,f9734c25-04a4-11ef-9b3e-022ae5f04a49,MSIB,Bidang Dokumentasi Pembiayaan,WFO,Kota Jakarta Timur,4,2024-04-16T00:00:00Z,2024-06-10T00:00:00Z,823ec9fb-a929-460a-8af6-8a90e0dd777a,Yayasan Insan Indonesia Kompeten,Gerakan Nasional Indonesia Kompeten (GNIK),https://storage.googleapis.com/kampusmerdeka_k...,2024-05-20T11:14:00Z,0,,0,False,0001-01-01T00:00:00Z,0001-01-01T00:00:00Z


## Scraping Tokopedia

In [None]:
from urllib.parse import quote
import random

def _tokped_text(card, tag, css_class, strip=True):
    """Return the text of the first matching child of *card*, or None if absent."""
    element = card.find(tag, class_=css_class)
    if element is None:
        return None
    return element.text.strip() if strip else element.text


def scrape_tokped(nama_barang, num_items):
    """Scrape Tokopedia search results for a product keyword.

    Pages are requested one after another until ``num_items`` products have
    been collected, a page contains no product cards, or a request fails.

    Parameters:
    - nama_barang (str): Search keyword; it is URL-encoded before use.
    - num_items (int): Maximum number of products to return.

    Returns:
    - list[dict]: At most ``num_items`` dicts with the keys 'link', 'produk',
      'harga', 'terjual', 'rating', 'toko' and 'asal_product'. A field is
      None when its element is missing from the product card.
    """
    products = []
    page = 1
    query = quote(nama_barang)
    # Loop-invariant settings are built once instead of on every page.
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
    ]
    timeout = 10

    while len(products) < num_items:
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='

        # A random User-Agent is picked for every page request.
        headers = {
            "User-Agent": random.choice(user_agents),
            "Accept-Language": "en-US,en;q=0.5"
        }
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")
            # Stop instead of retrying forever on a persistent failure;
            # whatever was collected so far is still returned.
            break

        soup = BeautifulSoup(response.text, 'html.parser')
        product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href=True)

        # No product cards on this page: there is nothing more to collect,
        # so leave the loop rather than paging indefinitely.
        if not product_container_list:
            break

        for product_info in product_container_list:
            products.append({
                'link': product_info['href'],
                'produk': _tokped_text(product_info, 'div', "prd_link-product-name css-3um8ox"),
                'harga': _tokped_text(product_info, 'div', "prd_link-product-price css-h66vau"),
                # 'terjual' and 'rating' are kept unstripped, as before.
                'terjual': _tokped_text(product_info, 'span', "prd_label-integrity css-1sgek4h", strip=False),
                'rating': _tokped_text(product_info, 'span', 'prd_rating-average-text css-t70v7i', strip=False),
                'toko': _tokped_text(product_info, 'span', "prd_link-shop-name css-1kdc32b flip"),
                'asal_product': _tokped_text(product_info, 'span', "prd_link-shop-loc css-1kdc32b flip"),
            })

        page += 1

    # The last page may overshoot the requested amount.
    return products[:num_items]

In [None]:
# Example run: the search keyword and the maximum number of products to collect.
nama_barang = 'Iphone 12 Pro'
num_items = 100
hasil = scrape_tokped(nama_barang, num_items)

# Tabulate the scraped product dicts; the bare `df` renders the frame as the cell output.
df = pd.DataFrame(hasil)
df

Unnamed: 0,link,produk,harga,terjual,rating,toko,asal_product
0,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,Iphone 12 Pro 128gb / 256gb /512gb Bekas Fullset,Rp6.650.000,1rb+ terjual,4.9,Boboy Store,Jakarta Barat
1,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,iPhone 12 Pro/PM Second Original 256GB/128GB S...,Rp7.560.000,2 terjual,5.0,OPVO Phone Store,Tangerang
2,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,IPHONE 12 PRO MAX PACIFIC BLUE 256GB MULUS LIK...,Rp8.650.000,60+ terjual,4.9,NERDBOSS GADGET,Jakarta Utara
3,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,IPHONE 12 PRO 256 GB PACIFIC BLUE SINYAL ALL O...,Rp7.800.000,2 terjual,5.0,Healthy Co,Jakarta Barat
4,https://ta.tokopedia.com/promo/v1/clicks/8a-xg...,iPhone 12 Pro MAX 128GB 256GB 512GB Second Ex ...,Rp6.520.000,250+ terjual,4.8,kino phone cell,Jakarta Pusat
...,...,...,...,...,...,...,...
95,https://www.tokopedia.com/khaylagadget/iphone-...,"Iphone 12 pro max 256gb allop,",Rp9.500.000,,,Mikhayla gadget store,Surakarta
96,https://www.tokopedia.com/endelepet-store/ipho...,Iphone 12 Pro 256Gb Second Req Khusus,Rp10.000.000,,,endelepet-store,Jakarta Pusat
97,https://www.tokopedia.com/elharveystore/iphone...,iPhone 12 PRO 256GB inter - part full original,Rp9.200.000,1 terjual,5.0,ElHarveyStore,Jakarta Barat
98,https://www.tokopedia.com/sheacel/iphone-12-pr...,Iphone 12 pro graphite 128gb ex ibox,Rp9.700.000,,,Shea flashing cellular,Jakarta Pusat


In [None]:
from collections import Counter
# Tally how often each 'terjual' value occurs among the scraped products
# (products without that label are counted under None).
print(Counter(df['terjual']))

Counter({None: 18, '1 terjual': 10, '250+ terjual': 8, '30+ terjual': 5, '60+ terjual': 4, '18 terjual': 4, '100+ terjual': 4, '4 terjual': 4, '3 terjual': 4, '750+ terjual': 3, '500+ terjual': 3, '50+ terjual': 3, '20 terjual': 2, '7 terjual': 2, '22 terjual': 2, '15 terjual': 2, '5 terjual': 2, '80+ terjual': 2, '2 terjual': 2, '1rb+ terjual': 1, '25 terjual': 1, '16 terjual': 1, '23 terjual': 1, '9 terjual': 1, '11 terjual': 1, '70+ terjual': 1, '90+ terjual': 1, '17 terjual': 1, '10 terjual': 1, '12 terjual': 1, '14 terjual': 1, '40+ terjual': 1, '24 terjual': 1, '28 terjual': 1, '13 terjual': 1})


## Scraping Playstore Review (With Streamlit Framework)
Application demo: https://huggingface.co/spaces/naufalnashif/scraping-playstore-reviews/

### Install Dependencies

In [None]:
!pip install streamlit google-play-scraper

In [None]:
import streamlit as st
import pandas as pd
#from google_play_scraper import app, Sort, reviews, permission, reviews_all, search
from google_play_scraper import app, Sort, reviews, reviews_all, permissions, search
import re

#---------------------------------------------func----------------------------------

@st.cache_data
def get_url_by_app_name(nama_apl):
    """
    Look up the Play Store URL of one of the built-in applications.
    Parameters:
    - nama_apl (str): Application name ('Shopee', 'Tokopedia', 'Amazon' or 'Grab').
    Returns:
    - str or None: The application's Play Store URL, or None when the name is unknown.
    """
    # Built-in name -> Play Store URL table.
    known_apps = {
        'Shopee': 'https://play.google.com/store/apps/details?id=com.shopee.id',
        'Tokopedia': 'https://play.google.com/store/apps/details?id=com.tokopedia.tkpd',
        'Amazon': 'https://play.google.com/store/apps/details?id=com.amazon.mShop.android.shopping',
        'Grab': 'https://play.google.com/store/apps/details?id=com.grabtaxi.passenger',
    }
    if nama_apl in known_apps:
        return known_apps[nama_apl]
    return None

@st.cache_data
def extract_app_id(play_store_url):
    """
    Pull the application ID out of a Play Store URL.
    Parameters:
    - play_store_url (str): Text searched for an ``id=<application id>`` part.
    Returns:
    - str or None: The first run of letters, digits, dots and underscores
      following ``id=``, or None when no such part is present.
    """
    found = re.search(r'id=([a-zA-Z0-9._]+)', play_store_url)
    return found.group(1) if found else None
@st.cache_data
def scraping_func(app_id, bahasa, negara, filter_score, jumlah):
    """
    Fetch up to ``jumlah`` reviews of an application, newest first.
    Parameters:
    - app_id: Application ID passed to ``reviews``.
    - bahasa: Value passed as ``lang``.
    - negara: Value passed as ``country``.
    - filter_score: Score to filter on; "Semua Rating" disables the filter.
    - jumlah: Value passed as ``count``.
    Returns:
    - tuple: ``(rws, token, scraping_done)`` where ``rws`` and ``token`` are the
      two values returned by ``reviews`` (the second is presumably a
      continuation token - confirm against the library docs) and
      ``scraping_done`` is True when ``rws`` is non-empty.
    """
    # The "all ratings" choice from the UI means no score filter.
    score_filter = filter_score
    if filter_score == "Semua Rating":
        score_filter = None

    rws, token = reviews(
        app_id,
        lang=bahasa,
        country=negara,
        sort=Sort.NEWEST,
        filter_score_with=score_filter,
        count=jumlah
    )

    return rws, token, bool(rws)

@st.cache_data
def scraping_all_func(app_id, bahasa, negara, filter_score, sleep = 0):
    """
    Fetch the reviews of an application through ``reviews_all``.
    Parameters:
    - app_id: Application ID passed to ``reviews_all``.
    - bahasa: Value passed as ``lang``.
    - negara: Value passed as ``country``.
    - filter_score: Score to filter on; "Semua Rating" disables the filter.
    - sleep: Value passed as ``sleep_milliseconds`` (defaults to 0).
    Returns:
    - tuple: ``(rws, scraping_done)`` where ``rws`` is what ``reviews_all``
      returned and ``scraping_done`` is True when ``rws`` is non-empty.
    """
    # The "all ratings" choice from the UI means no score filter.
    score_filter = filter_score
    if filter_score == "Semua Rating":
        score_filter = None

    rws = reviews_all(
        app_id,
        sleep_milliseconds=sleep,
        lang=bahasa,
        country=negara,
        filter_score_with=score_filter,
    )

    return rws, bool(rws)

@st.cache_data
def buat_chart(df, target_year):
    """
    Draw a stacked bar chart of review counts per month and score for one year.
    Parameters:
    - df (pd.DataFrame): Review data; only the 'at' and 'score' columns are read.
      The frame itself is not modified.
    - target_year (int): Year whose reviews are charted.
    Returns:
    - None: The chart, or a warning when the year has no data, is written to
      the Streamlit page.
    Raises:
    - KeyError: If 'at' or 'score' is missing, or a score has no entry in the
      colour table (scores 1-5).
    """
    st.write(f"Bar Chart Tahun {target_year}:")

    # Work on a private copy of the two columns that are needed. Writing the
    # helper columns into the caller's frame would leak them into the
    # downloaded data, and only on runs where the cache is missed.
    data = df[['at', 'score']].copy()
    data['at'] = pd.to_datetime(data['at'])
    data['month'] = data['at'].dt.month
    data['year'] = data['at'].dt.year

    # Keep only the requested year; copy so the assignments below do not
    # write into a slice of `data`.
    df_filtered = data[data['year'] == target_year].copy()

    if df_filtered.empty:
        st.warning(f"Tidak ada data untuk tahun {target_year}.")
        return

    # Month labels in calendar order, e.g. 'Januari 2021'.
    nama_bulan = [
        'Januari', 'Februari', 'Maret', 'April', 'Mei', 'Juni',
        'Juli', 'Agustus', 'September', 'Oktober', 'November', 'Desember'
    ]
    months_order = [f'{nama} {target_year}' for nama in nama_bulan]
    bulan_mapping = dict(enumerate(months_order, start=1))

    # An ordered categorical keeps the months in calendar order rather than
    # alphabetical order.
    df_filtered['month'] = pd.Categorical(
        df_filtered['month'].map(bulan_mapping),
        categories=months_order,
        ordered=True
    )
    df_filtered = df_filtered.sort_values('month')

    # Colour for each score category.
    warna_score = {
        1: '#FF9AA2',
        2: '#FFB7B2',
        3: '#FFDAC1',
        4: '#E2F0CB',
        5: '#B5EAD7'
    }

    # One colour per score column, in the same sorted order as the columns.
    unique_scores = sorted(df_filtered['score'].unique())

    # observed=False keeps months without reviews as empty bars; passing it
    # explicitly avoids depending on the pandas default.
    counts = (
        df_filtered.groupby(['month', 'score'], observed=False)
        .size()
        .unstack()
        .fillna(0)
    )
    st.bar_chart(
        counts,
        color=[warna_score[score] for score in unique_scores]
    )
#--------------------------------------------UI---------------------------------------
# Streamlit UI: collects every scraping parameter from the user. Widgets are
# rendered in the order they are created, so statement order is the layout.
st.title("Data Everywhere : Scraping Playstore Reviews")
with st.expander("Scraping Settings :"):
    # Scraping method; "Estimasi Data" (index 1) is preselected.
    scrape = st.selectbox("PIlih Metode :", ("Semua Reviews", "Estimasi Data"), index = 1)
    aplikasi = st.radio(
        "Pilih Input :",
        ["Defaults", "Custom URL"], index = 0,
        captions = ["Shopee, Tokopedia, Amazon, Grab", "Tambahkan URL Manual"])
    # `url` comes either from the built-in application table or from free text.
    if aplikasi == "Defaults" :
        nama_apl = st.selectbox("Pilih Aplikasi :", ('Shopee', 'Tokopedia', 'Amazon', 'Grab'))
        if nama_apl :
            url = get_url_by_app_name(nama_apl)
    elif aplikasi == "Custom URL":
        url = st.text_input("Masukkan URL Aplikasi Pada Web Playstore :", 'https://play.google.com/store/apps/details?id=com.shopee.id')
    # `jumlah` is only defined when the "Estimasi Data" method is selected.
    if scrape == "Estimasi Data" :
        jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 10, max_value = 10000, step = 10, placeholder="Type a number...")
with st.expander("Preference Settings :"):
    # `sleep` is only defined when the "Semua Reviews" method is selected.
    if scrape == "Semua Reviews" :
        sleep = st.number_input("Masukkan sleep (milisecond) :", min_value = 1, max_value = 1000, step = 10, placeholder="Type a number...")
    bahasa = st.selectbox("Pilih Bahasa:", ('en', 'id'))
    negara = st.selectbox("Pilih Negara :", ('us', 'id'))
    filter_score = st.selectbox("Pilih Rating :", ('Semua Rating', 1, 2, 3, 4, 5))
    # index = 4 preselects 2021.
    target_year = st.selectbox("Pilih Tahun Bar Chart :", (2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025), index = 4)
    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")
#-------------------------------------------BE----------------------------------------

# Run the scraper when the button is pressed, then show the table, the chart
# and a download button for the chosen format.
scraping_done = False
if url and bahasa and negara and filter_score and download_format:
    if st.button ("Mulai Scraping") :
        app_id = extract_app_id(url)
        if app_id is None:
            # Without an application ID the scraper cannot be called.
            st.error("URL tidak valid: ID aplikasi tidak ditemukan.")
        elif scrape == "Semua Reviews" :
            # Stored as `hasil_reviews` so the `reviews` function imported
            # from google_play_scraper is not overwritten.
            hasil_reviews, scraping_done = scraping_all_func(app_id, bahasa, negara, filter_score, sleep)
            df = pd.DataFrame(hasil_reviews)
        elif scrape == "Estimasi Data":
            hasil_reviews, token, scraping_done = scraping_func(app_id, bahasa, negara, filter_score, jumlah)
            df = pd.DataFrame(hasil_reviews)
        else :
            st.warning("Masukkan pilihan yang valid")
else :
    st.error("Mohon Masukkan Parameter.")

if scraping_done:
    with st.expander(f"Hasil Scraping {app_id}:"):
        st.write(df)

    buat_chart(df, target_year)

    if download_format == "XLSX":
        # Clean the data to remove illegal characters
        cleaned_data = df.applymap(lambda x: "".join(char for char in str(x) if char.isprintable()))

        # Save the cleaned data to Excel
        xlsx_path = f"hasil_scraping_{app_id}.xlsx"
        cleaned_data.to_excel(xlsx_path, index=False)

        # Read the file back inside a context manager so the handle is closed.
        with open(xlsx_path, "rb") as xlsx_file:
            xlsx_bytes = xlsx_file.read()

        # Provide the download button for the cleaned Excel file
        st.download_button(label=f"Unduh XLSX ({len(hasil_reviews)} data)", data=xlsx_bytes, key="xlsx_download", file_name=xlsx_path)

    elif download_format == "CSV":
        csv = df.to_csv(index=False)

        # Provide the download button for the CSV file
        st.download_button(label=f"Unduh CSV ({len(hasil_reviews)} data)", data=csv, key="csv_download", file_name=f"hasil_scraping_{app_id}.csv")

    elif download_format == "JSON":
        json_data = df.to_json(orient="records")

        # Provide the download button for the JSON file
        st.download_button(label=f"Unduh JSON ({len(hasil_reviews)} data)", data=json_data, key="json_download", file_name=f"hasil_scraping_{app_id}.json")

else:
    st.info("Tidak ada data")

# Page footer: a divider, the author's GitHub and Instagram links, and a
# closing thank-you message.
st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')

## Scraping E-Commerce with Streamlit Framework (Klikindomaret, Tokopedia, Tokopedia(Selenium))
Application demo: https://huggingface.co/spaces/naufalnashif/scraping-ecommerce-2023/

### Install Dependencies

In [None]:
!pip install streamlit pandas numpy requests beautifulsoup4 matplotlib seaborn wordcloud datetime collections nltk urllib3 requests-html selenium

### Syntax

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import quote
import streamlit as st
import json
import time
import logging
import random
from requests_html import HTMLSession

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Configure the root logger to emit DEBUG-level messages and above.
logging.basicConfig(level=logging.DEBUG)

@st.cache_data
def scrape_klikindomaret(nama_barang, num_items):
    """
    Scrape Klikindomaret search results for a product keyword.

    Pages are requested one after another until ``num_items`` products have
    been collected or a page adds no product.
    Parameters:
    - nama_barang (str): Search keyword; it is URL-encoded before use.
    - num_items (int): Maximum number of products to return.
    Returns:
    - list[dict]: At most ``num_items`` dicts with the keys 'product',
      'original_price', 'discount_percentage', 'price' and 'link'. Products
      without a discount get "0%" and their normal price as 'original_price'.
    Raises:
    - requests.exceptions.RequestException: If a request fails or times out.
    """
    products = []
    page = 1
    query = quote(nama_barang)
    # Without a timeout a stalled connection would block the app forever.
    timeout = 10

    while len(products) < num_items:
        url = f"https://www.klikindomaret.com/search/?key={query}&categories=&productbrandid=&sortcol=&pagesize=54&page={page}&startprice=&endprice=&attributes=&ShowItem="
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        count_before = len(products)

        for product in soup.find_all('a', href=True):
            product_href = product['href']
            if '/product/' not in product_href:
                continue

            name_element = product.find('div', class_='title')
            price_element = product.find('span', class_='normal price-value')
            # Some '/product/' anchors are not full product cards; skip them
            # instead of failing on the missing title or price.
            if name_element is None or price_element is None:
                continue
            product_name = name_element.text.strip()
            product_price = price_element.text.strip()

            # Check for a pre-discount price and a discount percentage
            discount_element = product.find('span', class_='strikeout disc-price')
            if discount_element:
                percentage_element = discount_element.find('span', class_='discount')
                discount_percentage = percentage_element.text.strip() if percentage_element else ""
                original_price = discount_element.text.replace(discount_percentage, '').strip()
            else:
                # No discount: report "0%" and use the normal price as the original price
                discount_percentage = "0%"
                original_price = product_price

            products.append({
                'product': product_name,
                'original_price': original_price,
                'discount_percentage': discount_percentage,
                'price': product_price,
                'link': f"https://www.klikindomaret.com{product_href}"
            })
            if len(products) >= num_items:
                break

        # A page that adds nothing means the results are exhausted (or the
        # page layout changed); stop instead of paging forever.
        if len(products) == count_before:
            break
        page += 1

    return products[:num_items]

# Stand-alone script shown to the user when scraping from the hosted app
# fails, so it can be run from a local IDE or Colab instead. Kept at column 0
# so it can be pasted and executed as-is.
_TOKPED_FALLBACK_SCRIPT = '''\
!pip install beautifulsoup4
!pip install requests
!pip install streamlit
from bs4 import BeautifulSoup
import logging
import requests
from urllib.parse import quote
import pandas as pd
import streamlit as st
def scrape_tokped(nama_barang, num_items):
    products = []
    page = 1
    query = quote(nama_barang)
    while len(products) < num_items :
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='

        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        }
        timeout = 10
        try :
            response = requests.get(url, headers = headers, timeout = timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')

            product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href = True)
            if not product_container_list:
                break

            for product_info in product_container_list:
                link = product_info['href']
                title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                title = title_element.text.strip() if title_element else None

                harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                harga = harga_element.text.strip() if harga_element else None

                terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                terjual = terjual_element.text if terjual_element else None

                rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                rating = rating_element.text if rating_element else None

                toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                toko = toko_element.text.strip() if toko_element else None

                asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                asal_product = asal_product_element.text.strip() if asal_product_element else None

                products.append({
                    'link': link,
                    'produk' : title,
                    'harga' : harga,
                    'terjual' : terjual,
                    'rating' : rating,
                    'toko' : toko,
                    'asal_product' : asal_product,
                })
            if len(products) >= num_items:
                products = products[:num_items]
                break

        except requests.exceptions.HTTPError as e:
            logging.error(f"HTTP Error: {e}")
            break
        except requests.exceptions.RequestException as e:
            logging.error(f"Terjadi kesalahan saat mengirim permintaan: {e}")
            break
        except Exception as e:
            logging.error(f"Terjadi kesalahan yang tidak diketahui: {e}")
            break
        page += 1
    return products

nama_barang = input("Masukkan nama barang: ")
num_items = int(input("Masukkan jumlah barang yang ingin diambil: "))
# Melakukan scraping menggunakan fungsi scrape_tokped
hasil = scrape_tokped(nama_barang, num_items)
pd.DataFrame(hasil)'''


@st.cache_data
def scrape_tokped(nama_barang, num_items):
    """
    Scrape Tokopedia search results for a product keyword (Streamlit version).

    Pages are requested one after another until ``num_items`` products have
    been collected, a page contains no product cards, or an error occurs. On
    an error the message is shown on the page together with a stand-alone
    script the user can run elsewhere, and the products collected so far are
    returned.
    Parameters:
    - nama_barang (str): Search keyword; it is URL-encoded before use.
    - num_items (int): Maximum number of products to return.
    Returns:
    - list[dict]: At most ``num_items`` dicts with the keys 'link', 'produk',
      'harga', 'terjual', 'rating', 'toko' and 'asal_product'. A field is
      None when its element is missing from the product card.
    """
    products = []
    page = 1
    query = quote(nama_barang)
    # Browser-like request headers; identical for every page, so built once.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.9,id-ID;q=0.8,id;q=0.7,ja;q=0.6,ru;q=0.5,zh-CN;q=0.4,zh;q=0.3,af;q=0.2,nl;q=0.1',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
    }
    timeout = 10
    while len(products) < num_items :
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='

        try :
            response = requests.get(url, headers = headers, timeout = timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href = True)

            # No product cards on this page: stop instead of paging forever.
            if not product_container_list:
                break

            for product_info in product_container_list:
                link = product_info['href']
                title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                title = title_element.text.strip() if title_element else None

                harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                harga = harga_element.text.strip() if harga_element else None

                terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                terjual = terjual_element.text if terjual_element else None

                rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                rating = rating_element.text if rating_element else None

                toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                toko = toko_element.text.strip() if toko_element else None

                asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                asal_product = asal_product_element.text.strip() if asal_product_element else None

                products.append({
                    'link': link,
                    'produk' : title,
                    'harga' : harga,
                    'terjual' : terjual,
                    'rating' : rating,
                    'toko' : toko,
                    'asal_product' : asal_product,
                })
            if len(products) >= num_items:
                products = products[:num_items]
                break

        except Exception as e:
            # UI boundary: report any failure on the page and offer the
            # stand-alone script rather than crashing the app.
            st.error(f"Terjadi kesalahan yang tidak diketahui: {e}")
            st.write("Jalankan script ini di IDE/colab.research.google.com Anda :")
            st.code(_TOKPED_FALLBACK_SCRIPT, language='python')
            break
        page += 1
    return products

@st.cache_data
def scrape_tokped_with_selenium(nama_barang, num_items):
    """Scrape Tokopedia search results with a headless Chrome driver.

    Search result pages are loaded one after another until ``num_items``
    products have been collected, a page yields no product cards, or an
    error occurs. The browser is closed once, after the last page.

    Args:
        nama_barang: Search keyword; it is URL-quoted before use.
        num_items: Maximum number of products to return.

    Returns:
        A list of dicts with the keys ``link``, ``produk``, ``harga``,
        ``terjual``, ``rating``, ``toko`` and ``asal_product``. A field is
        ``None`` when its element is missing from the product card.
    """
    # Local import so the dedicated handler below always has the name in scope.
    from selenium.common.exceptions import WebDriverException

    products = []
    page = 1
    query = quote(nama_barang)

    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=options)

    try:
        # The implicit wait is a session setting, so it is configured once
        # instead of after every page load.
        driver.implicitly_wait(20)

        while len(products) < num_items:
            try:
                url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='

                driver.get(url)

                # Grab the rendered DOM rather than the raw HTTP response.
                html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
                # NOTE(review): looks like leftover debug output (dumps the whole page) — confirm before removing.
                st.write(html)
                soup = BeautifulSoup(html, "html.parser")

                product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href=True)
                if not product_container_list:
                    # No product cards on this page: stop instead of paging forever.
                    break

                for product_info in product_container_list:
                    link = product_info['href']
                    st.write(link)
                    title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                    title = title_element.text.strip() if title_element else None

                    harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                    harga = harga_element.text.strip() if harga_element else None

                    terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                    terjual = terjual_element.text if terjual_element else None

                    rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                    rating = rating_element.text if rating_element else None

                    toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                    toko = toko_element.text.strip() if toko_element else None

                    asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                    asal_product = asal_product_element.text.strip() if asal_product_element else None

                    products.append({
                        'link': link,
                        'produk' : title,
                        'harga' : harga,
                        'terjual' : terjual,
                        'rating' : rating,
                        'toko' : toko,
                        'asal_product' : asal_product,
                    })
                if len(products) >= num_items:
                    products = products[:num_items]
                    break

            # Handlers are ordered from most to least specific so each one is reachable.
            except requests.exceptions.HTTPError as e:
                logging.error(f"HTTP Error: {e}")
                st.error(f"HTTP Error: {e}")
                break
            except requests.exceptions.RequestException as e:
                logging.error(f"Terjadi kesalahan saat mengirim permintaan: {e}")
                st.error(f"Terjadi kesalahan saat mengirim permintaan: {e}")
                break
            except WebDriverException as e:
                st.error(f"An error occurred: {e}")
                break
            except Exception as e:
                logging.error(f"Terjadi kesalahan yang tidak diketahui: {e}")
                st.error(f"Terjadi kesalahan yang tidak diketahui: {e}")
                break
            page += 1
    finally:
        # Quit exactly once, after all pages, so later pages can still use the session.
        driver.quit()
    return products
#---------------------------------------------------User Interface----------------------------------------------------------------------

# Streamlit UI
st.title("Scraping E-Commerce")

with st.expander("Settings :"):
    # Target site, search keyword, item count and download format
    selected_site = st.selectbox("Pilih Situs Web :", ["klikindomaret.com", "tokopedia.com", "tokopedia.com(selenium)"])

    nama_barang = st.text_input("Masukkan Nama Barang :")
    num_items = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")

    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
    st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# Per-run state: whether a scrape ran during this script run, and its rows
scraping_done = False
hidden_data = []

# One scraper per selectable site
site_scrapers = {
    "klikindomaret.com": scrape_klikindomaret,
    "tokopedia.com": scrape_tokped,
    "tokopedia.com(selenium)": scrape_tokped_with_selenium,
}

# Both Tokopedia variants show the "under development" warning
if selected_site in ("tokopedia.com", "tokopedia.com(selenium)"):
    st.warning("Jika mengalami error karena sedang dalam pengembangan. Silahkan pilih situs yang lain", icon="⚠️")

if selected_site in site_scrapers and st.button("Mulai Scraping"):
    if nama_barang:
        scraped_products = site_scrapers[selected_site](nama_barang, num_items)
        hidden_data = scraped_products  # keep the rows for the download section
        scraping_done = True
    else:
        st.error("Mohon isi Nama Barang.")

# File names offered for each download format
output_file = f"scraped_{selected_site}_{nama_barang}.xlsx"
output_file_csv = f"scraped_{selected_site}_{nama_barang}.csv"
output_file_json = f"scraped_{selected_site}_{nama_barang}.json"


#---------------------------------------------------Download File & Hasil Scraping----------------------------------------------------------------------

# Show the scraping results and offer them for download
import io  # script-local import: used to build the XLSX payload in memory

if scraping_done:
    if hidden_data:
        df = pd.DataFrame(scraped_products)
        # Show the scraped rows inside an expandable box
        with st.expander(f"Hasil Scraping {selected_site} :"):
            st.write(df)
        if download_format == "XLSX":
            # Write the workbook to a buffer instead of a file named after user
            # input: nothing is left on disk and no file handle stays open.
            xlsx_buffer = io.BytesIO()
            df.to_excel(xlsx_buffer, index=False)
            st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=xlsx_buffer.getvalue(), key="xlsx_download", file_name=output_file)
        elif download_format == "CSV":
            csv = df.to_csv(index=False)
            st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=output_file_csv)
        elif download_format == "JSON":
            json_data = df.to_json(orient="records")
            st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=output_file_json)
    else:
        st.warning(f"Tidak ada data pada query '{nama_barang}'", icon="⚠️")
else:
    st.write("Tidak ada data untuk diunduh.")

st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')

## Scraping News Headlines with Streamlit Framework (CNBC, Detik, Viva, Tempo)

In [None]:
#---------------------------------------------------Requirements----------------------------------------------------------------------
import streamlit as st
import pandas as pd
import random
import numpy as np
import re
import json
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import requests
from bs4 import BeautifulSoup
from datetime import date
import time
from collections import Counter
import nltk
from nltk.corpus import stopwords


#---------------------------------------------------Scraping Function----------------------------------------------------------------------

@st.cache_data
def scrape_cnbc_data(query, date, jumlah, param_kosong):
    """Scrape article headlines from the CNBC Indonesia search pages.

    Result pages are requested one after another until ``jumlah`` articles
    have been collected, a page contains no usable ``<article>`` element, or
    a request fails.

    Args:
        query: Search keyword, inserted into the URL as given.
        date: Value sent as the ``date`` URL parameter on every page.
        jumlah: Maximum number of articles to return.
        param_kosong: Not used by this scraper.

    Returns:
        A list of dicts with the keys ``category``, ``date``,
        ``judul-berita`` and ``link-berita``. Articles without a headline
        are skipped; other missing fields are ``None``.
    """
    data = []
    page = 1
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(len(data), text=progress_text)
    timeout = 10
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
    ]

    while len(data) < jumlah:
        try:
            url = f"https://www.cnbcindonesia.com/search?query={query}&p={page}&kanal=&tipe=artikel&date={date}"

            # Send a randomly chosen User-Agent with each request
            headers = {
                "User-Agent": random.choice(user_agents),
                "Accept-Language": "en-US,en;q=0.5"
            }
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('article')
            if not articles:
                break

            collected_before = len(data)
            for article in articles:
                title_element = article.find('h2')
                if title_element is None:
                    # Without a headline the row is useless downstream.
                    continue
                link_element = article.find('a')
                category_element = article.find('span', class_='label')
                date_element = article.find('span', class_='date')

                # The text is split on ' - ' and the second part is kept as the date.
                # A separate local name keeps the ``date`` parameter intact for
                # the next page's URL.
                date_category = date_element.text.strip() if date_element else ''
                text_parts = date_category.split(' - ')
                article_date = text_parts[1].strip() if len(text_parts) > 1 else None

                data.append({
                    'category': category_element.text.strip() if category_element else None,
                    'date': article_date,
                    'judul-berita': title_element.text.strip(),
                    'link-berita': link_element.get('href') if link_element else None,
                })

            if len(data) == collected_before:
                # The page added nothing new; stop rather than keep paging.
                break

            data = data[:jumlah]
            # 1.0 (float) keeps the value in st.progress's 0.0-1.0 scale.
            my_bar.progress(min(len(data) / jumlah, 1.0), text=progress_text)
            page += 1
        except requests.exceptions.RequestException as e:
            st.error(f"An error occurred: {e}")
            break

    time.sleep(1)
    my_bar.empty()

    return data


@st.cache_data
def scrape_detik_news(query, date, jumlah, param_kosong):
    """Scrape article headlines from the detik.com search pages.

    Result pages (sorted by time) are requested one after another until
    ``jumlah`` articles have been collected, a page contains no usable
    ``<article>`` element, or a request fails.

    Args:
        query: Search keyword, inserted into the URL as given.
        date: Not used for the request.
        jumlah: Maximum number of articles to return.
        param_kosong: Not used by this scraper.

    Returns:
        A list of dicts with the keys ``category``, ``date``,
        ``judul-berita`` and ``link-berita``. Articles without a headline
        are skipped; other missing fields are ``None``.
    """
    start_page = 1
    data = []
    progress_text = "Scraping in progress... Please wait..."
    my_bar = st.progress(len(data), text=progress_text)
    timeout = 10
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
    ]

    while len(data) < jumlah:
        try:
            url = f'https://www.detik.com/search/searchall?query={query}&siteid=2&sortby=time&page={start_page}'

            # Send a randomly chosen User-Agent with each request
            headers = {
                "User-Agent": random.choice(user_agents),
                "Accept-Language": "en-US,en;q=0.5"
            }
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            articles = soup.find_all('article')

            if not articles:
                break

            collected_before = len(data)
            for article in articles:
                title_element = article.find('h2')
                if title_element is None:
                    # Without a headline the row is useless downstream.
                    continue
                link_element = article.find('a')
                category_element = article.find('span', class_='category')
                date_element = article.find('span', class_='date')

                category = category_element.text if category_element else None
                article_date = date_element.text if date_element else None
                if article_date is not None:
                    # Remove the category text from the date text, when there is one.
                    if category:
                        article_date = article_date.replace(category, '')
                    article_date = article_date.strip()

                data.append({
                    'category': category,
                    'date': article_date,
                    'judul-berita': title_element.text.strip(),
                    'link-berita': link_element.get('href') if link_element else None,
                })

                if len(data) >= jumlah:
                    break

            if len(data) == collected_before:
                # The page added nothing new; stop rather than keep paging.
                break

            # 1.0 (float) keeps the value in st.progress's 0.0-1.0 scale.
            my_bar.progress(min(len(data) / jumlah, 1.0), text=progress_text)

            start_page += 1
        except requests.exceptions.RequestException as e:
            st.error(f"An error occurred: {e}")
            break

    time.sleep(1)
    my_bar.empty()
    return data

@st.cache_data
def scrape_viva_data(query, date, jumlah, param_kosong):
    """Scrape article headlines from the viva.co.id search page.

    The search URL carries no page parameter, so a single result page is
    fetched and truncated to ``jumlah`` articles.

    Args:
        query: Search keyword, inserted into the URL as given.
        date: Not used for the request.
        jumlah: Maximum number of articles to return.
        param_kosong: Not used by this scraper.

    Returns:
        A list of dicts with the keys ``category``, ``date``,
        ``judul-berita`` and ``link-berita``. Articles without a headline
        are skipped; other missing fields are ``None``.
    """
    data = []
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(len(data), text=progress_text)
    timeout = 10
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
    ]

    # NOTE(review): add a loop here once the pagination parameter of the
    # search URL is known; re-requesting the same URL would only add duplicates.
    if jumlah > 0:
        try:
            url = f"https://www.viva.co.id/search?q={query}"

            # Send a randomly chosen User-Agent with the request
            headers = {
                "User-Agent": random.choice(user_agents),
                "Accept-Language": "en-US,en;q=0.5"
            }
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            articles = soup.find_all('div', class_='card-box ft240 margin-bottom-sm')
            for article in articles:
                title_element = article.find('h2', class_='title')
                if title_element is None:
                    # Without a headline the row is useless downstream.
                    continue
                link_element = article.find('a')
                category_element = article.find('span', class_="kanal cl-dark")
                category = category_element.text.strip() if category_element else None
                date_element = article.find('h4', class_="date")
                article_date = date_element.text.strip() if date_element else None
                # Only strip the category when both pieces are present.
                if article_date is not None and category:
                    article_date = article_date.replace(category, '')
                data.append({
                    'category': category,
                    'date': article_date,
                    'judul-berita': title_element.text,
                    'link-berita': link_element.get('href') if link_element else None,
                })

            data = data[:jumlah]
            # 1.0 (float) keeps the value in st.progress's 0.0-1.0 scale.
            my_bar.progress(min(len(data) / jumlah, 1.0), text=progress_text)
        except requests.exceptions.RequestException as e:
            st.error(f"An error occurred: {e}")

    time.sleep(1)
    my_bar.empty()

    return data

@st.cache_data
def scrape_tempo_data(query, date, jumlah, selected_channel):
    """Scrape article headlines from the tempo.co search pages.

    For each value of the ``domain`` URL parameter (1 up to 5) every selected
    channel is queried in turn, until ``jumlah`` distinct headlines have been
    collected or a request fails. The time-range filter is read from the
    module-level name ``waktu``, which must be defined before the call.

    Args:
        query: Search keyword, inserted into the URL as given.
        date: Not used for the request.
        jumlah: Maximum number of articles to return.
        selected_channel: Channel name to search; ``'Defaults'`` or an
            unknown name searches every channel.

    Returns:
        A list of dicts with the keys ``category``, ``kanal``, ``date``,
        ``judul-berita`` and ``link-berita``. Articles without a headline
        are skipped; other missing fields are ``None``.
    """
    data = []
    domain = 1
    max_domains = 5
    timeout = 10
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(len(data), text=progress_text)
    # Channel name -> value of the ``kanal`` URL parameter
    default_channels = {
        'All(Latest Only)': '',
        'Nasional': '20',
        'Metro': '19',
        'Dunia': '5',
        'Bisnis': '1',
        'Bola': '21',
        'Sport': '33',
        'Gaya': '9',
        'Seleb': '32',
        'Cantik': '2',
        'Tekno': '34',
        'Otomotif': '23',
        'Travel': '35',
        'Blog': '43',
        'Difabel': '44',
        'Ramadan': '30',
        'Kolom': '14',
        'Fokus': '8',
        'Creative Lab': '47',
        'Event': '62',
        'Data': '65',
        'Cek Fakta': '66',
        'Newsletter': '63',
        'Inforial': '12'
    }
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.2 Safari/605.1.15",
    ]

    # Narrow the channel list when a specific known channel was chosen
    if selected_channel != 'Defaults' and selected_channel in default_channels:
        channels = {selected_channel: default_channels[selected_channel]}
    else:
        channels = default_channels
    seen_titles = set()  # headlines already collected, to drop duplicates

    try:
        while len(data) < jumlah and domain <= max_domains:
            for kanal, value in channels.items():
                # NOTE(review): ``waktu`` is a global, not an argument, so it is
                # presumably not part of the st.cache_data key — confirm.
                url = f"https://www.tempo.co/search?waktu={waktu}&kanal={value}&subkanal=&domain={domain}&q={query}"
                # Send a randomly chosen User-Agent with each request
                headers = {
                    "User-Agent": random.choice(user_agents),
                    "Accept-Language": "en-US,en;q=0.5"
                }
                response = requests.get(url, headers=headers, timeout=timeout)
                soup = BeautifulSoup(response.text, 'html.parser')
                articles = soup.find_all('div', class_='card-box ft240 margin-bottom-sm')
                if not articles:
                    # NOTE(review): this skips the remaining channels for the
                    # current ``domain`` value; ``continue`` may have been
                    # intended — confirm before changing.
                    break
                for article in articles:
                    title_element = article.find('h2', class_='title')
                    if title_element is None:
                        # Without a headline the row is useless downstream.
                        continue
                    title = title_element.text
                    # Only process headlines that have not been seen yet
                    if title in seen_titles:
                        continue
                    link_element = article.find('a')
                    category_element = article.find('span', class_="kanal cl-dark")
                    category = category_element.text.strip() if category_element else None
                    date_element = article.find('h4', class_="date")
                    article_date = date_element.text.strip() if date_element else None
                    # Only strip the category when both pieces are present.
                    if article_date is not None and category:
                        article_date = article_date.replace(category, '')
                    data.append({
                        'category': category,
                        'kanal' : kanal,
                        'date': article_date,
                        'judul-berita': title,
                        'link-berita': link_element.get('href') if link_element else None,
                    })
                    seen_titles.add(title)
                    if len(data) >= jumlah:
                        break
                if len(data) >= jumlah:
                    break
                # 1.0 (float) keeps the value in st.progress's 0.0-1.0 scale.
                my_bar.progress(min(len(data) / jumlah, 1.0), text=progress_text)
            domain += 1
    except requests.exceptions.RequestException as e:
        st.error(f"An error occurred: {e}")
    time.sleep(1)
    my_bar.empty()
    return data
#---------------------------------------------------Data Cleaning (RegEx)----------------------------------------------------------------------

def clean_text(text):
    """Strip noise from a piece of text and return it in lowercase.

    Non-string input is converted with ``str`` first. The regular-expression
    substitutions run in a fixed order, then the text is trimmed and
    lowercased, and finally any word character repeated three or more times
    in a row is reduced to one (e.g. "yukkk" -> "yuk").
    """
    cleaned = text if isinstance(text, str) else str(text)

    # (pattern, replacement) pairs, applied top to bottom
    substitutions = (
        (r'[^\x00-\x7F]+', ''),                                # non-ASCII characters
        (r'http[s]?://.[a-zA-Z0-9./_?=%&#+!]+', ''),           # URLs
        (r'pic.twitter.com?.[a-zA-Z0-9./_?=%&#+!]+', ''),      # Twitter picture links
        (r'@[\w]+', ''),                                       # mentions
        (r'#([\w]+)', ''),                                     # hashtags
        (r'&amp;|&gt;', ''),                                   # HTML entity leftovers
        (r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,./]', ''),       # symbols
        (r'[0-9]+', ''),                                       # digits
        (' +', ' '),                                           # runs of spaces
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    cleaned = cleaned.strip().lower()

    # Squeeze a character repeated three or more times down to a single one
    return re.sub(r'(\w)\1{2,}', r'\1', cleaned)

#---------------------------------------------------Normalisasi----------------------------------------------------------------------

# Load the Salsabila colloquial-Indonesian slang lexicon.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
kamus_path = '_json_colloquial-indonesian-lexicon.txt'  # replace with the correct path
with open(kamus_path, encoding='utf-8') as f:
    data = f.read()
lookp_dict = json.loads(data)

# Custom slang entries that are not covered by the Salsabila lexicon
kamus_sendiri_path = 'kamus_gaul_custom.txt'
with open(kamus_sendiri_path, encoding='utf-8') as f:
    kamus_sendiri = f.read()
kamus_gaul_baru = json.loads(kamus_sendiri)

# Merge the custom entries into the main lexicon (custom entries win on conflict)
lookp_dict.update(kamus_gaul_baru)

# Fungsi untuk normalisasi kata gaul
def normalize_slang(text, slang_dict):
    """Replace every whitespace-separated word found in ``slang_dict``.

    Words without an entry are kept as they are; the result is re-joined
    with single spaces.
    """
    return ' '.join(slang_dict.get(token, token) for token in text.split())

#---------------------------------------------------NLTK Remove Stopwords----------------------------------------------------------------------

# Initialize the Indonesian stopword set.
# The NLTK "stopwords" corpus is downloaded first because the next line reads from it.
nltk.download("stopwords")
stop_words = set(stopwords.words("indonesian"))

def remove_stopwords(text, stop_words):
    """Return ``text`` without the words contained in ``stop_words``.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces.
    """
    kept = filter(lambda token: token not in stop_words, text.split())
    return " ".join(kept)


def preprocessing_data(hidden_data):
    """Run the text-cleaning pipeline over every scraped headline.

    Args:
        hidden_data: Scraped rows (anything ``pd.DataFrame`` accepts); the
            headlines are read from the ``judul-berita`` column.

    Returns:
        A list with one dict per headline holding the original text
        (``judul-berita``) and the output of each stage: ``cleaned-text``,
        ``normalisasi-text`` and ``stopwords-remove``. An empty list is
        returned when there are no rows or no ``judul-berita`` column.
    """
    df = pd.DataFrame(hidden_data)
    # An empty scrape yields a frame without columns; indexing it would raise KeyError.
    if "judul-berita" not in df.columns:
        return []

    results_prep = []
    for text in df["judul-berita"]:
        cleaned_text = clean_text(text)
        norm_slang_text = normalize_slang(cleaned_text, lookp_dict)
        tanpa_stopwords = remove_stopwords(norm_slang_text, stop_words)

        results_prep.append({
            'judul-berita': text,
            'cleaned-text' : cleaned_text,
            'normalisasi-text' : norm_slang_text,
            'stopwords-remove' : tanpa_stopwords,
        })
    return results_prep


def eksplorasi_data(selected_options, results, colormap, words):
    """Render the word cloud and the most-common-words bar chart.

    Nothing is rendered unless ``'Hasil EDA'`` is in ``selected_options``.

    Args:
        selected_options: Display options chosen by the user.
        results: Preprocessed rows; the text is read from each row's
            ``stopwords-remove`` entry.
        colormap: Colormap name passed to ``WordCloud``.
        words: Number of most common words to plot.

    Returns:
        None.
    """
    if 'Hasil EDA' not in selected_options:
        return

    # Two side-by-side columns: word cloud on the left, bar chart on the right
    columns = st.columns(2)

    texts = [
        result.get('stopwords-remove')
        for result in (results or [])
        if pd.notna(result.get('stopwords-remove'))
    ]
    # Stripped so that rows of empty strings do not count as text to plot.
    all_texts = " ".join(texts).strip()

    with columns[0]:
        if results:
            st.subheader("Word Cloud")

            if all_texts:
                wordcloud = WordCloud(width=800, height=500, background_color='white',
                                        colormap=colormap,
                                        contour_color='black',
                                        contour_width=2,
                                        mask=None).generate(all_texts)
                st.image(wordcloud.to_array())

    with columns[1]:
        st.subheader("Most Common Words")

        most_common_words = Counter(all_texts.split()).most_common(words) if all_texts else []
        # Guard the unpacking below: zip(*[]) yields nothing to unpack.
        if most_common_words:
            labels, counts = zip(*most_common_words)

            fig, ax = plt.subplots(figsize=(10, 6))
            ax.bar(labels, counts)
            ax.set_xlabel("Kata-kata")
            ax.set_ylabel("Jumlah")
            ax.set_title("Kata-kata Paling Umum")
            ax.tick_params(axis='x', rotation=45)

            st.pyplot(fig)
            # Close the figure so it does not pile up across script reruns.
            plt.close(fig)
@st.cache_data
def scrape_and_explore_data(_scrape_function, query, date, jumlah, selected_options, colormap, words, param):
    """Scrape, preprocess and render the exploration views in one call.

    ``_scrape_function`` is called as ``_scrape_function(query, date, jumlah,
    param)``; its rows are preprocessed and handed to ``eksplorasi_data``.

    Returns:
        A tuple ``(hidden_data, scraping_done, results)``: the scraped rows,
        the flag ``True``, and the preprocessed rows.
    """
    hidden_data = _scrape_function(query, date, jumlah, param)
    results = preprocessing_data(hidden_data)

    # Data exploration (word cloud and most common words)
    eksplorasi_data(selected_options, results, colormap, words)
    return hidden_data, True, results
#---------------------------------------------------User Interface----------------------------------------------------------------------

# Streamlit UI: page title plus two expanders that collect the scraping and display settings
st.title("Aplikasi Web Scraping & Explorasi Data")

with st.expander("Scraping Settings :"):
    # Choose which news site to scrape
    selected_site = st.selectbox("Pilih Situs Web :", ["CNBC Indonesia", "Detik.com", "Viva.co.id", "Tempo.co", "Liputan6.com"])
    if selected_site == "Tempo.co":
        # NOTE: `waktu` and `selected_channel` are only bound when Tempo.co is selected;
        # they are read again only in the Tempo.co branch of the scraping dispatch below.
        waktu = st.selectbox("Pilih Rentang Waktu :", ["1tahun", "1bulan", "1minggu", "1hari", "6jam"])
        selected_channel = st.selectbox("Pilih Kanal :", ['Defaults','All(Latest Only)', 'Nasional', 'Metro', 'Dunia', 'Bisnis', 'Bola', 'Sport', 'Gaya', 'Seleb', 'Cantik', 'Tekno', 'Otomotif', 'Travel', 'Blog', 'Difabel', 'Ramadan', 'Kolom', 'Fokus', 'Creative Lab', 'Event', 'Data', 'Cek Fakta', 'Newsletter', 'Inforial'])
    # Spaces in the typed query are replaced with '+'
    query = st.text_input("Masukkan Query :").replace(' ', '+')

    # Estimated number of items to fetch (at least 1)
    jumlah = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")
    # NOTE(review): rebinds the name `date` to today's date value — presumably this
    # shadows a `date` class imported earlier in the file; confirm against the imports.
    date = date.today()
    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON", "TXT"])
# Empty placeholder passed as the extra `param` argument for sites that have no channel setting
param_kosong = []
with st.expander("Preference Settings :"):
    # Which result sections to display; scraping output and EDA are preselected
    selected_options = st.multiselect(
        'Pilih tampilan:',
        ['Hasil Scraping', 'Hasil Preprocessing', 'Hasil EDA'],
        ["Hasil Scraping", "Hasil EDA"]
    )
    if "Hasil EDA" in selected_options:
        # EDA-specific settings are only offered when the EDA view is enabled
        colormap = st.selectbox("Pilih Warna Wordclouds :", ["Greys", "Purples", "Blues", "Greens", "Oranges", "Reds", "YlOrBr", "YlOrRd", "OrRd", "PuRd", "RdPu", "BuPu", "GnBu", "PuBu", "YlGnBu", "PuBuGn", "BuGn", "YlGn"])
        words = st.number_input("Masukkan Jumlah Most Common Words :", min_value = 1, max_value = 15, step = 1, value = 10, placeholder="Type a number...")
    else :
        # Fallback values so `colormap` and `words` are always bound
        colormap = "Greys"
        words = 10

st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

#------------------------------------------------------------Backend----------------------------------------------------------------------------------

# Holds the scraped rows; stays empty until a scraper has run
hidden_data = []

# Set from the scraper wrapper's return value; False means nothing to show yet
scraping_done = False

if st.button("Mulai Scraping"):
    if query:
        # Positional arguments shared by every scraper call
        shared_args = (jumlah, selected_options, colormap, words)

        if selected_site == "CNBC Indonesia":
            # CNBC takes the date as a formatted string
            hidden_data, scraping_done, results = scrape_and_explore_data(
                scrape_cnbc_data, query, date.strftime("%Y/%m/%d"), *shared_args, param_kosong
            )

        elif selected_site == "Detik.com":
            hidden_data, scraping_done, results = scrape_and_explore_data(
                scrape_detik_news, query, date, *shared_args, param_kosong
            )

        elif selected_site == "Viva.co.id":
            st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
            hidden_data, scraping_done, results = scrape_and_explore_data(
                scrape_viva_data, query, date, *shared_args, param_kosong
            )

        elif selected_site == "Tempo.co":
            # Tempo uses the selected time range and channel instead of today's date
            st.warning("Masih dalam penegmbangan, silahkan gunakan situs yang lain.")
            hidden_data, scraping_done, results = scrape_and_explore_data(
                scrape_tempo_data, query, waktu, *shared_args, selected_channel
            )

        elif selected_site == "Liputan6.com":
            # No scraper is wired up for this site
            st.error("Belum bisa dipakai.")
    else:
        st.error("Mohon isi query.")

#---------------------------------------------------Download File & Hasil Scraping----------------------------------------------------------------------

import io  # local to this section: used for the in-memory XLSX export

# Show the scraping results and offer them for download
if scraping_done:
    if hidden_data:
        df = pd.DataFrame(hidden_data)
        df_prep = pd.DataFrame(results)

        # Show the raw and preprocessed tables inside expandable boxes
        if 'Hasil Scraping' in selected_options:
            with st.expander(f"Hasil Scraping {selected_site} :"):
                st.write(df)
        if 'Hasil Preprocessing' in selected_options:
            with st.expander("Hasil Preprocessing Data :"):
                st.write(df_prep)

        if download_format == "XLSX":
            # Build the workbooks in memory: no files are left in the working
            # directory, no file handles stay open, and the query text never
            # ends up in a server-side path.
            xlsx_buffer = io.BytesIO()
            df.to_excel(xlsx_buffer, index=False)
            xlsx_prep_buffer = io.BytesIO()
            df_prep.to_excel(xlsx_prep_buffer, index=False)
            st.download_button(label=f"Unduh Hasil Scraping XLSX ({len(hidden_data)} data)", data=xlsx_buffer.getvalue(), key="xlsx_download", file_name=f"hasil_scraping_{query}.xlsx")
            st.download_button(label=f"Unduh Hasil Preprocess XLSX ({len(results)} data)", data=xlsx_prep_buffer.getvalue(), key="xlsx_download_2", file_name=f"hasil_preprocess_{query}.xlsx")
        elif download_format == "CSV":
            csv = df.to_csv(index=False)
            csv_prep = df_prep.to_csv(index=False)
            st.download_button(label=f"Unduh Hasil Scraping CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=f"hasil_scraping_{query}.csv")
            st.download_button(label=f"Unduh Hasil Preprocess CSV ({len(results)} data)", data=csv_prep, key="csv_download_2", file_name=f"hasil_preprocess_{query}.csv")
        elif download_format == "JSON":
            json_data = pd.DataFrame(hidden_data, columns=["date", "judul-berita", "link-berita"]).to_json(orient="records")
            # The rows built during preprocessing are keyed 'judul-berita',
            # 'cleaned-text', 'normalisasi-text' and 'stopwords-remove'; forcing
            # other column names here would export nothing but nulls, so reuse
            # the frame that already carries the real columns.
            json_data_prep = df_prep.to_json(orient="records")
            st.download_button(label=f"Unduh Hasil Scraping JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=f"hasil_scraping_{query}.json")
            st.download_button(label=f"Unduh Hasil Preprocess JSON ({len(results)} data)", data=json_data_prep, key="json_download_2", file_name=f"hasil_preprocess_{query}.json")
        elif download_format == "TXT":
            # One line per scraped row: date - headline - link
            text_data = "\n".join([f"{row['date']} - {row['judul-berita']} - {row['link-berita']}" for row in hidden_data])

            st.download_button(label=f"Unduh Hasil Scraping TXT ({len(hidden_data)} data)", data=text_data, key="txt_download", file_name=f"hasil_scraping_{query}.txt")
    else:
        # The scraper ran but returned nothing for this query
        st.warning(f"Tidak ada data pada query '{query}'", icon="⚠️")
else:
    st.write("Tidak ada data untuk diunduh.")

st.divider()

# Footer: profile links rendered as markdown, followed by a thank-you note
github_link = "https://github.com/naufalnashif/"
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown("GitHub: [{0}]({0})".format(github_link))
st.markdown("Instagram: [{0}]({0})".format(instagram_link))
st.write('Terima kasih telah mencoba demo ini!')